commit dd923b4d24c301859c53ed9effd1f391f86c5b28 Author: ModelHub XC Date: Thu Jun 18 20:28:16 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: burtenshaw/Qwen3-1.7B-wordle Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..27e95e4 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,57 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text + + +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text + + +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +tokenizer.json filter=lfs diff=lfs merge=lfs -text +training_args.bin filter=lfs diff=lfs merge=lfs -text +scheduler.pt filter=lfs diff=lfs merge=lfs -text +model-00001-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text +optimizer.pt filter=lfs diff=lfs merge=lfs -text +model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text +vocab.json filter=lfs diff=lfs merge=lfs -text +merges.txt filter=lfs diff=lfs merge=lfs -text +rng_state.pth filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..5aff9d5 --- /dev/null +++ b/README.md @@ -0,0 +1,48 @@ +--- +license: Apache License 2.0 +tags: [] + +#model-type: +##如 gpt、phi、llama、chatglm、baichuan 等 +#- gpt + +#domain: +##如 nlp、cv、audio、multi-modal +#- nlp + +#language: +##语言代码列表 https://help.aliyun.com/document_detail/215387.html?spm=a2c4g.11186623.0.0.9f8d7467kni6Aa +#- cn + +#metrics: +##如 CIDEr、Blue、ROUGE 等 +#- CIDEr + +#tags: +##各种自定义,包括 pretrained、fine-tuned、instruction-tuned、RL-tuned 等训练方法和其他 +#- pretrained + +#tools: +##如 vllm、fastchat、llamacpp、AdaSeq 等 +#- vllm +--- +### 当前模型的贡献者未提供更加详细的模型介绍。模型文件和权重,可浏览“模型文件”页面获取。 +#### 您可以通过如下git clone命令,或者ModelScope SDK来下载模型 + +SDK下载 +```bash +#安装ModelScope +pip install modelscope +``` +```python +#SDK模型下载 +from modelscope import snapshot_download +model_dir = snapshot_download('burtenshaw/Qwen3-1.7B-wordle') +``` +Git下载 +``` +#Git模型下载 +git clone https://www.modelscope.cn/burtenshaw/Qwen3-1.7B-wordle.git +``` + +

如果您是本模型的贡献者,我们邀请您根据模型贡献文档,及时完善模型卡片内容。

\ No newline at end of file diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..b54f913 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,28 @@ +{ + "": 151668, + "": 151658, + "": 151666, + "": 151667, + "": 151657, + "": 151665, + "<|box_end|>": 151649, + "<|box_start|>": 151648, + "<|endoftext|>": 151643, + "<|file_sep|>": 151664, + "<|fim_middle|>": 151660, + "<|fim_pad|>": 151662, + "<|fim_prefix|>": 151659, + "<|fim_suffix|>": 151661, + "<|im_end|>": 151645, + "<|im_start|>": 151644, + "<|image_pad|>": 151655, + "<|object_ref_end|>": 151647, + "<|object_ref_start|>": 151646, + "<|quad_end|>": 151651, + "<|quad_start|>": 151650, + "<|repo_name|>": 151663, + "<|video_pad|>": 151656, + "<|vision_end|>": 151653, + "<|vision_pad|>": 151654, + "<|vision_start|>": 151652 +} diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..01be9b3 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,89 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..1d1ed72 --- /dev/null +++ b/config.json @@ -0,0 +1,60 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "float32", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 6144, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pad_token_id": 151645, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "4.57.1", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..159097f --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "others", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..0d2e61f --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "eos_token_id": [ + 151645 + ], + "pad_token_id": 151645, + "transformers_version": "4.57.1", + "use_cache": false +} diff --git a/merges.txt b/merges.txt new file mode 100644 index 0000000..80c1a19 --- /dev/null +++ b/merges.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5 +size 1671853 diff --git a/model-00001-of-00002.safetensors b/model-00001-of-00002.safetensors new file mode 100644 index 0000000..a9824a6 --- /dev/null +++ b/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56533df19205425aa0fe0c42410cfb34d1c4788c3ed3f54dbbe76c752063227a +size 4969539560 diff --git a/model-00002-of-00002.safetensors b/model-00002-of-00002.safetensors new file mode 100644 index 0000000..d29ed3a --- /dev/null +++ b/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f982491ce371a64cad53a6620111d08b328e6f2ff98ddacdf9f2b9bedc452681 +size 1912795688 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..ef81ba6 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,318 @@ +{ + "metadata": { + "total_parameters": 1720574976, + "total_size": 6882299904 + }, + "weight_map": { + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.19.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.19.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/optimizer.pt b/optimizer.pt new file mode 100644 index 0000000..1ffe6cc --- /dev/null +++ b/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62733a94a79aeb5ca4901b53b470a23b13159533b1b83f1faadd1da4e2978a4c +size 13764874747 diff --git a/rng_state.pth b/rng_state.pth new file mode 100644 index 0000000..1a4aac5 --- /dev/null +++ b/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65a73f65829fe900881cb34aa7222cb6fe02ae1d05edd1af3db93a7ebcd0c1a2 +size 14645 diff --git a/scheduler.pt b/scheduler.pt new file mode 100644 index 0000000..195bbd9 --- /dev/null +++ b/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:215c848566646ac7ba21d4d67f26323367af5f47af677081560dd65510d5dd51 +size 1465 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..aa59b33 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,25 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|im_end|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..cd71f61 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4 +size 11422654 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..d7b51cc --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,239 @@ +{ + "add_bos_token": false, + "add_prefix_space": false, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151665": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151666": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151667": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151668": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": {}, + "model_max_length": 131072, + "pad_token": "<|im_end|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..4214a20 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,3289 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 93, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 20.59375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 0.0, + "entropy": 1.1623296411707997, + "epoch": 0.010752688172043012, + "frac_reward_zero_std": 0.5625, + "grad_norm": 2.5920114517211914, + "learning_rate": 0.0, + "loss": -0.0655, + "num_tokens": 209228.0, + "reward": 0.10260416567325592, + "reward_std": 0.12079741060733795, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.01875000074505806, + "rewards/reward_coverage/std": 0.08886408805847168, + "rewards/reward_repetition/mean": 0.08385416865348816, + "rewards/reward_repetition/std": 0.1609596610069275, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.5810753107070923, + "sampling/importance_sampling_ratio/min": 8.851574766937428e-15, + "sampling/sampling_logp_difference/max": 32.35818099975586, + "sampling/sampling_logp_difference/mean": 3.2135589122772217, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 19.34375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 0.0, + "entropy": 1.2037830371409655, + "epoch": 0.021505376344086023, + "frac_reward_zero_std": 0.65625, + "grad_norm": 1.522868275642395, + "learning_rate": 2.5000000000000004e-07, + "loss": -0.0672, + "num_tokens": 411382.0, + "reward": 0.06171875074505806, + "reward_std": 0.08728349208831787, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.0015625000232830644, + "rewards/reward_coverage/std": 0.01250000111758709, + "rewards/reward_repetition/mean": 0.06015624850988388, + "rewards/reward_repetition/std": 0.14286737143993378, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.572687029838562, + "sampling/importance_sampling_ratio/min": 6.514152938275704e-16, + "sampling/sampling_logp_difference/max": 34.967384338378906, + "sampling/sampling_logp_difference/mean": 3.2888572216033936, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 18.015625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 0.0, + "entropy": 1.1908840090036392, + "epoch": 0.03225806451612903, + "frac_reward_zero_std": 0.78125, + "grad_norm": 2.1042428016662598, + "learning_rate": 5.000000000000001e-07, + "loss": -0.0415, + "num_tokens": 589899.0, + "reward": 0.03333333134651184, + "reward_std": 0.03609190881252289, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.0, + "rewards/reward_coverage/std": 0.0, + "rewards/reward_repetition/mean": 0.03333333134651184, + "rewards/reward_repetition/std": 0.09172075986862183, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.582237184047699, + "sampling/importance_sampling_ratio/min": 3.4790040246974845e-15, + "sampling/sampling_logp_difference/max": 33.292030334472656, + "sampling/sampling_logp_difference/mean": 3.1163713932037354, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 18.1875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 0.0, + "entropy": 1.188672631047666, + "epoch": 0.043010752688172046, + "frac_reward_zero_std": 0.78125, + "grad_norm": 1.6742660999298096, + "learning_rate": 7.5e-07, + "loss": -0.0256, + "num_tokens": 784993.0, + "reward": 0.03932292014360428, + "reward_std": 0.046772170811891556, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.0, + "rewards/reward_coverage/std": 0.0, + "rewards/reward_repetition/mean": 0.03932292014360428, + "rewards/reward_repetition/std": 0.10880006849765778, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.5760836005210876, + "sampling/importance_sampling_ratio/min": 4.4242851705367583e-14, + "sampling/sampling_logp_difference/max": 30.749082565307617, + "sampling/sampling_logp_difference/mean": 3.1024224758148193, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 19.296875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 0.0, + "entropy": 1.1812530495226383, + "epoch": 0.053763440860215055, + "frac_reward_zero_std": 0.625, + "grad_norm": 3.190187931060791, + "learning_rate": 1.0000000000000002e-06, + "loss": -0.061, + "num_tokens": 994168.0, + "reward": 0.07604166865348816, + "reward_std": 0.10753916203975677, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.0062500000931322575, + "rewards/reward_coverage/std": 0.035073623061180115, + "rewards/reward_repetition/mean": 0.06979166716337204, + "rewards/reward_repetition/std": 0.1535550206899643, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.5558198690414429, + "sampling/importance_sampling_ratio/min": 1.0327031483188718e-17, + "sampling/sampling_logp_difference/max": 39.11176681518555, + "sampling/sampling_logp_difference/mean": 3.2322075366973877, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 18.40625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 0.0, + "entropy": 1.1822001421824098, + "epoch": 0.06451612903225806, + "frac_reward_zero_std": 0.71875, + "grad_norm": 2.3724122047424316, + "learning_rate": 1.25e-06, + "loss": -0.0569, + "num_tokens": 1177412.0, + "reward": 0.04401041567325592, + "reward_std": 0.062240131199359894, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.0, + "rewards/reward_coverage/std": 0.0, + "rewards/reward_repetition/mean": 0.04401041567325592, + "rewards/reward_repetition/std": 0.11309215426445007, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.5901535749435425, + "sampling/importance_sampling_ratio/min": 2.3622067538522137e-15, + "sampling/sampling_logp_difference/max": 33.67918014526367, + "sampling/sampling_logp_difference/mean": 3.0934648513793945, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 20.203125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 0.0, + "entropy": 1.1859856164082885, + "epoch": 0.07526881720430108, + "frac_reward_zero_std": 0.625, + "grad_norm": 2.2745258808135986, + "learning_rate": 1.5e-06, + "loss": -0.07, + "num_tokens": 1379577.0, + "reward": 0.08828125149011612, + "reward_std": 0.10496117174625397, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.02031249925494194, + "rewards/reward_coverage/std": 0.08578246831893921, + "rewards/reward_repetition/mean": 0.06796875596046448, + "rewards/reward_repetition/std": 0.13983187079429626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.5829952359199524, + "sampling/importance_sampling_ratio/min": 1.7831107242271977e-19, + "sampling/sampling_logp_difference/max": 43.17075729370117, + "sampling/sampling_logp_difference/mean": 3.094505786895752, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 19.765625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 0.0, + "entropy": 1.1645366102457047, + "epoch": 0.08602150537634409, + "frac_reward_zero_std": 0.71875, + "grad_norm": 2.285911798477173, + "learning_rate": 1.75e-06, + "loss": -0.0463, + "num_tokens": 1562860.0, + "reward": 0.06822916865348816, + "reward_std": 0.05671586096286774, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.0, + "rewards/reward_coverage/std": 0.0, + "rewards/reward_repetition/mean": 0.06822916865348816, + "rewards/reward_repetition/std": 0.13447654247283936, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.6059508323669434, + "sampling/importance_sampling_ratio/min": 1.5984405334448e-15, + "sampling/sampling_logp_difference/max": 34.06974792480469, + "sampling/sampling_logp_difference/mean": 3.0470666885375977, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 0.0, + "entropy": 1.2150460854172707, + "epoch": 0.0967741935483871, + "frac_reward_zero_std": 0.5625, + "grad_norm": 2.4473977088928223, + "learning_rate": 2.0000000000000003e-06, + "loss": -0.0729, + "num_tokens": 1750970.0, + "reward": 0.07604166865348816, + "reward_std": 0.10753916203975677, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.004687500186264515, + "rewards/reward_coverage/std": 0.03750000149011612, + "rewards/reward_repetition/mean": 0.07135416567325592, + "rewards/reward_repetition/std": 0.14673006534576416, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.564116358757019, + "sampling/importance_sampling_ratio/min": 5.549738260316673e-19, + "sampling/sampling_logp_difference/max": 42.03536605834961, + "sampling/sampling_logp_difference/mean": 3.24617075920105, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 20.125, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 37.0, + "entropy": 1.1741499826312065, + "epoch": 0.10752688172043011, + "frac_reward_zero_std": 0.5625, + "grad_norm": 2.0894999504089355, + "learning_rate": 2.25e-06, + "loss": -0.0295, + "num_tokens": 1955314.0, + "reward": 0.08229167759418488, + "reward_std": 0.0766032412648201, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.0062500000931322575, + "rewards/reward_coverage/std": 0.05000000447034836, + "rewards/reward_repetition/mean": 0.07604166865348816, + "rewards/reward_repetition/std": 0.12633328139781952, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.5676860809326172, + "sampling/importance_sampling_ratio/min": 1.0015881904035955e-15, + "sampling/sampling_logp_difference/max": 34.53718948364258, + "sampling/sampling_logp_difference/mean": 3.318382978439331, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 20.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 0.0, + "entropy": 1.19177012052387, + "epoch": 0.11827956989247312, + "frac_reward_zero_std": 0.625, + "grad_norm": 2.127878189086914, + "learning_rate": 2.5e-06, + "loss": -0.044, + "num_tokens": 2152810.0, + "reward": 0.08697916567325592, + "reward_std": 0.09207119792699814, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.0, + "rewards/reward_coverage/std": 0.0, + "rewards/reward_repetition/mean": 0.08697916567325592, + "rewards/reward_repetition/std": 0.1668112874031067, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.5731910467147827, + "sampling/importance_sampling_ratio/min": 5.365229931202564e-15, + "sampling/sampling_logp_difference/max": 32.85883712768555, + "sampling/sampling_logp_difference/mean": 3.2824923992156982, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 20.640625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 0.0, + "entropy": 1.2268892796710134, + "epoch": 0.12903225806451613, + "frac_reward_zero_std": 0.53125, + "grad_norm": 2.617131233215332, + "learning_rate": 2.7500000000000004e-06, + "loss": -0.0849, + "num_tokens": 2354987.0, + "reward": 0.10390624403953552, + "reward_std": 0.1211656928062439, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.0078125, + "rewards/reward_coverage/std": 0.0625, + "rewards/reward_repetition/mean": 0.09609375149011612, + "rewards/reward_repetition/std": 0.16450294852256775, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.5544115900993347, + "sampling/importance_sampling_ratio/min": 7.363439390216395e-18, + "sampling/sampling_logp_difference/max": 39.45000457763672, + "sampling/sampling_logp_difference/mean": 3.3240010738372803, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 23.421875, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 34.0, + "entropy": 1.1650395467877388, + "epoch": 0.13978494623655913, + "frac_reward_zero_std": 0.375, + "grad_norm": 2.670605182647705, + "learning_rate": 3e-06, + "loss": -0.0758, + "num_tokens": 2616268.0, + "reward": 0.18046876788139343, + "reward_std": 0.17567184567451477, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.04218750074505806, + "rewards/reward_coverage/std": 0.13190266489982605, + "rewards/reward_repetition/mean": 0.13828124105930328, + "rewards/reward_repetition/std": 0.18624858558177948, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.5682023167610168, + "sampling/importance_sampling_ratio/min": 1.0665693960122588e-19, + "sampling/sampling_logp_difference/max": 43.684669494628906, + "sampling/sampling_logp_difference/mean": 3.6000094413757324, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 40.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 27.984375, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 34.0, + "entropy": 1.111030412837863, + "epoch": 0.15053763440860216, + "frac_reward_zero_std": 0.21875, + "grad_norm": 2.47416615486145, + "learning_rate": 3.2500000000000002e-06, + "loss": -0.0662, + "num_tokens": 2906261.0, + "reward": 0.2278645932674408, + "reward_std": 0.17272555828094482, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.03593749925494194, + "rewards/reward_coverage/std": 0.09489709138870239, + "rewards/reward_repetition/mean": 0.19192710518836975, + "rewards/reward_repetition/std": 0.19924886524677277, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.540465772151947, + "sampling/importance_sampling_ratio/min": 4.225438084884613e-17, + "sampling/sampling_logp_difference/max": 37.702823638916016, + "sampling/sampling_logp_difference/mean": 3.798464298248291, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 28.203125, + "completions/mean_terminated_length": 35.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 34.0, + "entropy": 1.0863127624616027, + "epoch": 0.16129032258064516, + "frac_reward_zero_std": 0.15625, + "grad_norm": 3.092238664627075, + "learning_rate": 3.5e-06, + "loss": -0.0628, + "num_tokens": 3202938.0, + "reward": 0.30156248807907104, + "reward_std": 0.19813722372055054, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.0390625, + "rewards/reward_coverage/std": 0.10483121871948242, + "rewards/reward_repetition/mean": 0.26249998807907104, + "rewards/reward_repetition/std": 0.20803949236869812, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.5526077151298523, + "sampling/importance_sampling_ratio/min": 3.7098329921141575e-20, + "sampling/sampling_logp_difference/max": 44.74071502685547, + "sampling/sampling_logp_difference/mean": 3.7845804691314697, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 30.90625, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 34.0, + "entropy": 1.0002785623073578, + "epoch": 0.17204301075268819, + "frac_reward_zero_std": 0.09375, + "grad_norm": 2.3450825214385986, + "learning_rate": 3.7500000000000005e-06, + "loss": -0.0623, + "num_tokens": 3523934.0, + "reward": 0.41588544845581055, + "reward_std": 0.2868938446044922, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.08281250298023224, + "rewards/reward_coverage/std": 0.15384459495544434, + "rewards/reward_repetition/mean": 0.3330729007720947, + "rewards/reward_repetition/std": 0.2281644642353058, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.544209361076355, + "sampling/importance_sampling_ratio/min": 1.4981450562263129e-16, + "sampling/sampling_logp_difference/max": 36.4371337890625, + "sampling/sampling_logp_difference/mean": 4.021841526031494, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 33.53125, + "completions/mean_terminated_length": 34.333335876464844, + "completions/min_length": 16.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.8482576478272676, + "epoch": 0.1827956989247312, + "frac_reward_zero_std": 0.15625, + "grad_norm": 1.6775325536727905, + "learning_rate": 4.000000000000001e-06, + "loss": -0.0187, + "num_tokens": 3875784.0, + "reward": 0.5098958015441895, + "reward_std": 0.20402978360652924, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.13437500596046448, + "rewards/reward_coverage/std": 0.16639859974384308, + "rewards/reward_repetition/mean": 0.37552082538604736, + "rewards/reward_repetition/std": 0.17110413312911987, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.5547957420349121, + "sampling/importance_sampling_ratio/min": 1.9580492778949622e-20, + "sampling/sampling_logp_difference/max": 45.37975311279297, + "sampling/sampling_logp_difference/mean": 3.881425142288208, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.921875, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 36.400001525878906, + "completions/min_length": 16.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.7285963352769613, + "epoch": 0.1935483870967742, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1.9497219324111938, + "learning_rate": 4.25e-06, + "loss": -0.0222, + "num_tokens": 4230450.0, + "reward": 0.6015625596046448, + "reward_std": 0.17456699907779694, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.15000000596046448, + "rewards/reward_coverage/std": 0.18856181204319, + "rewards/reward_repetition/mean": 0.4515624940395355, + "rewards/reward_repetition/std": 0.16296739876270294, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.5840170383453369, + "sampling/importance_sampling_ratio/min": 1.3230608049300072e-14, + "sampling/sampling_logp_difference/max": 31.95624351501465, + "sampling/sampling_logp_difference/mean": 3.7303450107574463, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 35.5, + "completions/mean_terminated_length": 35.333335876464844, + "completions/min_length": 16.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.7492767116054893, + "epoch": 0.20430107526881722, + "frac_reward_zero_std": 0.15625, + "grad_norm": 2.027407169342041, + "learning_rate": 4.5e-06, + "loss": -0.0178, + "num_tokens": 4592424.0, + "reward": 0.5973958373069763, + "reward_std": 0.2113954722881317, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.13124999403953552, + "rewards/reward_coverage/std": 0.1670234352350235, + "rewards/reward_repetition/mean": 0.4661458432674408, + "rewards/reward_repetition/std": 0.17673227190971375, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.5936897993087769, + "sampling/importance_sampling_ratio/min": 3.848576383998589e-20, + "sampling/sampling_logp_difference/max": 44.70399856567383, + "sampling/sampling_logp_difference/mean": 3.7467870712280273, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 36.33333206176758, + "completions/min_length": 24.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.6496950350701809, + "epoch": 0.21505376344086022, + "frac_reward_zero_std": 0.15625, + "grad_norm": 1.7340210676193237, + "learning_rate": 4.75e-06, + "loss": -0.0028, + "num_tokens": 4973854.0, + "reward": 0.6968749761581421, + "reward_std": 0.1944543719291687, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.1875, + "rewards/reward_coverage/std": 0.17320507764816284, + "rewards/reward_repetition/mean": 0.5093749761581421, + "rewards/reward_repetition/std": 0.14407385885715485, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.6289246678352356, + "sampling/importance_sampling_ratio/min": 1.4430105039911333e-19, + "sampling/sampling_logp_difference/max": 43.38238525390625, + "sampling/sampling_logp_difference/mean": 3.522400379180908, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 37.96875, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.6573502826504409, + "epoch": 0.22580645161290322, + "frac_reward_zero_std": 0.125, + "grad_norm": 1.9576199054718018, + "learning_rate": 5e-06, + "loss": -0.0085, + "num_tokens": 5357700.0, + "reward": 0.715624988079071, + "reward_std": 0.1944543719291687, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.17499999701976776, + "rewards/reward_coverage/std": 0.1736626923084259, + "rewards/reward_repetition/mean": 0.5406249761581421, + "rewards/reward_repetition/std": 0.1649615317583084, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.666456401348114, + "sampling/importance_sampling_ratio/min": 1.55370423422568e-14, + "sampling/sampling_logp_difference/max": 31.795549392700195, + "sampling/sampling_logp_difference/mean": 3.181442975997925, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 38.859375, + "completions/mean_terminated_length": 36.66666793823242, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.5784921627491713, + "epoch": 0.23655913978494625, + "frac_reward_zero_std": 0.09375, + "grad_norm": 1.6491539478302002, + "learning_rate": 4.931506849315069e-06, + "loss": 0.009, + "num_tokens": 5746525.0, + "reward": 0.792187511920929, + "reward_std": 0.1834058165550232, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.20156249403953552, + "rewards/reward_coverage/std": 0.17772118747234344, + "rewards/reward_repetition/mean": 0.5906250476837158, + "rewards/reward_repetition/std": 0.1399759203195572, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.6919029355049133, + "sampling/importance_sampling_ratio/min": 2.5034810497841535e-16, + "sampling/sampling_logp_difference/max": 35.92367935180664, + "sampling/sampling_logp_difference/mean": 3.11592435836792, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 39.03125, + "completions/mean_terminated_length": 35.5, + "completions/min_length": 32.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.5693019391037524, + "epoch": 0.24731182795698925, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1.9411438703536987, + "learning_rate": 4.863013698630138e-06, + "loss": -0.0017, + "num_tokens": 6135539.0, + "reward": 0.859375, + "reward_std": 0.19887377321720123, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.20156249403953552, + "rewards/reward_coverage/std": 0.16855332255363464, + "rewards/reward_repetition/mean": 0.6578124761581421, + "rewards/reward_repetition/std": 0.1950211226940155, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.6962894201278687, + "sampling/importance_sampling_ratio/min": 5.379221697881343e-17, + "sampling/sampling_logp_difference/max": 37.461402893066406, + "sampling/sampling_logp_difference/mean": 2.938572645187378, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.921875, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 39.109375, + "completions/mean_terminated_length": 35.20000076293945, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.521631199400872, + "epoch": 0.25806451612903225, + "frac_reward_zero_std": 0.125, + "grad_norm": 2.231412172317505, + "learning_rate": 4.7945205479452054e-06, + "loss": -0.0117, + "num_tokens": 6524620.0, + "reward": 0.862500011920929, + "reward_std": 0.20329320430755615, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.18437500298023224, + "rewards/reward_coverage/std": 0.1801399439573288, + "rewards/reward_repetition/mean": 0.6781250238418579, + "rewards/reward_repetition/std": 0.1656816154718399, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7251441478729248, + "sampling/importance_sampling_ratio/min": 1.906062915100565e-20, + "sampling/sampling_logp_difference/max": 45.40666198730469, + "sampling/sampling_logp_difference/mean": 2.8243846893310547, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 40.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 39.484375, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.46869752556085587, + "epoch": 0.26881720430107525, + "frac_reward_zero_std": 0.0625, + "grad_norm": 1.5307422876358032, + "learning_rate": 4.726027397260274e-06, + "loss": -0.009, + "num_tokens": 6914737.0, + "reward": 0.9578125476837158, + "reward_std": 0.19224464893341064, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.2109375, + "rewards/reward_coverage/std": 0.16914087533950806, + "rewards/reward_repetition/mean": 0.7468750476837158, + "rewards/reward_repetition/std": 0.1603258103132248, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7427772879600525, + "sampling/importance_sampling_ratio/min": 6.379089188001887e-19, + "sampling/sampling_logp_difference/max": 41.89609146118164, + "sampling/sampling_logp_difference/mean": 2.6809170246124268, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 39.53125, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.44641283014789224, + "epoch": 0.27956989247311825, + "frac_reward_zero_std": 0.15625, + "grad_norm": 1.65163254737854, + "learning_rate": 4.657534246575343e-06, + "loss": -0.0002, + "num_tokens": 7305211.0, + "reward": 0.9937499761581421, + "reward_std": 0.18119609355926514, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.171875, + "rewards/reward_coverage/std": 0.15272004902362823, + "rewards/reward_repetition/mean": 0.8218749761581421, + "rewards/reward_repetition/std": 0.15580691397190094, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.742205023765564, + "sampling/importance_sampling_ratio/min": 1.740283318765294e-20, + "sampling/sampling_logp_difference/max": 45.49765396118164, + "sampling/sampling_logp_difference/mean": 2.5815634727478027, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.5625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.39991177897900343, + "epoch": 0.2903225806451613, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.0949409008026123, + "learning_rate": 4.589041095890411e-06, + "loss": -0.0053, + "num_tokens": 7695441.0, + "reward": 1.0, + "reward_std": 0.2121320217847824, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.16874998807907104, + "rewards/reward_coverage/std": 0.1780627816915512, + "rewards/reward_repetition/mean": 0.831250011920929, + "rewards/reward_repetition/std": 0.14786845445632935, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7544648051261902, + "sampling/importance_sampling_ratio/min": 6.676384620995922e-19, + "sampling/sampling_logp_difference/max": 41.85054016113281, + "sampling/sampling_logp_difference/mean": 2.4634246826171875, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 39.5625, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3994480683468282, + "epoch": 0.3010752688172043, + "frac_reward_zero_std": 0.09375, + "grad_norm": 1.35282301902771, + "learning_rate": 4.52054794520548e-06, + "loss": -0.0062, + "num_tokens": 8086007.0, + "reward": 1.015625, + "reward_std": 0.20329320430755615, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.21250000596046448, + "rewards/reward_coverage/std": 0.1685606688261032, + "rewards/reward_repetition/mean": 0.8031250238418579, + "rewards/reward_repetition/std": 0.15732762217521667, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7630195617675781, + "sampling/importance_sampling_ratio/min": 6.418834864883299e-17, + "sampling/sampling_logp_difference/max": 37.28470993041992, + "sampling/sampling_logp_difference/mean": 2.45412540435791, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 39.609375, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.33232213323935866, + "epoch": 0.3118279569892473, + "frac_reward_zero_std": 0.21875, + "grad_norm": 1.4301323890686035, + "learning_rate": 4.4520547945205486e-06, + "loss": -0.0026, + "num_tokens": 8476412.0, + "reward": 1.0406250953674316, + "reward_std": 0.15026018023490906, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.21562501788139343, + "rewards/reward_coverage/std": 0.14498905837535858, + "rewards/reward_repetition/mean": 0.824999988079071, + "rewards/reward_repetition/std": 0.14474937319755554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7893605828285217, + "sampling/importance_sampling_ratio/min": 6.775734351848958e-14, + "sampling/sampling_logp_difference/max": 30.322843551635742, + "sampling/sampling_logp_difference/mean": 2.41098690032959, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.65625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.31409848271869123, + "epoch": 0.3225806451612903, + "frac_reward_zero_std": 0.125, + "grad_norm": 1.2845097780227661, + "learning_rate": 4.383561643835616e-06, + "loss": -0.01, + "num_tokens": 8866724.0, + "reward": 1.0109374523162842, + "reward_std": 0.1480504870414734, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.18593749403953552, + "rewards/reward_coverage/std": 0.13076962530612946, + "rewards/reward_repetition/mean": 0.824999988079071, + "rewards/reward_repetition/std": 0.14907118678092957, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7840973734855652, + "sampling/importance_sampling_ratio/min": 1.471832771813246e-16, + "sampling/sampling_logp_difference/max": 36.45485305786133, + "sampling/sampling_logp_difference/mean": 2.4330239295959473, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.796875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.3091448312625289, + "epoch": 0.3333333333333333, + "frac_reward_zero_std": 0.0625, + "grad_norm": 1.2038891315460205, + "learning_rate": 4.315068493150685e-06, + "loss": -0.0041, + "num_tokens": 9257223.0, + "reward": 1.017187476158142, + "reward_std": 0.20550289750099182, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.20468750596046448, + "rewards/reward_coverage/std": 0.1803257167339325, + "rewards/reward_repetition/mean": 0.8125, + "rewards/reward_repetition/std": 0.1374368518590927, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7869740128517151, + "sampling/importance_sampling_ratio/min": 8.298585072157721e-15, + "sampling/sampling_logp_difference/max": 32.422691345214844, + "sampling/sampling_logp_difference/mean": 2.4976119995117188, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 39.890625, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.2876437115482986, + "epoch": 0.34408602150537637, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.9871541261672974, + "learning_rate": 4.246575342465754e-06, + "loss": 0.0037, + "num_tokens": 9647784.0, + "reward": 1.032812476158142, + "reward_std": 0.17456698417663574, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.18281251192092896, + "rewards/reward_coverage/std": 0.15384458005428314, + "rewards/reward_repetition/mean": 0.8500000238418579, + "rewards/reward_repetition/std": 0.1380131095647812, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7916483283042908, + "sampling/importance_sampling_ratio/min": 1.612893541327095e-14, + "sampling/sampling_logp_difference/max": 31.758161544799805, + "sampling/sampling_logp_difference/mean": 2.4329771995544434, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.796875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.27647654921747744, + "epoch": 0.3548387096774194, + "frac_reward_zero_std": 0.15625, + "grad_norm": 1.4954354763031006, + "learning_rate": 4.178082191780822e-06, + "loss": -0.0059, + "num_tokens": 10038289.0, + "reward": 1.0218749046325684, + "reward_std": 0.18119610846042633, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.20937499403953552, + "rewards/reward_coverage/std": 0.1687665432691574, + "rewards/reward_repetition/mean": 0.8125, + "rewards/reward_repetition/std": 0.12279806286096573, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7887318730354309, + "sampling/importance_sampling_ratio/min": 8.831320397643036e-19, + "sampling/sampling_logp_difference/max": 41.5708122253418, + "sampling/sampling_logp_difference/mean": 2.5059313774108887, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.84375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.27092319959774613, + "epoch": 0.3655913978494624, + "frac_reward_zero_std": 0.09375, + "grad_norm": 1.6384416818618774, + "learning_rate": 4.109589041095891e-06, + "loss": -0.0034, + "num_tokens": 10428797.0, + "reward": 1.0593750476837158, + "reward_std": 0.17235726118087769, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.23125001788139343, + "rewards/reward_coverage/std": 0.15314172208309174, + "rewards/reward_repetition/mean": 0.828125, + "rewards/reward_repetition/std": 0.12782520055770874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7671718597412109, + "sampling/importance_sampling_ratio/min": 6.987482079498287e-19, + "sampling/sampling_logp_difference/max": 41.804996490478516, + "sampling/sampling_logp_difference/mean": 2.6257989406585693, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.8125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.2952587741892785, + "epoch": 0.3763440860215054, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1.0413151979446411, + "learning_rate": 4.0410958904109595e-06, + "loss": -0.0099, + "num_tokens": 10819301.0, + "reward": 0.9296875596046448, + "reward_std": 0.13921163976192474, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.16093750298023224, + "rewards/reward_coverage/std": 0.12550494074821472, + "rewards/reward_repetition/mean": 0.768750011920929, + "rewards/reward_repetition/std": 0.11391307413578033, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7573365569114685, + "sampling/importance_sampling_ratio/min": 1.5061544475743168e-18, + "sampling/sampling_logp_difference/max": 41.03697204589844, + "sampling/sampling_logp_difference/mean": 2.6861050128936768, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 39.953125, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.27068308740854263, + "epoch": 0.3870967741935484, + "frac_reward_zero_std": 0.15625, + "grad_norm": 1.277216911315918, + "learning_rate": 3.972602739726027e-06, + "loss": -0.0004, + "num_tokens": 11209548.0, + "reward": 0.989062488079071, + "reward_std": 0.17014756798744202, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.21406251192092896, + "rewards/reward_coverage/std": 0.17262418568134308, + "rewards/reward_repetition/mean": 0.7749999761581421, + "rewards/reward_repetition/std": 0.1154700517654419, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7654402256011963, + "sampling/importance_sampling_ratio/min": 4.38677482468737e-14, + "sampling/sampling_logp_difference/max": 30.757596969604492, + "sampling/sampling_logp_difference/mean": 2.659276008605957, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 39.796875, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.2760939297731966, + "epoch": 0.3978494623655914, + "frac_reward_zero_std": 0.15625, + "grad_norm": 1.504520297050476, + "learning_rate": 3.904109589041096e-06, + "loss": -0.0042, + "num_tokens": 11599953.0, + "reward": 0.9546874761581421, + "reward_std": 0.15246990323066711, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.17656250298023224, + "rewards/reward_coverage/std": 0.1318274438381195, + "rewards/reward_repetition/mean": 0.7781250476837158, + "rewards/reward_repetition/std": 0.12404395639896393, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7589582800865173, + "sampling/importance_sampling_ratio/min": 1.2950150141564556e-22, + "sampling/sampling_logp_difference/max": 50.39834976196289, + "sampling/sampling_logp_difference/mean": 2.7161073684692383, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 40.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.27736608777195215, + "epoch": 0.40860215053763443, + "frac_reward_zero_std": 0.125, + "grad_norm": 1.6770557165145874, + "learning_rate": 3.8356164383561645e-06, + "loss": -0.0018, + "num_tokens": 11990363.0, + "reward": 0.926562488079071, + "reward_std": 0.14363107085227966, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.17031250894069672, + "rewards/reward_coverage/std": 0.15293914079666138, + "rewards/reward_repetition/mean": 0.7562500238418579, + "rewards/reward_repetition/std": 0.1152980849146843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7690252661705017, + "sampling/importance_sampling_ratio/min": 5.327883295646056e-16, + "sampling/sampling_logp_difference/max": 35.16840744018555, + "sampling/sampling_logp_difference/mean": 2.6216495037078857, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 39.8125, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.30535601382143795, + "epoch": 0.41935483870967744, + "frac_reward_zero_std": 0.09375, + "grad_norm": 1.455296277999878, + "learning_rate": 3.767123287671233e-06, + "loss": -0.0019, + "num_tokens": 12380581.0, + "reward": 0.948437511920929, + "reward_std": 0.2010834813117981, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.1796875, + "rewards/reward_coverage/std": 0.16825877130031586, + "rewards/reward_repetition/mean": 0.7687499523162842, + "rewards/reward_repetition/std": 0.12456272542476654, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7560758590698242, + "sampling/importance_sampling_ratio/min": 1.0004220113236836e-18, + "sampling/sampling_logp_difference/max": 41.446109771728516, + "sampling/sampling_logp_difference/mean": 2.5334110260009766, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.90625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.30565810902044177, + "epoch": 0.43010752688172044, + "frac_reward_zero_std": 0.03125, + "grad_norm": 1.2029423713684082, + "learning_rate": 3.6986301369863014e-06, + "loss": -0.0015, + "num_tokens": 12770797.0, + "reward": 0.96875, + "reward_std": 0.19003495573997498, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.19062501192092896, + "rewards/reward_coverage/std": 0.15504096448421478, + "rewards/reward_repetition/mean": 0.778124988079071, + "rewards/reward_repetition/std": 0.12404395639896393, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7597452402114868, + "sampling/importance_sampling_ratio/min": 1.379195585862747e-12, + "sampling/sampling_logp_difference/max": 27.309520721435547, + "sampling/sampling_logp_difference/mean": 2.4547033309936523, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 39.6875, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.31394060072489083, + "epoch": 0.44086021505376344, + "frac_reward_zero_std": 0.0625, + "grad_norm": 1.7512142658233643, + "learning_rate": 3.6301369863013704e-06, + "loss": 0.0003, + "num_tokens": 13161191.0, + "reward": 1.009374976158142, + "reward_std": 0.18561550974845886, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.22499999403953552, + "rewards/reward_coverage/std": 0.15936382114887238, + "rewards/reward_repetition/mean": 0.7843749523162842, + "rewards/reward_repetition/std": 0.13940775394439697, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7532055974006653, + "sampling/importance_sampling_ratio/min": 3.4107995156513595e-17, + "sampling/sampling_logp_difference/max": 37.91699981689453, + "sampling/sampling_logp_difference/mean": 2.571798086166382, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 40.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.301532520679757, + "epoch": 0.45161290322580644, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1.6585807800292969, + "learning_rate": 3.5616438356164386e-06, + "loss": -0.0042, + "num_tokens": 13551781.0, + "reward": 0.953125, + "reward_std": 0.13258251547813416, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.19375000894069672, + "rewards/reward_coverage/std": 0.1562202423810959, + "rewards/reward_repetition/mean": 0.7593749761581421, + "rewards/reward_repetition/std": 0.10796640068292618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7576152682304382, + "sampling/importance_sampling_ratio/min": 4.25440330295826e-18, + "sampling/sampling_logp_difference/max": 39.99857711791992, + "sampling/sampling_logp_difference/mean": 2.5987966060638428, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.953125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.2994292816147208, + "epoch": 0.46236559139784944, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1.1511462926864624, + "learning_rate": 3.4931506849315072e-06, + "loss": -0.0023, + "num_tokens": 13941920.0, + "reward": 0.9578125476837158, + "reward_std": 0.12595339119434357, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.19218748807907104, + "rewards/reward_coverage/std": 0.15045401453971863, + "rewards/reward_repetition/mean": 0.765625, + "rewards/reward_repetition/std": 0.10422617197036743, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7622615694999695, + "sampling/importance_sampling_ratio/min": 1.1524427466063367e-15, + "sampling/sampling_logp_difference/max": 34.39689254760742, + "sampling/sampling_logp_difference/mean": 2.527230978012085, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 39.9375, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.2939116738270968, + "epoch": 0.4731182795698925, + "frac_reward_zero_std": 0.21875, + "grad_norm": 1.2386677265167236, + "learning_rate": 3.4246575342465754e-06, + "loss": -0.0099, + "num_tokens": 14332340.0, + "reward": 0.9515625238418579, + "reward_std": 0.1480504721403122, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.19218750298023224, + "rewards/reward_coverage/std": 0.15461647510528564, + "rewards/reward_repetition/mean": 0.7593749761581421, + "rewards/reward_repetition/std": 0.12436345219612122, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7619537711143494, + "sampling/importance_sampling_ratio/min": 3.6812737413604546e-19, + "sampling/sampling_logp_difference/max": 42.445858001708984, + "sampling/sampling_logp_difference/mean": 2.5976765155792236, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.890625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.2781213163398206, + "epoch": 0.4838709677419355, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.170350193977356, + "learning_rate": 3.356164383561644e-06, + "loss": -0.0036, + "num_tokens": 14722931.0, + "reward": 0.9953124523162842, + "reward_std": 0.12153396755456924, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.1953125, + "rewards/reward_coverage/std": 0.14412261545658112, + "rewards/reward_repetition/mean": 0.800000011920929, + "rewards/reward_repetition/std": 0.08728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7608794569969177, + "sampling/importance_sampling_ratio/min": 3.331248850987206e-13, + "sampling/sampling_logp_difference/max": 28.73025894165039, + "sampling/sampling_logp_difference/mean": 2.6023521423339844, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 39.84375, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.27238480327650905, + "epoch": 0.4946236559139785, + "frac_reward_zero_std": 0.03125, + "grad_norm": 1.1889655590057373, + "learning_rate": 3.2876712328767123e-06, + "loss": -0.005, + "num_tokens": 15113515.0, + "reward": 0.9437500238418579, + "reward_std": 0.17235726118087769, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.19062499701976776, + "rewards/reward_coverage/std": 0.14333748817443848, + "rewards/reward_repetition/mean": 0.7531249523162842, + "rewards/reward_repetition/std": 0.09915315359830856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7437648177146912, + "sampling/importance_sampling_ratio/min": 3.385970521172965e-19, + "sampling/sampling_logp_difference/max": 42.529476165771484, + "sampling/sampling_logp_difference/mean": 2.778402328491211, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 39.890625, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.24875370506197214, + "epoch": 0.5053763440860215, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1.1314892768859863, + "learning_rate": 3.2191780821917813e-06, + "loss": -0.0031, + "num_tokens": 15503912.0, + "reward": 0.942187488079071, + "reward_std": 0.15688931941986084, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.18906250596046448, + "rewards/reward_coverage/std": 0.12230224162340164, + "rewards/reward_repetition/mean": 0.7531249523162842, + "rewards/reward_repetition/std": 0.12210943549871445, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7534346580505371, + "sampling/importance_sampling_ratio/min": 3.298192560128319e-15, + "sampling/sampling_logp_difference/max": 33.345401763916016, + "sampling/sampling_logp_difference/mean": 2.7750205993652344, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 40.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 39.875, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.24461835296824574, + "epoch": 0.5161290322580645, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.6554945111274719, + "learning_rate": 3.1506849315068495e-06, + "loss": -0.0048, + "num_tokens": 15894138.0, + "reward": 0.953125, + "reward_std": 0.16793785989284515, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.21875, + "rewards/reward_coverage/std": 0.13554710149765015, + "rewards/reward_repetition/mean": 0.734375, + "rewards/reward_repetition/std": 0.12372364103794098, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7367614507675171, + "sampling/importance_sampling_ratio/min": 1.2913128950274503e-18, + "sampling/sampling_logp_difference/max": 41.19087219238281, + "sampling/sampling_logp_difference/mean": 2.8969175815582275, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.953125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.221066806698218, + "epoch": 0.5268817204301075, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.813700795173645, + "learning_rate": 3.082191780821918e-06, + "loss": -0.0125, + "num_tokens": 16284827.0, + "reward": 0.9203125238418579, + "reward_std": 0.1657281517982483, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.18906250596046448, + "rewards/reward_coverage/std": 0.1310727298259735, + "rewards/reward_repetition/mean": 0.7312500476837158, + "rewards/reward_repetition/std": 0.12456272542476654, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7454463839530945, + "sampling/importance_sampling_ratio/min": 2.6984808100466543e-19, + "sampling/sampling_logp_difference/max": 42.75642776489258, + "sampling/sampling_logp_difference/mean": 3.0947470664978027, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 40.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.2340390719473362, + "epoch": 0.5376344086021505, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1.1709717512130737, + "learning_rate": 3.0136986301369864e-06, + "loss": -0.0098, + "num_tokens": 16675257.0, + "reward": 0.9140625596046448, + "reward_std": 0.13479222357273102, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.18906250596046448, + "rewards/reward_coverage/std": 0.14155283570289612, + "rewards/reward_repetition/mean": 0.7250000238418579, + "rewards/reward_repetition/std": 0.10983392596244812, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.732937216758728, + "sampling/importance_sampling_ratio/min": 1.6756041496787032e-16, + "sampling/sampling_logp_difference/max": 36.32518768310547, + "sampling/sampling_logp_difference/mean": 3.183100700378418, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 39.78125, + "completions/mean_terminated_length": 39.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.20713584939949214, + "epoch": 0.5483870967741935, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.8335843086242676, + "learning_rate": 2.945205479452055e-06, + "loss": -0.0031, + "num_tokens": 17065845.0, + "reward": 0.953125, + "reward_std": 0.15026019513607025, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.21875, + "rewards/reward_coverage/std": 0.1390158236026764, + "rewards/reward_repetition/mean": 0.734375, + "rewards/reward_repetition/std": 0.10722880065441132, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.739495038986206, + "sampling/importance_sampling_ratio/min": 4.677705163318169e-13, + "sampling/sampling_logp_difference/max": 28.390798568725586, + "sampling/sampling_logp_difference/mean": 3.198160409927368, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 39.9375, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.19506761734373868, + "epoch": 0.5591397849462365, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.6900437474250793, + "learning_rate": 2.876712328767123e-06, + "loss": -0.0002, + "num_tokens": 17456339.0, + "reward": 0.9359375238418579, + "reward_std": 0.17898640036582947, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.20468750596046448, + "rewards/reward_coverage/std": 0.17129728198051453, + "rewards/reward_repetition/mean": 0.731249988079071, + "rewards/reward_repetition/std": 0.09574270248413086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7362239956855774, + "sampling/importance_sampling_ratio/min": 2.602479520623457e-19, + "sampling/sampling_logp_difference/max": 42.79265213012695, + "sampling/sampling_logp_difference/mean": 3.2387518882751465, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 40.0, + "completions/mean_terminated_length": 40.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.1951053044758737, + "epoch": 0.5698924731182796, + "frac_reward_zero_std": 0.125, + "grad_norm": 1.0901892185211182, + "learning_rate": 2.8082191780821922e-06, + "loss": -0.0096, + "num_tokens": 17846929.0, + "reward": 0.9390625357627869, + "reward_std": 0.19224466383457184, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.2109375, + "rewards/reward_coverage/std": 0.17193317413330078, + "rewards/reward_repetition/mean": 0.7281249761581421, + "rewards/reward_repetition/std": 0.10307764261960983, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7550839781761169, + "sampling/importance_sampling_ratio/min": 1.1955911409525077e-18, + "sampling/sampling_logp_difference/max": 41.26789093017578, + "sampling/sampling_logp_difference/mean": 3.1413958072662354, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 39.765625, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.21364939608611166, + "epoch": 0.5806451612903226, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1.0280330181121826, + "learning_rate": 2.7397260273972604e-06, + "loss": -0.0093, + "num_tokens": 18237620.0, + "reward": 0.9500000476837158, + "reward_std": 0.1414213478565216, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.19062501192092896, + "rewards/reward_coverage/std": 0.1540137678384781, + "rewards/reward_repetition/mean": 0.7593749761581421, + "rewards/reward_repetition/std": 0.0885845422744751, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.738981306552887, + "sampling/importance_sampling_ratio/min": 4.2308257591772147e-13, + "sampling/sampling_logp_difference/max": 28.491209030151367, + "sampling/sampling_logp_difference/mean": 3.156113624572754, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 39.890625, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.24034091946668923, + "epoch": 0.5913978494623656, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1.18429696559906, + "learning_rate": 2.671232876712329e-06, + "loss": -0.0049, + "num_tokens": 18628129.0, + "reward": 0.8953125476837158, + "reward_std": 0.1657281517982483, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.1796875, + "rewards/reward_coverage/std": 0.1299324631690979, + "rewards/reward_repetition/mean": 0.715624988079071, + "rewards/reward_repetition/std": 0.11158134788274765, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7346891760826111, + "sampling/importance_sampling_ratio/min": 1.511831024952892e-13, + "sampling/sampling_logp_difference/max": 29.52028465270996, + "sampling/sampling_logp_difference/mean": 3.2208282947540283, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 39.84375, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.23257427848875523, + "epoch": 0.6021505376344086, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.6966381072998047, + "learning_rate": 2.6027397260273973e-06, + "loss": -0.0088, + "num_tokens": 19018723.0, + "reward": 0.9343750476837158, + "reward_std": 0.15026018023490906, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.19687500596046448, + "rewards/reward_coverage/std": 0.1284446120262146, + "rewards/reward_repetition/mean": 0.737500011920929, + "rewards/reward_repetition/std": 0.106159508228302, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7426254153251648, + "sampling/importance_sampling_ratio/min": 7.579855932368998e-14, + "sampling/sampling_logp_difference/max": 30.210697174072266, + "sampling/sampling_logp_difference/mean": 3.221386194229126, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.953125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.2281794489827007, + "epoch": 0.6129032258064516, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.7310183048248291, + "learning_rate": 2.534246575342466e-06, + "loss": -0.0083, + "num_tokens": 19409232.0, + "reward": 0.9671875238418579, + "reward_std": 0.14363107085227966, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.22968751192092896, + "rewards/reward_coverage/std": 0.14979319274425507, + "rewards/reward_repetition/mean": 0.737500011920929, + "rewards/reward_repetition/std": 0.10000000149011612, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7459241151809692, + "sampling/importance_sampling_ratio/min": 1.3522516009469616e-19, + "sampling/sampling_logp_difference/max": 43.44734573364258, + "sampling/sampling_logp_difference/mean": 3.226292133331299, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.953125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.24348380486480892, + "epoch": 0.6236559139784946, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.9412605166435242, + "learning_rate": 2.4657534246575345e-06, + "loss": -0.0063, + "num_tokens": 19799925.0, + "reward": 0.948437511920929, + "reward_std": 0.15246990323066711, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.20156249403953552, + "rewards/reward_coverage/std": 0.14637655019760132, + "rewards/reward_repetition/mean": 0.746874988079071, + "rewards/reward_repetition/std": 0.10833332687616348, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7302582263946533, + "sampling/importance_sampling_ratio/min": 1.1974077329752507e-18, + "sampling/sampling_logp_difference/max": 41.26637268066406, + "sampling/sampling_logp_difference/mean": 3.3912689685821533, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.953125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.22413912834599614, + "epoch": 0.6344086021505376, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.6372097730636597, + "learning_rate": 2.3972602739726027e-06, + "loss": -0.0078, + "num_tokens": 20190710.0, + "reward": 0.953125, + "reward_std": 0.11048543453216553, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.1875, + "rewards/reward_coverage/std": 0.1278640329837799, + "rewards/reward_repetition/mean": 0.765625, + "rewards/reward_repetition/std": 0.08398554474115372, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7377102971076965, + "sampling/importance_sampling_ratio/min": 5.862870793152246e-21, + "sampling/sampling_logp_difference/max": 46.58564758300781, + "sampling/sampling_logp_difference/mean": 3.426024913787842, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.90625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.23593324795365334, + "epoch": 0.6451612903225806, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.7991491556167603, + "learning_rate": 2.3287671232876713e-06, + "loss": -0.0066, + "num_tokens": 20581308.0, + "reward": 0.9515625238418579, + "reward_std": 0.13921165466308594, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.18906250596046448, + "rewards/reward_coverage/std": 0.12487097084522247, + "rewards/reward_repetition/mean": 0.7625000476837158, + "rewards/reward_repetition/std": 0.11751393228769302, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7420527935028076, + "sampling/importance_sampling_ratio/min": 1.627108762957747e-23, + "sampling/sampling_logp_difference/max": 52.472652435302734, + "sampling/sampling_logp_difference/mean": 3.326465606689453, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.9375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.25969044235534966, + "epoch": 0.6559139784946236, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.9682433009147644, + "learning_rate": 2.26027397260274e-06, + "loss": -0.0109, + "num_tokens": 20972092.0, + "reward": 0.9828125238418579, + "reward_std": 0.15688931941986084, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.20781250298023224, + "rewards/reward_coverage/std": 0.1336955726146698, + "rewards/reward_repetition/mean": 0.7749999761581421, + "rewards/reward_repetition/std": 0.09759000688791275, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7352744936943054, + "sampling/importance_sampling_ratio/min": 4.3607164320474956e-13, + "sampling/sampling_logp_difference/max": 28.460969924926758, + "sampling/sampling_logp_difference/mean": 3.418292999267578, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.953125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.23248756467364728, + "epoch": 0.6666666666666666, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.9272934198379517, + "learning_rate": 2.191780821917808e-06, + "loss": -0.0034, + "num_tokens": 21362873.0, + "reward": 0.9390624761581421, + "reward_std": 0.13921163976192474, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.18593750894069672, + "rewards/reward_coverage/std": 0.11800886690616608, + "rewards/reward_repetition/mean": 0.7531249523162842, + "rewards/reward_repetition/std": 0.1053621917963028, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7381278276443481, + "sampling/importance_sampling_ratio/min": 5.011335584784865e-14, + "sampling/sampling_logp_difference/max": 30.624488830566406, + "sampling/sampling_logp_difference/mean": 3.4408388137817383, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 39.890625, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.2272115428932011, + "epoch": 0.6774193548387096, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.6223781704902649, + "learning_rate": 2.123287671232877e-06, + "loss": -0.0014, + "num_tokens": 21753644.0, + "reward": 0.9937499761581421, + "reward_std": 0.16793787479400635, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.20625001192092896, + "rewards/reward_coverage/std": 0.15210169553756714, + "rewards/reward_repetition/mean": 0.7875000238418579, + "rewards/reward_repetition/std": 0.08637312799692154, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7413582801818848, + "sampling/importance_sampling_ratio/min": 8.111444921513807e-17, + "sampling/sampling_logp_difference/max": 37.0506706237793, + "sampling/sampling_logp_difference/mean": 3.607243776321411, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.953125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.2311963385436684, + "epoch": 0.6881720430107527, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.8859496116638184, + "learning_rate": 2.0547945205479454e-06, + "loss": 0.0046, + "num_tokens": 22144251.0, + "reward": 1.0171875953674316, + "reward_std": 0.15688931941986084, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.2421875, + "rewards/reward_coverage/std": 0.16407963633537292, + "rewards/reward_repetition/mean": 0.7749999761581421, + "rewards/reward_repetition/std": 0.09085135161876678, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7465033531188965, + "sampling/importance_sampling_ratio/min": 4.7572778301925226e-18, + "sampling/sampling_logp_difference/max": 39.88685607910156, + "sampling/sampling_logp_difference/mean": 3.552140951156616, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.953125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.2426956002600491, + "epoch": 0.6989247311827957, + "frac_reward_zero_std": 0.15625, + "grad_norm": 1.269731044769287, + "learning_rate": 1.9863013698630136e-06, + "loss": -0.0092, + "num_tokens": 22535042.0, + "reward": 0.9437500238418579, + "reward_std": 0.16793784499168396, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.16562500596046448, + "rewards/reward_coverage/std": 0.1382644772529602, + "rewards/reward_repetition/mean": 0.778124988079071, + "rewards/reward_repetition/std": 0.10759823769330978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.745509684085846, + "sampling/importance_sampling_ratio/min": 2.5291580594867795e-16, + "sampling/sampling_logp_difference/max": 35.913475036621094, + "sampling/sampling_logp_difference/mean": 3.580662488937378, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 39.90625, + "completions/mean_terminated_length": 40.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.25972409872338176, + "epoch": 0.7096774193548387, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.2360777854919434, + "learning_rate": 1.9178082191780823e-06, + "loss": -0.0021, + "num_tokens": 22925826.0, + "reward": 0.989062488079071, + "reward_std": 0.13921163976192474, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.20156249403953552, + "rewards/reward_coverage/std": 0.1578548550605774, + "rewards/reward_repetition/mean": 0.7875000238418579, + "rewards/reward_repetition/std": 0.07867958396673203, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.742576003074646, + "sampling/importance_sampling_ratio/min": 3.8009925881678924e-15, + "sampling/sampling_logp_difference/max": 33.203514099121094, + "sampling/sampling_logp_difference/mean": 3.5737202167510986, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.9375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.24233098467811942, + "epoch": 0.7204301075268817, + "frac_reward_zero_std": 0.21875, + "grad_norm": 1.0064667463302612, + "learning_rate": 1.8493150684931507e-06, + "loss": -0.0074, + "num_tokens": 23316620.0, + "reward": 0.9796874523162842, + "reward_std": 0.16130872070789337, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.18281251192092896, + "rewards/reward_coverage/std": 0.13280214369297028, + "rewards/reward_repetition/mean": 0.796875, + "rewards/reward_repetition/std": 0.11542708426713943, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7610887885093689, + "sampling/importance_sampling_ratio/min": 3.512002747491507e-17, + "sampling/sampling_logp_difference/max": 37.887760162353516, + "sampling/sampling_logp_difference/mean": 3.572136878967285, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.953125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.26319174305535853, + "epoch": 0.7311827956989247, + "frac_reward_zero_std": 0.125, + "grad_norm": 1.0560470819473267, + "learning_rate": 1.7808219178082193e-06, + "loss": -0.012, + "num_tokens": 23707121.0, + "reward": 0.9937500953674316, + "reward_std": 0.16351842880249023, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.21562500298023224, + "rewards/reward_coverage/std": 0.13359349966049194, + "rewards/reward_repetition/mean": 0.778124988079071, + "rewards/reward_repetition/std": 0.12404395639896393, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7442783713340759, + "sampling/importance_sampling_ratio/min": 1.375699243920652e-15, + "sampling/sampling_logp_difference/max": 34.21981430053711, + "sampling/sampling_logp_difference/mean": 3.5864293575286865, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 40.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 39.78125, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.27109498833306134, + "epoch": 0.7419354838709677, + "frac_reward_zero_std": 0.21875, + "grad_norm": 1.1270716190338135, + "learning_rate": 1.7123287671232877e-06, + "loss": -0.0065, + "num_tokens": 24097617.0, + "reward": 0.9968750476837158, + "reward_std": 0.1237436830997467, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.20937500894069672, + "rewards/reward_coverage/std": 0.1376892626285553, + "rewards/reward_repetition/mean": 0.7875000238418579, + "rewards/reward_repetition/std": 0.09343531727790833, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7473452687263489, + "sampling/importance_sampling_ratio/min": 1.4495503789956396e-16, + "sampling/sampling_logp_difference/max": 36.47010803222656, + "sampling/sampling_logp_difference/mean": 3.4904568195343018, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.90625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.26768106454983354, + "epoch": 0.7526881720430108, + "frac_reward_zero_std": 0.21875, + "grad_norm": 1.1696867942810059, + "learning_rate": 1.6438356164383561e-06, + "loss": -0.0057, + "num_tokens": 24488387.0, + "reward": 1.0328125953674316, + "reward_std": 0.13921163976192474, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.22031250596046448, + "rewards/reward_coverage/std": 0.13590343296527863, + "rewards/reward_repetition/mean": 0.8125, + "rewards/reward_repetition/std": 0.11198072135448456, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7499436736106873, + "sampling/importance_sampling_ratio/min": 1.4622693992618306e-15, + "sampling/sampling_logp_difference/max": 34.15878677368164, + "sampling/sampling_logp_difference/mean": 3.6728289127349854, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.953125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.24969360628165305, + "epoch": 0.7634408602150538, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.8783805966377258, + "learning_rate": 1.5753424657534248e-06, + "loss": -0.0031, + "num_tokens": 24879084.0, + "reward": 1.0343749523162842, + "reward_std": 0.1590990126132965, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.19062501192092896, + "rewards/reward_coverage/std": 0.13179922103881836, + "rewards/reward_repetition/mean": 0.8437500596046448, + "rewards/reward_repetition/std": 0.10965313017368317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7588081955909729, + "sampling/importance_sampling_ratio/min": 3.0506860601055286e-14, + "sampling/sampling_logp_difference/max": 31.120824813842773, + "sampling/sampling_logp_difference/mean": 3.584549903869629, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.796875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.2575332070700824, + "epoch": 0.7741935483870968, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.6952300071716309, + "learning_rate": 1.5068493150684932e-06, + "loss": -0.0028, + "num_tokens": 25269863.0, + "reward": 1.029687523841858, + "reward_std": 0.14363105595111847, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.1953125, + "rewards/reward_coverage/std": 0.1396477371454239, + "rewards/reward_repetition/mean": 0.8343750238418579, + "rewards/reward_repetition/std": 0.11014961451292038, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.767298698425293, + "sampling/importance_sampling_ratio/min": 1.6540065300593926e-15, + "sampling/sampling_logp_difference/max": 34.03557586669922, + "sampling/sampling_logp_difference/mean": 3.5412185192108154, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.796875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.27034957450814545, + "epoch": 0.7849462365591398, + "frac_reward_zero_std": 0.28125, + "grad_norm": 1.3305058479309082, + "learning_rate": 1.4383561643835616e-06, + "loss": -0.0086, + "num_tokens": 25660624.0, + "reward": 0.9765625, + "reward_std": 0.1303728073835373, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.19218750298023224, + "rewards/reward_coverage/std": 0.11724982410669327, + "rewards/reward_repetition/mean": 0.784375011920929, + "rewards/reward_repetition/std": 0.10269193351268768, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7455124258995056, + "sampling/importance_sampling_ratio/min": 4.9776697520764746e-14, + "sampling/sampling_logp_difference/max": 30.631229400634766, + "sampling/sampling_logp_difference/mean": 3.557706594467163, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 39.953125, + "completions/mean_terminated_length": 40.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.2672195213381201, + "epoch": 0.7956989247311828, + "frac_reward_zero_std": 0.09375, + "grad_norm": 1.368238925933838, + "learning_rate": 1.3698630136986302e-06, + "loss": -0.0091, + "num_tokens": 26051389.0, + "reward": 1.0171875953674316, + "reward_std": 0.15246988832950592, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.1953125, + "rewards/reward_coverage/std": 0.1361951231956482, + "rewards/reward_repetition/mean": 0.8218749761581421, + "rewards/reward_repetition/std": 0.10759823024272919, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7550181746482849, + "sampling/importance_sampling_ratio/min": 4.837896576403988e-13, + "sampling/sampling_logp_difference/max": 28.357126235961914, + "sampling/sampling_logp_difference/mean": 3.518388271331787, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.953125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.2512192933354527, + "epoch": 0.8064516129032258, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.8075931072235107, + "learning_rate": 1.3013698630136986e-06, + "loss": -0.0055, + "num_tokens": 26442164.0, + "reward": 0.9906250238418579, + "reward_std": 0.12816309928894043, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.16562500596046448, + "rewards/reward_coverage/std": 0.12626346945762634, + "rewards/reward_repetition/mean": 0.8250000476837158, + "rewards/reward_repetition/std": 0.09085134416818619, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7599254250526428, + "sampling/importance_sampling_ratio/min": 1.1269083539586222e-12, + "sampling/sampling_logp_difference/max": 27.51154327392578, + "sampling/sampling_logp_difference/mean": 3.5641071796417236, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.90625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.25094706076197326, + "epoch": 0.8172043010752689, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.8058338165283203, + "learning_rate": 1.2328767123287673e-06, + "loss": -0.008, + "num_tokens": 26832860.0, + "reward": 1.0093750953674316, + "reward_std": 0.1767766773700714, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.19374999403953552, + "rewards/reward_coverage/std": 0.16122055053710938, + "rewards/reward_repetition/mean": 0.815625011920929, + "rewards/reward_repetition/std": 0.10269193351268768, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7578780055046082, + "sampling/importance_sampling_ratio/min": 1.1848823085411635e-15, + "sampling/sampling_logp_difference/max": 34.36913299560547, + "sampling/sampling_logp_difference/mean": 3.489642858505249, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.90625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.26383460965007544, + "epoch": 0.8279569892473119, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.9045315384864807, + "learning_rate": 1.1643835616438357e-06, + "loss": -0.0039, + "num_tokens": 27223636.0, + "reward": 1.032812476158142, + "reward_std": 0.14363105595111847, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.20156249403953552, + "rewards/reward_coverage/std": 0.12407395988702774, + "rewards/reward_repetition/mean": 0.831250011920929, + "rewards/reward_repetition/std": 0.12456272542476654, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7593458890914917, + "sampling/importance_sampling_ratio/min": 1.2729455923859382e-14, + "sampling/sampling_logp_difference/max": 31.994857788085938, + "sampling/sampling_logp_difference/mean": 3.625725030899048, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.859375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.2579868610482663, + "epoch": 0.8387096774193549, + "frac_reward_zero_std": 0.09375, + "grad_norm": 1.0903428792953491, + "learning_rate": 1.095890410958904e-06, + "loss": -0.0077, + "num_tokens": 27614409.0, + "reward": 1.053125023841858, + "reward_std": 0.18119609355926514, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.22187501192092896, + "rewards/reward_coverage/std": 0.15682236850261688, + "rewards/reward_repetition/mean": 0.8312499523162842, + "rewards/reward_repetition/std": 0.12456272542476654, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7564480304718018, + "sampling/importance_sampling_ratio/min": 3.942305127637401e-14, + "sampling/sampling_logp_difference/max": 30.864425659179688, + "sampling/sampling_logp_difference/mean": 3.604686975479126, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.953125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.24791082250885665, + "epoch": 0.8494623655913979, + "frac_reward_zero_std": 0.09375, + "grad_norm": 1.090151071548462, + "learning_rate": 1.0273972602739727e-06, + "loss": -0.0028, + "num_tokens": 28005198.0, + "reward": 1.029687523841858, + "reward_std": 0.1480504870414734, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.19218750298023224, + "rewards/reward_coverage/std": 0.16837665438652039, + "rewards/reward_repetition/mean": 0.8375000357627869, + "rewards/reward_repetition/std": 0.106159508228302, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.771141767501831, + "sampling/importance_sampling_ratio/min": 3.6129458528665753e-14, + "sampling/sampling_logp_difference/max": 30.95166778564453, + "sampling/sampling_logp_difference/mean": 3.4720849990844727, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.859375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.25930464873090386, + "epoch": 0.8602150537634409, + "frac_reward_zero_std": 0.125, + "grad_norm": 1.083723545074463, + "learning_rate": 9.589041095890411e-07, + "loss": -0.0051, + "num_tokens": 28395507.0, + "reward": 1.046875, + "reward_std": 0.15026018023490906, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.23125000298023224, + "rewards/reward_coverage/std": 0.14015299081802368, + "rewards/reward_repetition/mean": 0.815625011920929, + "rewards/reward_repetition/std": 0.152459979057312, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7536813020706177, + "sampling/importance_sampling_ratio/min": 9.19962348487624e-14, + "sampling/sampling_logp_difference/max": 30.01702880859375, + "sampling/sampling_logp_difference/mean": 3.546005964279175, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 40.0, + "completions/mean_terminated_length": 40.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.24061511480249465, + "epoch": 0.8709677419354839, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.8584389686584473, + "learning_rate": 8.904109589041097e-07, + "loss": -0.0066, + "num_tokens": 28786289.0, + "reward": 1.0609374046325684, + "reward_std": 0.1303727924823761, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.2109375, + "rewards/reward_coverage/std": 0.15130877494812012, + "rewards/reward_repetition/mean": 0.8500000238418579, + "rewards/reward_repetition/std": 0.10690449178218842, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7686938643455505, + "sampling/importance_sampling_ratio/min": 2.277888706651854e-13, + "sampling/sampling_logp_difference/max": 29.1103572845459, + "sampling/sampling_logp_difference/mean": 3.516740560531616, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 39.84375, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.23985581938177347, + "epoch": 0.8817204301075269, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.9327372312545776, + "learning_rate": 8.219178082191781e-07, + "loss": -0.0095, + "num_tokens": 29177061.0, + "reward": 1.0640625953674316, + "reward_std": 0.15688930451869965, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.21718749403953552, + "rewards/reward_coverage/std": 0.13634072244167328, + "rewards/reward_repetition/mean": 0.846875011920929, + "rewards/reward_repetition/std": 0.12210942804813385, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7691041827201843, + "sampling/importance_sampling_ratio/min": 9.13759844699269e-13, + "sampling/sampling_logp_difference/max": 27.721208572387695, + "sampling/sampling_logp_difference/mean": 3.6279892921447754, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 39.609375, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.2540010770317167, + "epoch": 0.8924731182795699, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7506739497184753, + "learning_rate": 7.534246575342466e-07, + "loss": -0.0126, + "num_tokens": 29567814.0, + "reward": 1.0328125953674316, + "reward_std": 0.1834058165550232, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.20156249403953552, + "rewards/reward_coverage/std": 0.13857802748680115, + "rewards/reward_repetition/mean": 0.831250011920929, + "rewards/reward_repetition/std": 0.12456272542476654, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7523236274719238, + "sampling/importance_sampling_ratio/min": 5.4132771128059115e-14, + "sampling/sampling_logp_difference/max": 30.54733657836914, + "sampling/sampling_logp_difference/mean": 3.681413173675537, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.25169974751770496, + "epoch": 0.9032258064516129, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.8115216493606567, + "learning_rate": 6.849315068493151e-07, + "loss": -0.0139, + "num_tokens": 29958486.0, + "reward": 1.0359375476837158, + "reward_std": 0.13479222357273102, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.18593750894069672, + "rewards/reward_coverage/std": 0.14014413952827454, + "rewards/reward_repetition/mean": 0.8500000238418579, + "rewards/reward_repetition/std": 0.10690449178218842, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.755419909954071, + "sampling/importance_sampling_ratio/min": 5.566194003652804e-14, + "sampling/sampling_logp_difference/max": 30.519479751586914, + "sampling/sampling_logp_difference/mean": 3.5999581813812256, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.953125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.2387481287587434, + "epoch": 0.9139784946236559, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.7745798826217651, + "learning_rate": 6.164383561643836e-07, + "loss": -0.0112, + "num_tokens": 30349169.0, + "reward": 1.0515624284744263, + "reward_std": 0.1834058165550232, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.20156250894069672, + "rewards/reward_coverage/std": 0.15274441242218018, + "rewards/reward_repetition/mean": 0.8500000238418579, + "rewards/reward_repetition/std": 0.11818736046552658, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.76320481300354, + "sampling/importance_sampling_ratio/min": 1.7256138737983123e-13, + "sampling/sampling_logp_difference/max": 29.388023376464844, + "sampling/sampling_logp_difference/mean": 3.6501305103302, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.765625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.2610483162570745, + "epoch": 0.9247311827956989, + "frac_reward_zero_std": 0.21875, + "grad_norm": 1.2502846717834473, + "learning_rate": 5.47945205479452e-07, + "loss": -0.0119, + "num_tokens": 30739838.0, + "reward": 1.0515625476837158, + "reward_std": 0.12595339119434357, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.20781250298023224, + "rewards/reward_coverage/std": 0.12885908782482147, + "rewards/reward_repetition/mean": 0.84375, + "rewards/reward_repetition/std": 0.11529809236526489, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7513002753257751, + "sampling/importance_sampling_ratio/min": 2.8611465139379705e-14, + "sampling/sampling_logp_difference/max": 31.184968948364258, + "sampling/sampling_logp_difference/mean": 3.6847875118255615, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 40.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.23906523222103715, + "epoch": 0.9354838709677419, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.7462826371192932, + "learning_rate": 4.794520547945206e-07, + "loss": -0.0061, + "num_tokens": 31130530.0, + "reward": 1.045312523841858, + "reward_std": 0.1303728073835373, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.20156250894069672, + "rewards/reward_coverage/std": 0.13391800224781036, + "rewards/reward_repetition/mean": 0.84375, + "rewards/reward_repetition/std": 0.10965313017368317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7613617777824402, + "sampling/importance_sampling_ratio/min": 9.466830531296155e-13, + "sampling/sampling_logp_difference/max": 27.68581199645996, + "sampling/sampling_logp_difference/mean": 3.6363892555236816, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.953125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.2408477501012385, + "epoch": 0.946236559139785, + "frac_reward_zero_std": 0.125, + "grad_norm": 1.0979928970336914, + "learning_rate": 4.1095890410958903e-07, + "loss": -0.0084, + "num_tokens": 31521313.0, + "reward": 1.0375001430511475, + "reward_std": 0.1590990126132965, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.19062499701976776, + "rewards/reward_coverage/std": 0.14662203192710876, + "rewards/reward_repetition/mean": 0.846875011920929, + "rewards/reward_repetition/std": 0.09915315359830856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7597459554672241, + "sampling/importance_sampling_ratio/min": 1.093270764716825e-12, + "sampling/sampling_logp_difference/max": 27.541847229003906, + "sampling/sampling_logp_difference/mean": 3.6034116744995117, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 40.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 39.859375, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.254688891582191, + "epoch": 0.956989247311828, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.8742080926895142, + "learning_rate": 3.4246575342465755e-07, + "loss": -0.0096, + "num_tokens": 31911900.0, + "reward": 1.0859375, + "reward_std": 0.183405801653862, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.2515625059604645, + "rewards/reward_coverage/std": 0.16522441804409027, + "rewards/reward_repetition/mean": 0.8343749642372131, + "rewards/reward_repetition/std": 0.13119566440582275, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7481080889701843, + "sampling/importance_sampling_ratio/min": 1.0778268598939867e-16, + "sampling/sampling_logp_difference/max": 36.766414642333984, + "sampling/sampling_logp_difference/mean": 3.735830307006836, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.953125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.24119682773016393, + "epoch": 0.967741935483871, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.9537080526351929, + "learning_rate": 2.73972602739726e-07, + "loss": -0.0102, + "num_tokens": 32302677.0, + "reward": 1.078125, + "reward_std": 0.15026018023490906, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.22812499105930328, + "rewards/reward_coverage/std": 0.15783129632472992, + "rewards/reward_repetition/mean": 0.8500000238418579, + "rewards/reward_repetition/std": 0.11268723756074905, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7631887793540955, + "sampling/importance_sampling_ratio/min": 4.8462983300891216e-14, + "sampling/sampling_logp_difference/max": 30.657976150512695, + "sampling/sampling_logp_difference/mean": 3.6851108074188232, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 40.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.24475648440420628, + "epoch": 0.978494623655914, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.890794038772583, + "learning_rate": 2.0547945205479452e-07, + "loss": -0.004, + "num_tokens": 32693369.0, + "reward": 1.0171875953674316, + "reward_std": 0.14363105595111847, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.17343750596046448, + "rewards/reward_coverage/std": 0.13362134993076324, + "rewards/reward_repetition/mean": 0.84375, + "rewards/reward_repetition/std": 0.11529809236526489, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7631818652153015, + "sampling/importance_sampling_ratio/min": 3.372482685910089e-14, + "sampling/sampling_logp_difference/max": 31.02054214477539, + "sampling/sampling_logp_difference/mean": 3.65973162651062, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 40.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.2510380311869085, + "epoch": 0.989247311827957, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.9964897632598877, + "learning_rate": 1.36986301369863e-07, + "loss": -0.0126, + "num_tokens": 33084143.0, + "reward": 1.0359375476837158, + "reward_std": 0.17898640036582947, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.1953125, + "rewards/reward_coverage/std": 0.1385064274072647, + "rewards/reward_repetition/mean": 0.8406250476837158, + "rewards/reward_repetition/std": 0.134186252951622, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.753570556640625, + "sampling/importance_sampling_ratio/min": 6.38058653628466e-15, + "sampling/sampling_logp_difference/max": 32.685516357421875, + "sampling/sampling_logp_difference/mean": 3.6861093044281006, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 39.953125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.2551991257350892, + "epoch": 1.0, + "frac_reward_zero_std": 0.15625, + "grad_norm": 1.1012099981307983, + "learning_rate": 6.84931506849315e-08, + "loss": -0.0126, + "num_tokens": 33474818.0, + "reward": 1.0421874523162842, + "reward_std": 0.15246988832950592, + "rewards/reward_correct/mean": 0.0, + "rewards/reward_correct/std": 0.0, + "rewards/reward_coverage/mean": 0.20156250894069672, + "rewards/reward_coverage/std": 0.1290898472070694, + "rewards/reward_repetition/mean": 0.8406250476837158, + "rewards/reward_repetition/std": 0.1293681114912033, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.7444443702697754, + "sampling/importance_sampling_ratio/min": 2.801043902031508e-13, + "sampling/sampling_logp_difference/max": 28.903614044189453, + "sampling/sampling_logp_difference/mean": 3.683718204498291, + "step": 93 + } + ], + "logging_steps": 1, + "max_steps": 93, + "num_input_tokens_seen": 33474818, + "num_train_epochs": 1, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..1dc6da4 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:610585f0bea9b9f46d1c3b03122cfc2ea633bf3eff44378e90f590fd111f9ed8 +size 7313 diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..6c49fc6 --- /dev/null +++ b/vocab.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910 +size 2776833