commit d9a065aa11b380ee8cc7749111ae3b16adcd4189 Author: ModelHub XC Date: Sat May 2 05:32:12 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: Laksh718/daedalus-designer-v2 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..2816fbd --- /dev/null +++ b/README.md @@ -0,0 +1,21 @@ +--- +base_model: unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit +tags: +- text-generation-inference +- transformers +- unsloth +- qwen2 +license: apache-2.0 +language: +- en +--- + +# Uploaded finetuned model + +- **Developed by:** Laksh718 +- **License:** apache-2.0 +- **Finetuned from model :** unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit + +This qwen2 model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library. + +[](https://github.com/unslothai/unsloth) diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..bdf7919 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/config.json b/config.json new file mode 100644 index 0000000..e7111d4 --- /dev/null +++ b/config.json @@ -0,0 +1,62 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": null, + "torch_dtype": "bfloat16", + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "pad_token_id": 151665, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000.0, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "unsloth_fixed": true, + "unsloth_version": "2026.4.8", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..18e5edb --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0958751a7987a6a846d2b916645f449f163d556c5479e613f8702f6f917bea46 +size 3087467144 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..9fa63fd --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5891a15588546db1ac7f2baf8fa94835a51a85c032c39793a55bb048b47446 +size 11422523 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..544df20 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,202 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "left", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151657": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151658": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151665": { + "content": "<|PAD_TOKEN|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + }, + "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n" +} \ No newline at end of file diff --git a/training_history.json b/training_history.json new file mode 100644 index 0000000..5f2d092 --- /dev/null +++ b/training_history.json @@ -0,0 +1,2313 @@ +{ + "phase": "sft+grpo", + "sft_history": [ + { + "loss": 1.8329996109008788, + "grad_norm": 2.6284756660461426, + "learning_rate": 2.6666666666666667e-05, + "epoch": 0.16666666666666666, + "step": 5 + }, + { + "loss": 1.641743278503418, + "grad_norm": 0.9074174761772156, + "learning_rate": 6e-05, + "epoch": 0.3333333333333333, + "step": 10 + }, + { + "loss": 1.3325251579284667, + "grad_norm": 0.772527277469635, + "learning_rate": 9.333333333333334e-05, + "epoch": 0.5, + "step": 15 + }, + { + "loss": 0.908332347869873, + "grad_norm": 0.8558230400085449, + "learning_rate": 0.00012666666666666666, + "epoch": 0.6666666666666666, + "step": 20 + }, + { + "loss": 0.4191232204437256, + "grad_norm": 0.6383947134017944, + "learning_rate": 0.00016, + "epoch": 0.8333333333333334, + "step": 25 + }, + { + "loss": 0.20252063274383544, + "grad_norm": 0.24536560475826263, + "learning_rate": 0.00019333333333333333, + "epoch": 1.0, + "step": 30 + }, + { + "loss": 0.1843562602996826, + "grad_norm": 0.1841956526041031, + "learning_rate": 0.0001913545457642601, + "epoch": 1.1666666666666667, + "step": 35 + }, + { + "loss": 0.1743373155593872, + "grad_norm": 0.12225674837827682, + "learning_rate": 0.00015877852522924732, + "epoch": 1.3333333333333333, + "step": 40 + }, + { + "loss": 0.1707882285118103, + "grad_norm": 0.11675203591585159, + "learning_rate": 0.00011045284632676536, + "epoch": 1.5, + "step": 45 + }, + { + "loss": 0.17305984497070312, + "grad_norm": 0.168966606259346, + "learning_rate": 5.9326335692419995e-05, + "epoch": 1.6666666666666665, + "step": 50 + }, + { + "loss": 0.1723298192024231, + "grad_norm": 0.14092567563056946, + "learning_rate": 1.9098300562505266e-05, + "epoch": 1.8333333333333335, + "step": 55 + }, + { + "loss": 0.16860610246658325, + "grad_norm": 0.13329552114009857, + "learning_rate": 5.478104631726711e-07, + "epoch": 2.0, + "step": 60 + }, + { + "train_runtime": 1079.0765, + "train_samples_per_second": 1.112, + "train_steps_per_second": 0.056, + "total_flos": 5520149869086720.0, + "train_loss": 0.6150601516167323, + "epoch": 2.0, + "step": 60 + } + ], + "grpo_history": [ + { + "loss": 0.00047351792454719543, + "grad_norm": 0.72265625, + "learning_rate": 8.333333333333333e-07, + "num_tokens": 13592.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.4375, + "rewards/reward_format/std": 0.3535533770918846, + "rewards/reward_welfare/mean": 0.0625, + "rewards/reward_welfare/std": 0.1767766922712326, + "rewards/reward_fairness/mean": 0.03318497911095619, + "rewards/reward_fairness/std": 0.09386129677295685, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.02344914712011814, + "rewards/reward_composite/std": 0.06632420420646667, + "reward": 0.6816341280937195, + "reward_std": 0.48826825618743896, + "frac_reward_zero_std": 0.5, + "completion_length": 400.0, + "kl": 0.47351907938718796, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.016666666666666666, + "step": 2 + }, + { + "loss": 0.00041250139474868774, + "grad_norm": 0.68359375, + "learning_rate": 2.5e-06, + "num_tokens": 26592.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.33522727340459824, + "rewards/reward_format/std": 0.3089452385902405, + "rewards/reward_welfare/mean": 0.125, + "rewards/reward_welfare/std": 0.2314550280570984, + "rewards/reward_fairness/mean": 0.037382133305072784, + "rewards/reward_fairness/std": 0.07023922353982925, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.03359023481607437, + "rewards/reward_composite/std": 0.06231452897191048, + "reward": 0.8607450723648071, + "reward_std": 0.4173068106174469, + "frac_reward_zero_std": 0.75, + "completion_length": 400.0, + "kl": 0.4125128909945488, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.03333333333333333, + "step": 4 + }, + { + "loss": 0.0003954425919800997, + "grad_norm": 0.00665283203125, + "learning_rate": 4.166666666666667e-06, + "num_tokens": 39888.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.53125, + "rewards/reward_format/std": 0.0883883461356163, + "rewards/reward_welfare/mean": 0.0, + "rewards/reward_welfare/std": 0.0, + "rewards/reward_fairness/mean": 0.0, + "rewards/reward_fairness/std": 0.0, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.0, + "rewards/reward_composite/std": 0.0, + "reward": 0.46875, + "reward_std": 0.0625, + "frac_reward_zero_std": 0.75, + "completion_length": 400.0, + "kl": 0.3954422175884247, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.05, + "step": 6 + }, + { + "loss": 0.00040989843546412885, + "grad_norm": 0.013671875, + "learning_rate": 4.995770395678171e-06, + "num_tokens": 53776.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.5, + "rewards/reward_format/std": 0.0, + "rewards/reward_welfare/mean": 0.0, + "rewards/reward_welfare/std": 0.0, + "rewards/reward_fairness/mean": 0.0, + "rewards/reward_fairness/std": 0.0, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.0, + "rewards/reward_composite/std": 0.0, + "reward": 0.5, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 400.0, + "kl": 0.40989840030670166, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.06666666666666667, + "step": 8 + }, + { + "loss": 0.00042488425970077515, + "grad_norm": 0.4921875, + "learning_rate": 4.962019382530521e-06, + "num_tokens": 67664.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.46875, + "rewards/reward_format/std": 0.2651650384068489, + "rewards/reward_welfare/mean": 0.0625, + "rewards/reward_welfare/std": 0.1767766922712326, + "rewards/reward_fairness/mean": 0.02351469174027443, + "rewards/reward_fairness/std": 0.06650959700345993, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.023048987612128258, + "rewards/reward_composite/std": 0.06519238650798798, + "reward": 0.6403136849403381, + "reward_std": 0.40562736988067627, + "frac_reward_zero_std": 0.5, + "completion_length": 400.0, + "kl": 0.42487896233797073, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.08333333333333333, + "step": 10 + }, + { + "loss": 0.0003492364485282451, + "grad_norm": 0.703125, + "learning_rate": 4.894973780788722e-06, + "num_tokens": 81552.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.34659090638160706, + "rewards/reward_format/std": 0.6987431943416595, + "rewards/reward_welfare/mean": 0.1875, + "rewards/reward_welfare/std": 0.408231720328331, + "rewards/reward_fairness/mean": 0.060186946764588356, + "rewards/reward_fairness/std": 0.13657810539007187, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.05276940576732159, + "rewards/reward_composite/std": 0.11823124438524246, + "reward": 0.9538654386997223, + "reward_std": 1.2448847889900208, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.34924405813217163, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1, + "step": 12 + }, + { + "loss": 0.00039254588773474097, + "grad_norm": 0.7890625, + "learning_rate": 4.7955402672006855e-06, + "num_tokens": 95440.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.2755681872367859, + "rewards/reward_format/std": 0.5705045461654663, + "rewards/reward_welfare/mean": 0.1875, + "rewards/reward_welfare/std": 0.408231720328331, + "rewards/reward_fairness/mean": 0.05876787751913071, + "rewards/reward_fairness/std": 0.13999952003359795, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.051001692190766335, + "rewards/reward_composite/std": 0.12243235111236572, + "reward": 1.0217013657093048, + "reward_std": 1.1684027314186096, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.3925560265779495, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.11666666666666667, + "step": 14 + }, + { + "loss": 0.00040383817395195365, + "grad_norm": 0.86328125, + "learning_rate": 4.665063509461098e-06, + "num_tokens": 108736.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.16477272659540176, + "rewards/reward_format/std": 0.6252594292163849, + "rewards/reward_welfare/mean": 0.3125, + "rewards/reward_welfare/std": 0.49022963643074036, + "rewards/reward_fairness/mean": 0.10286042466759682, + "rewards/reward_fairness/std": 0.16851608455181122, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.09322065114974976, + "rewards/reward_composite/std": 0.1486019790172577, + "reward": 1.343808352947235, + "reward_std": 1.4776567816734314, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.40385157614946365, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.13333333333333333, + "step": 16 + }, + { + "loss": 0.0004179440438747406, + "grad_norm": 0.65234375, + "learning_rate": 4.50530798188761e-06, + "num_tokens": 122624.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.34375, + "rewards/reward_format/std": 0.5564062297344208, + "rewards/reward_welfare/mean": 0.125, + "rewards/reward_welfare/std": 0.3535533845424652, + "rewards/reward_fairness/mean": 0.04079132154583931, + "rewards/reward_fairness/std": 0.11537527851760387, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.035708064679056406, + "rewards/reward_composite/std": 0.10099766962230206, + "reward": 0.8577493727207184, + "reward_std": 0.8404987752437592, + "frac_reward_zero_std": 0.25, + "completion_length": 400.0, + "kl": 0.41793932020664215, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.15, + "step": 18 + }, + { + "loss": 0.00039753690361976624, + "grad_norm": 0.75, + "learning_rate": 4.318434103932622e-06, + "num_tokens": 136512.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.35795454680919647, + "rewards/reward_format/std": 0.6613655686378479, + "rewards/reward_welfare/mean": 0.1875, + "rewards/reward_welfare/std": 0.408231720328331, + "rewards/reward_fairness/mean": 0.05811220221221447, + "rewards/reward_fairness/std": 0.1433359570801258, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.04960842803120613, + "rewards/reward_composite/std": 0.11969681829214096, + "reward": 0.9372660517692566, + "reward_std": 0.9138101935386658, + "frac_reward_zero_std": 0.25, + "completion_length": 400.0, + "kl": 0.39755555987358093, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.16666666666666666, + "step": 20 + }, + { + "loss": 0.0003871597582474351, + "grad_norm": 0.005157470703125, + "learning_rate": 4.106969024216348e-06, + "num_tokens": 150104.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.21875, + "rewards/reward_format/std": 0.38816189765930176, + "rewards/reward_welfare/mean": 0.1875, + "rewards/reward_welfare/std": 0.25877460837364197, + "rewards/reward_fairness/mean": 0.09772966802120209, + "rewards/reward_fairness/std": 0.1411271095275879, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.07592316716909409, + "rewards/reward_composite/std": 0.10568810254335403, + "reward": 1.1424028873443604, + "reward_std": 0.9167249202728271, + "frac_reward_zero_std": 0.5, + "completion_length": 400.0, + "kl": 0.3871647119522095, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.18333333333333332, + "step": 22 + }, + { + "loss": 0.00046034157276153564, + "grad_norm": 0.828125, + "learning_rate": 3.8737724451770155e-06, + "num_tokens": 163992.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.053977273404598236, + "rewards/reward_format/std": 0.6971071362495422, + "rewards/reward_welfare/mean": 0.3125, + "rewards/reward_welfare/std": 0.49022963643074036, + "rewards/reward_fairness/mean": 0.11436978727579117, + "rewards/reward_fairness/std": 0.21275469660758972, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.08883418142795563, + "rewards/reward_composite/std": 0.15858761221170425, + "reward": 1.4617266654968262, + "reward_std": 1.6014615297317505, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.46036188304424286, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2, + "step": 24 + }, + { + "loss": 0.00037954188883304596, + "grad_norm": 0.75390625, + "learning_rate": 3.621997950501156e-06, + "num_tokens": 177584.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.11079545319080353, + "rewards/reward_format/std": 0.7605703175067902, + "rewards/reward_welfare/mean": 0.3125, + "rewards/reward_welfare/std": 0.49022963643074036, + "rewards/reward_fairness/mean": 0.13740837946534157, + "rewards/reward_fairness/std": 0.23384775966405869, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.1134110763669014, + "rewards/reward_composite/std": 0.1934959888458252, + "reward": 1.4525240659713745, + "reward_std": 1.4633366465568542, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.3795487657189369, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.21666666666666667, + "step": 26 + }, + { + "loss": 0.00034568458795547485, + "grad_norm": 0.67578125, + "learning_rate": 3.3550503583141726e-06, + "num_tokens": 190880.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.014204561710357666, + "rewards/reward_format/std": 0.45640653371810913, + "rewards/reward_welfare/mean": 0.375, + "rewards/reward_welfare/std": 0.2314550280570984, + "rewards/reward_fairness/mean": 0.17191734910011292, + "rewards/reward_fairness/std": 0.165505051612854, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.12545911967754364, + "rewards/reward_composite/std": 0.09392639249563217, + "reward": 1.6581718921661377, + "reward_std": 0.9023097902536392, + "frac_reward_zero_std": 0.25, + "completion_length": 400.0, + "kl": 0.3456726223230362, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.23333333333333334, + "step": 28 + }, + { + "loss": 0.0004424452781677246, + "grad_norm": 0.79296875, + "learning_rate": 3.0765396768561005e-06, + "num_tokens": 204472.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.05681818723678589, + "rewards/reward_format/std": 0.8016891181468964, + "rewards/reward_welfare/mean": 0.375, + "rewards/reward_welfare/std": 0.49871626496315, + "rewards/reward_fairness/mean": 0.11990131065249443, + "rewards/reward_fairness/std": 0.19920051097869873, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.10109234228730202, + "rewards/reward_composite/std": 0.15720761567354202, + "reward": 1.5391755104064941, + "reward_std": 1.311523675918579, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.44244876503944397, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.25, + "step": 30 + }, + { + "loss": 0.00044108927249908447, + "grad_norm": 0.79296875, + "learning_rate": 2.7902322853130758e-06, + "num_tokens": 218360.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.23011364042758942, + "rewards/reward_format/std": 0.7126934230327606, + "rewards/reward_welfare/mean": 0.25, + "rewards/reward_welfare/std": 0.4355513006448746, + "rewards/reward_fairness/mean": 0.09009831957519054, + "rewards/reward_fairness/std": 0.17519650608301163, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.07756261341273785, + "rewards/reward_composite/std": 0.1447325348854065, + "reward": 1.1875473260879517, + "reward_std": 1.3125466108322144, + "frac_reward_zero_std": 0.25, + "completion_length": 400.0, + "kl": 0.4411006420850754, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.26666666666666666, + "step": 32 + }, + { + "loss": 0.00043725594878196716, + "grad_norm": 0.765625, + "learning_rate": 2.5e-06, + "num_tokens": 232248.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": 0.3238636404275894, + "rewards/reward_format/std": 0.7694187164306641, + "rewards/reward_welfare/mean": 0.5625, + "rewards/reward_welfare/std": 0.5260358452796936, + "rewards/reward_fairness/mean": 0.19406583905220032, + "rewards/reward_fairness/std": 0.20889797061681747, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.1601117104291916, + "rewards/reward_composite/std": 0.16504594683647156, + "reward": 2.2405412197113037, + "reward_std": 1.72732412815094, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.43723437190055847, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2833333333333333, + "step": 34 + }, + { + "loss": 0.0003281831741333008, + "grad_norm": 0.66015625, + "learning_rate": 2.2097677146869242e-06, + "num_tokens": 245840.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.0625, + "rewards/reward_format/std": 0.8398386240005493, + "rewards/reward_welfare/mean": 0.375, + "rewards/reward_welfare/std": 0.5175492167472839, + "rewards/reward_fairness/mean": 0.208244688808918, + "rewards/reward_fairness/std": 0.3558191955089569, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.12120818346738815, + "rewards/reward_composite/std": 0.17605619877576828, + "reward": 1.6419528722763062, + "reward_std": 1.9059234857559204, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.32817772775888443, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.3, + "step": 36 + }, + { + "loss": 0.0003703221445903182, + "grad_norm": 0.65625, + "learning_rate": 1.9234603231439e-06, + "num_tokens": 258840.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.34375, + "rewards/reward_format/std": 0.4355708882212639, + "rewards/reward_welfare/mean": 0.125, + "rewards/reward_welfare/std": 0.2314550280570984, + "rewards/reward_fairness/mean": 0.032129574567079544, + "rewards/reward_fairness/std": 0.06755802780389786, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.0333767905831337, + "rewards/reward_composite/std": 0.07086637616157532, + "reward": 0.8467563986778259, + "reward_std": 0.8185127973556519, + "frac_reward_zero_std": 0.25, + "completion_length": 400.0, + "kl": 0.3703107312321663, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.31666666666666665, + "step": 38 + }, + { + "loss": 0.0003897678107023239, + "grad_norm": 0.76171875, + "learning_rate": 1.6449496416858285e-06, + "num_tokens": 272136.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": 0.07102272659540176, + "rewards/reward_format/std": 0.8448813557624817, + "rewards/reward_welfare/mean": 0.4375, + "rewards/reward_welfare/std": 0.5260358452796936, + "rewards/reward_fairness/mean": 0.13828522339463234, + "rewards/reward_fairness/std": 0.19835777580738068, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.11603889241814613, + "rewards/reward_composite/std": 0.15580761432647705, + "reward": 1.762846827507019, + "reward_std": 1.750555157661438, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.38978311419487, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.3333333333333333, + "step": 40 + }, + { + "loss": 0.00036280229687690735, + "grad_norm": 0.52734375, + "learning_rate": 1.3780020494988447e-06, + "num_tokens": 286024.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.21875, + "rewards/reward_format/std": 0.7038120031356812, + "rewards/reward_welfare/mean": 0.25, + "rewards/reward_welfare/std": 0.4629100561141968, + "rewards/reward_fairness/mean": 0.08049380034208298, + "rewards/reward_fairness/std": 0.14985806494951248, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.06840312853455544, + "rewards/reward_composite/std": 0.12792598456144333, + "reward": 1.1801469326019287, + "reward_std": 1.2528201341629028, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.36278442293405533, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.35, + "step": 42 + }, + { + "loss": 0.00041984766721725464, + "grad_norm": 0.734375, + "learning_rate": 1.1262275548229852e-06, + "num_tokens": 299320.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.125, + "rewards/reward_format/std": 0.7535476386547089, + "rewards/reward_welfare/mean": 0.3125, + "rewards/reward_welfare/std": 0.44403792917728424, + "rewards/reward_fairness/mean": 0.08010485023260117, + "rewards/reward_fairness/std": 0.13119615614414215, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.06627386063337326, + "rewards/reward_composite/std": 0.0994122326374054, + "reward": 1.333878755569458, + "reward_std": 1.2175767719745636, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.4198339805006981, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.36666666666666664, + "step": 44 + }, + { + "loss": 0.00037025846540927887, + "grad_norm": 0.74609375, + "learning_rate": 8.930309757836517e-07, + "num_tokens": 312912.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.15625, + "rewards/reward_format/std": 0.7469770908355713, + "rewards/reward_welfare/mean": 0.3125, + "rewards/reward_welfare/std": 0.44403792917728424, + "rewards/reward_fairness/mean": 0.0987030416727066, + "rewards/reward_fairness/std": 0.13824082165956497, + "rewards/reward_stability/mean": 0.9375, + "rewards/reward_stability/std": 0.1767766922712326, + "rewards/reward_composite/mean": 0.0686455499380827, + "rewards/reward_composite/std": 0.11963466554880142, + "reward": 1.2610985934734344, + "reward_std": 1.1380138397216797, + "frac_reward_zero_std": 0.25, + "completion_length": 400.0, + "kl": 0.3702595606446266, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.38333333333333336, + "step": 46 + }, + { + "loss": 0.00037697795778512955, + "grad_norm": 0.68359375, + "learning_rate": 6.815658960673782e-07, + "num_tokens": 326208.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": 0.07954545319080353, + "rewards/reward_format/std": 0.7752929627895355, + "rewards/reward_welfare/mean": 0.4375, + "rewards/reward_welfare/std": 0.49022963643074036, + "rewards/reward_fairness/mean": 0.1642819568514824, + "rewards/reward_fairness/std": 0.2405308187007904, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.12362180650234222, + "rewards/reward_composite/std": 0.16839426010847092, + "reward": 1.8049492835998535, + "reward_std": 1.6678152084350586, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.3769652917981148, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.4, + "step": 48 + }, + { + "loss": 0.00039646029472351074, + "grad_norm": 0.6953125, + "learning_rate": 4.946920181123904e-07, + "num_tokens": 339800.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.06818181276321411, + "rewards/reward_format/std": 0.7570639848709106, + "rewards/reward_welfare/mean": 0.3125, + "rewards/reward_welfare/std": 0.49022963643074036, + "rewards/reward_fairness/mean": 0.12952633947134018, + "rewards/reward_fairness/std": 0.2085839882493019, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.10791970416903496, + "rewards/reward_composite/std": 0.1719568744301796, + "reward": 1.4817642569541931, + "reward_std": 1.4108701944351196, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.39645931124687195, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.4166666666666667, + "step": 50 + }, + { + "loss": 0.000441722571849823, + "grad_norm": 0.7265625, + "learning_rate": 3.3493649053890325e-07, + "num_tokens": 353688.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": 0.03125, + "rewards/reward_format/std": 0.8107390403747559, + "rewards/reward_welfare/mean": 0.375, + "rewards/reward_welfare/std": 0.5175492167472839, + "rewards/reward_fairness/mean": 0.10219378396868706, + "rewards/reward_fairness/std": 0.1520508974790573, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.09825712814927101, + "rewards/reward_composite/std": 0.1404884159564972, + "reward": 1.6067009568214417, + "reward_std": 1.2484731674194336, + "frac_reward_zero_std": 0.25, + "completion_length": 400.0, + "kl": 0.44170165807008743, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.43333333333333335, + "step": 52 + }, + { + "loss": 0.00037848297506570816, + "grad_norm": 0.796875, + "learning_rate": 2.044597327993153e-07, + "num_tokens": 367280.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": 0.23579545319080353, + "rewards/reward_format/std": 0.7612137198448181, + "rewards/reward_welfare/mean": 0.5625, + "rewards/reward_welfare/std": 0.49022963643074036, + "rewards/reward_fairness/mean": 0.2322249710559845, + "rewards/reward_fairness/std": 0.20398348569869995, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.19660773873329163, + "rewards/reward_composite/std": 0.17612425237894058, + "reward": 2.2271281480789185, + "reward_std": 1.4199119210243225, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.3786723464727402, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.45, + "step": 54 + }, + { + "loss": 0.00040780752897262573, + "grad_norm": 0.65234375, + "learning_rate": 1.0502621921127776e-07, + "num_tokens": 380872.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.0625, + "rewards/reward_format/std": 0.7646470665931702, + "rewards/reward_welfare/mean": 0.3125, + "rewards/reward_welfare/std": 0.49022963643074036, + "rewards/reward_fairness/mean": 0.09930047020316124, + "rewards/reward_fairness/std": 0.18247877806425095, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.08683200180530548, + "rewards/reward_composite/std": 0.15039421617984772, + "reward": 1.4361324906349182, + "reward_std": 1.6369856595993042, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.4077882617712021, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.4666666666666667, + "step": 56 + }, + { + "loss": 0.00038760900497436523, + "grad_norm": 0.74609375, + "learning_rate": 3.798061746947995e-08, + "num_tokens": 394168.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.002840910106897354, + "rewards/reward_format/std": 0.8362354636192322, + "rewards/reward_welfare/mean": 0.375, + "rewards/reward_welfare/std": 0.5175492167472839, + "rewards/reward_fairness/mean": 0.11489119380712509, + "rewards/reward_fairness/std": 0.17957812547683716, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.11398349702358246, + "rewards/reward_composite/std": 0.17128486931324005, + "reward": 1.6010336875915527, + "reward_std": 1.725760817527771, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.3876567706465721, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.48333333333333334, + "step": 58 + }, + { + "loss": 0.00037222355604171753, + "grad_norm": 0.703125, + "learning_rate": 4.229604321829561e-09, + "num_tokens": 407760.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.46875, + "rewards/reward_format/std": 0.6302918791770935, + "rewards/reward_welfare/mean": 0.125, + "rewards/reward_welfare/std": 0.3535533845424652, + "rewards/reward_fairness/mean": 0.015625, + "rewards/reward_fairness/std": 0.04419417306780815, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.018206155858933926, + "rewards/reward_composite/std": 0.05149478651583195, + "reward": 0.6900811493396759, + "reward_std": 0.873493492603302, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.3722131997346878, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.5, + "step": 60 + }, + { + "train_runtime": 1880.1016, + "train_samples_per_second": 0.255, + "train_steps_per_second": 0.032, + "total_flos": 0.0, + "train_loss": 0.0003991109939912955, + "epoch": 0.5, + "step": 60 + } + ], + "history": [ + { + "loss": 1.8329996109008788, + "grad_norm": 2.6284756660461426, + "learning_rate": 2.6666666666666667e-05, + "epoch": 0.16666666666666666, + "step": 5 + }, + { + "loss": 1.641743278503418, + "grad_norm": 0.9074174761772156, + "learning_rate": 6e-05, + "epoch": 0.3333333333333333, + "step": 10 + }, + { + "loss": 1.3325251579284667, + "grad_norm": 0.772527277469635, + "learning_rate": 9.333333333333334e-05, + "epoch": 0.5, + "step": 15 + }, + { + "loss": 0.908332347869873, + "grad_norm": 0.8558230400085449, + "learning_rate": 0.00012666666666666666, + "epoch": 0.6666666666666666, + "step": 20 + }, + { + "loss": 0.4191232204437256, + "grad_norm": 0.6383947134017944, + "learning_rate": 0.00016, + "epoch": 0.8333333333333334, + "step": 25 + }, + { + "loss": 0.20252063274383544, + "grad_norm": 0.24536560475826263, + "learning_rate": 0.00019333333333333333, + "epoch": 1.0, + "step": 30 + }, + { + "loss": 0.1843562602996826, + "grad_norm": 0.1841956526041031, + "learning_rate": 0.0001913545457642601, + "epoch": 1.1666666666666667, + "step": 35 + }, + { + "loss": 0.1743373155593872, + "grad_norm": 0.12225674837827682, + "learning_rate": 0.00015877852522924732, + "epoch": 1.3333333333333333, + "step": 40 + }, + { + "loss": 0.1707882285118103, + "grad_norm": 0.11675203591585159, + "learning_rate": 0.00011045284632676536, + "epoch": 1.5, + "step": 45 + }, + { + "loss": 0.17305984497070312, + "grad_norm": 0.168966606259346, + "learning_rate": 5.9326335692419995e-05, + "epoch": 1.6666666666666665, + "step": 50 + }, + { + "loss": 0.1723298192024231, + "grad_norm": 0.14092567563056946, + "learning_rate": 1.9098300562505266e-05, + "epoch": 1.8333333333333335, + "step": 55 + }, + { + "loss": 0.16860610246658325, + "grad_norm": 0.13329552114009857, + "learning_rate": 5.478104631726711e-07, + "epoch": 2.0, + "step": 60 + }, + { + "train_runtime": 1079.0765, + "train_samples_per_second": 1.112, + "train_steps_per_second": 0.056, + "total_flos": 5520149869086720.0, + "train_loss": 0.6150601516167323, + "epoch": 2.0, + "step": 60 + }, + { + "loss": 0.00047351792454719543, + "grad_norm": 0.72265625, + "learning_rate": 8.333333333333333e-07, + "num_tokens": 13592.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.4375, + "rewards/reward_format/std": 0.3535533770918846, + "rewards/reward_welfare/mean": 0.0625, + "rewards/reward_welfare/std": 0.1767766922712326, + "rewards/reward_fairness/mean": 0.03318497911095619, + "rewards/reward_fairness/std": 0.09386129677295685, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.02344914712011814, + "rewards/reward_composite/std": 0.06632420420646667, + "reward": 0.6816341280937195, + "reward_std": 0.48826825618743896, + "frac_reward_zero_std": 0.5, + "completion_length": 400.0, + "kl": 0.47351907938718796, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.016666666666666666, + "step": 2 + }, + { + "loss": 0.00041250139474868774, + "grad_norm": 0.68359375, + "learning_rate": 2.5e-06, + "num_tokens": 26592.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.33522727340459824, + "rewards/reward_format/std": 0.3089452385902405, + "rewards/reward_welfare/mean": 0.125, + "rewards/reward_welfare/std": 0.2314550280570984, + "rewards/reward_fairness/mean": 0.037382133305072784, + "rewards/reward_fairness/std": 0.07023922353982925, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.03359023481607437, + "rewards/reward_composite/std": 0.06231452897191048, + "reward": 0.8607450723648071, + "reward_std": 0.4173068106174469, + "frac_reward_zero_std": 0.75, + "completion_length": 400.0, + "kl": 0.4125128909945488, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.03333333333333333, + "step": 4 + }, + { + "loss": 0.0003954425919800997, + "grad_norm": 0.00665283203125, + "learning_rate": 4.166666666666667e-06, + "num_tokens": 39888.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.53125, + "rewards/reward_format/std": 0.0883883461356163, + "rewards/reward_welfare/mean": 0.0, + "rewards/reward_welfare/std": 0.0, + "rewards/reward_fairness/mean": 0.0, + "rewards/reward_fairness/std": 0.0, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.0, + "rewards/reward_composite/std": 0.0, + "reward": 0.46875, + "reward_std": 0.0625, + "frac_reward_zero_std": 0.75, + "completion_length": 400.0, + "kl": 0.3954422175884247, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.05, + "step": 6 + }, + { + "loss": 0.00040989843546412885, + "grad_norm": 0.013671875, + "learning_rate": 4.995770395678171e-06, + "num_tokens": 53776.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.5, + "rewards/reward_format/std": 0.0, + "rewards/reward_welfare/mean": 0.0, + "rewards/reward_welfare/std": 0.0, + "rewards/reward_fairness/mean": 0.0, + "rewards/reward_fairness/std": 0.0, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.0, + "rewards/reward_composite/std": 0.0, + "reward": 0.5, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 400.0, + "kl": 0.40989840030670166, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.06666666666666667, + "step": 8 + }, + { + "loss": 0.00042488425970077515, + "grad_norm": 0.4921875, + "learning_rate": 4.962019382530521e-06, + "num_tokens": 67664.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.46875, + "rewards/reward_format/std": 0.2651650384068489, + "rewards/reward_welfare/mean": 0.0625, + "rewards/reward_welfare/std": 0.1767766922712326, + "rewards/reward_fairness/mean": 0.02351469174027443, + "rewards/reward_fairness/std": 0.06650959700345993, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.023048987612128258, + "rewards/reward_composite/std": 0.06519238650798798, + "reward": 0.6403136849403381, + "reward_std": 0.40562736988067627, + "frac_reward_zero_std": 0.5, + "completion_length": 400.0, + "kl": 0.42487896233797073, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.08333333333333333, + "step": 10 + }, + { + "loss": 0.0003492364485282451, + "grad_norm": 0.703125, + "learning_rate": 4.894973780788722e-06, + "num_tokens": 81552.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.34659090638160706, + "rewards/reward_format/std": 0.6987431943416595, + "rewards/reward_welfare/mean": 0.1875, + "rewards/reward_welfare/std": 0.408231720328331, + "rewards/reward_fairness/mean": 0.060186946764588356, + "rewards/reward_fairness/std": 0.13657810539007187, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.05276940576732159, + "rewards/reward_composite/std": 0.11823124438524246, + "reward": 0.9538654386997223, + "reward_std": 1.2448847889900208, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.34924405813217163, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1, + "step": 12 + }, + { + "loss": 0.00039254588773474097, + "grad_norm": 0.7890625, + "learning_rate": 4.7955402672006855e-06, + "num_tokens": 95440.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.2755681872367859, + "rewards/reward_format/std": 0.5705045461654663, + "rewards/reward_welfare/mean": 0.1875, + "rewards/reward_welfare/std": 0.408231720328331, + "rewards/reward_fairness/mean": 0.05876787751913071, + "rewards/reward_fairness/std": 0.13999952003359795, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.051001692190766335, + "rewards/reward_composite/std": 0.12243235111236572, + "reward": 1.0217013657093048, + "reward_std": 1.1684027314186096, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.3925560265779495, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.11666666666666667, + "step": 14 + }, + { + "loss": 0.00040383817395195365, + "grad_norm": 0.86328125, + "learning_rate": 4.665063509461098e-06, + "num_tokens": 108736.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.16477272659540176, + "rewards/reward_format/std": 0.6252594292163849, + "rewards/reward_welfare/mean": 0.3125, + "rewards/reward_welfare/std": 0.49022963643074036, + "rewards/reward_fairness/mean": 0.10286042466759682, + "rewards/reward_fairness/std": 0.16851608455181122, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.09322065114974976, + "rewards/reward_composite/std": 0.1486019790172577, + "reward": 1.343808352947235, + "reward_std": 1.4776567816734314, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.40385157614946365, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.13333333333333333, + "step": 16 + }, + { + "loss": 0.0004179440438747406, + "grad_norm": 0.65234375, + "learning_rate": 4.50530798188761e-06, + "num_tokens": 122624.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.34375, + "rewards/reward_format/std": 0.5564062297344208, + "rewards/reward_welfare/mean": 0.125, + "rewards/reward_welfare/std": 0.3535533845424652, + "rewards/reward_fairness/mean": 0.04079132154583931, + "rewards/reward_fairness/std": 0.11537527851760387, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.035708064679056406, + "rewards/reward_composite/std": 0.10099766962230206, + "reward": 0.8577493727207184, + "reward_std": 0.8404987752437592, + "frac_reward_zero_std": 0.25, + "completion_length": 400.0, + "kl": 0.41793932020664215, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.15, + "step": 18 + }, + { + "loss": 0.00039753690361976624, + "grad_norm": 0.75, + "learning_rate": 4.318434103932622e-06, + "num_tokens": 136512.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.35795454680919647, + "rewards/reward_format/std": 0.6613655686378479, + "rewards/reward_welfare/mean": 0.1875, + "rewards/reward_welfare/std": 0.408231720328331, + "rewards/reward_fairness/mean": 0.05811220221221447, + "rewards/reward_fairness/std": 0.1433359570801258, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.04960842803120613, + "rewards/reward_composite/std": 0.11969681829214096, + "reward": 0.9372660517692566, + "reward_std": 0.9138101935386658, + "frac_reward_zero_std": 0.25, + "completion_length": 400.0, + "kl": 0.39755555987358093, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.16666666666666666, + "step": 20 + }, + { + "loss": 0.0003871597582474351, + "grad_norm": 0.005157470703125, + "learning_rate": 4.106969024216348e-06, + "num_tokens": 150104.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.21875, + "rewards/reward_format/std": 0.38816189765930176, + "rewards/reward_welfare/mean": 0.1875, + "rewards/reward_welfare/std": 0.25877460837364197, + "rewards/reward_fairness/mean": 0.09772966802120209, + "rewards/reward_fairness/std": 0.1411271095275879, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.07592316716909409, + "rewards/reward_composite/std": 0.10568810254335403, + "reward": 1.1424028873443604, + "reward_std": 0.9167249202728271, + "frac_reward_zero_std": 0.5, + "completion_length": 400.0, + "kl": 0.3871647119522095, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.18333333333333332, + "step": 22 + }, + { + "loss": 0.00046034157276153564, + "grad_norm": 0.828125, + "learning_rate": 3.8737724451770155e-06, + "num_tokens": 163992.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.053977273404598236, + "rewards/reward_format/std": 0.6971071362495422, + "rewards/reward_welfare/mean": 0.3125, + "rewards/reward_welfare/std": 0.49022963643074036, + "rewards/reward_fairness/mean": 0.11436978727579117, + "rewards/reward_fairness/std": 0.21275469660758972, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.08883418142795563, + "rewards/reward_composite/std": 0.15858761221170425, + "reward": 1.4617266654968262, + "reward_std": 1.6014615297317505, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.46036188304424286, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2, + "step": 24 + }, + { + "loss": 0.00037954188883304596, + "grad_norm": 0.75390625, + "learning_rate": 3.621997950501156e-06, + "num_tokens": 177584.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.11079545319080353, + "rewards/reward_format/std": 0.7605703175067902, + "rewards/reward_welfare/mean": 0.3125, + "rewards/reward_welfare/std": 0.49022963643074036, + "rewards/reward_fairness/mean": 0.13740837946534157, + "rewards/reward_fairness/std": 0.23384775966405869, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.1134110763669014, + "rewards/reward_composite/std": 0.1934959888458252, + "reward": 1.4525240659713745, + "reward_std": 1.4633366465568542, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.3795487657189369, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.21666666666666667, + "step": 26 + }, + { + "loss": 0.00034568458795547485, + "grad_norm": 0.67578125, + "learning_rate": 3.3550503583141726e-06, + "num_tokens": 190880.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.014204561710357666, + "rewards/reward_format/std": 0.45640653371810913, + "rewards/reward_welfare/mean": 0.375, + "rewards/reward_welfare/std": 0.2314550280570984, + "rewards/reward_fairness/mean": 0.17191734910011292, + "rewards/reward_fairness/std": 0.165505051612854, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.12545911967754364, + "rewards/reward_composite/std": 0.09392639249563217, + "reward": 1.6581718921661377, + "reward_std": 0.9023097902536392, + "frac_reward_zero_std": 0.25, + "completion_length": 400.0, + "kl": 0.3456726223230362, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.23333333333333334, + "step": 28 + }, + { + "loss": 0.0004424452781677246, + "grad_norm": 0.79296875, + "learning_rate": 3.0765396768561005e-06, + "num_tokens": 204472.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.05681818723678589, + "rewards/reward_format/std": 0.8016891181468964, + "rewards/reward_welfare/mean": 0.375, + "rewards/reward_welfare/std": 0.49871626496315, + "rewards/reward_fairness/mean": 0.11990131065249443, + "rewards/reward_fairness/std": 0.19920051097869873, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.10109234228730202, + "rewards/reward_composite/std": 0.15720761567354202, + "reward": 1.5391755104064941, + "reward_std": 1.311523675918579, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.44244876503944397, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.25, + "step": 30 + }, + { + "loss": 0.00044108927249908447, + "grad_norm": 0.79296875, + "learning_rate": 2.7902322853130758e-06, + "num_tokens": 218360.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.23011364042758942, + "rewards/reward_format/std": 0.7126934230327606, + "rewards/reward_welfare/mean": 0.25, + "rewards/reward_welfare/std": 0.4355513006448746, + "rewards/reward_fairness/mean": 0.09009831957519054, + "rewards/reward_fairness/std": 0.17519650608301163, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.07756261341273785, + "rewards/reward_composite/std": 0.1447325348854065, + "reward": 1.1875473260879517, + "reward_std": 1.3125466108322144, + "frac_reward_zero_std": 0.25, + "completion_length": 400.0, + "kl": 0.4411006420850754, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.26666666666666666, + "step": 32 + }, + { + "loss": 0.00043725594878196716, + "grad_norm": 0.765625, + "learning_rate": 2.5e-06, + "num_tokens": 232248.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": 0.3238636404275894, + "rewards/reward_format/std": 0.7694187164306641, + "rewards/reward_welfare/mean": 0.5625, + "rewards/reward_welfare/std": 0.5260358452796936, + "rewards/reward_fairness/mean": 0.19406583905220032, + "rewards/reward_fairness/std": 0.20889797061681747, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.1601117104291916, + "rewards/reward_composite/std": 0.16504594683647156, + "reward": 2.2405412197113037, + "reward_std": 1.72732412815094, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.43723437190055847, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2833333333333333, + "step": 34 + }, + { + "loss": 0.0003281831741333008, + "grad_norm": 0.66015625, + "learning_rate": 2.2097677146869242e-06, + "num_tokens": 245840.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.0625, + "rewards/reward_format/std": 0.8398386240005493, + "rewards/reward_welfare/mean": 0.375, + "rewards/reward_welfare/std": 0.5175492167472839, + "rewards/reward_fairness/mean": 0.208244688808918, + "rewards/reward_fairness/std": 0.3558191955089569, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.12120818346738815, + "rewards/reward_composite/std": 0.17605619877576828, + "reward": 1.6419528722763062, + "reward_std": 1.9059234857559204, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.32817772775888443, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.3, + "step": 36 + }, + { + "loss": 0.0003703221445903182, + "grad_norm": 0.65625, + "learning_rate": 1.9234603231439e-06, + "num_tokens": 258840.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.34375, + "rewards/reward_format/std": 0.4355708882212639, + "rewards/reward_welfare/mean": 0.125, + "rewards/reward_welfare/std": 0.2314550280570984, + "rewards/reward_fairness/mean": 0.032129574567079544, + "rewards/reward_fairness/std": 0.06755802780389786, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.0333767905831337, + "rewards/reward_composite/std": 0.07086637616157532, + "reward": 0.8467563986778259, + "reward_std": 0.8185127973556519, + "frac_reward_zero_std": 0.25, + "completion_length": 400.0, + "kl": 0.3703107312321663, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.31666666666666665, + "step": 38 + }, + { + "loss": 0.0003897678107023239, + "grad_norm": 0.76171875, + "learning_rate": 1.6449496416858285e-06, + "num_tokens": 272136.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": 0.07102272659540176, + "rewards/reward_format/std": 0.8448813557624817, + "rewards/reward_welfare/mean": 0.4375, + "rewards/reward_welfare/std": 0.5260358452796936, + "rewards/reward_fairness/mean": 0.13828522339463234, + "rewards/reward_fairness/std": 0.19835777580738068, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.11603889241814613, + "rewards/reward_composite/std": 0.15580761432647705, + "reward": 1.762846827507019, + "reward_std": 1.750555157661438, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.38978311419487, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.3333333333333333, + "step": 40 + }, + { + "loss": 0.00036280229687690735, + "grad_norm": 0.52734375, + "learning_rate": 1.3780020494988447e-06, + "num_tokens": 286024.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.21875, + "rewards/reward_format/std": 0.7038120031356812, + "rewards/reward_welfare/mean": 0.25, + "rewards/reward_welfare/std": 0.4629100561141968, + "rewards/reward_fairness/mean": 0.08049380034208298, + "rewards/reward_fairness/std": 0.14985806494951248, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.06840312853455544, + "rewards/reward_composite/std": 0.12792598456144333, + "reward": 1.1801469326019287, + "reward_std": 1.2528201341629028, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.36278442293405533, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.35, + "step": 42 + }, + { + "loss": 0.00041984766721725464, + "grad_norm": 0.734375, + "learning_rate": 1.1262275548229852e-06, + "num_tokens": 299320.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.125, + "rewards/reward_format/std": 0.7535476386547089, + "rewards/reward_welfare/mean": 0.3125, + "rewards/reward_welfare/std": 0.44403792917728424, + "rewards/reward_fairness/mean": 0.08010485023260117, + "rewards/reward_fairness/std": 0.13119615614414215, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.06627386063337326, + "rewards/reward_composite/std": 0.0994122326374054, + "reward": 1.333878755569458, + "reward_std": 1.2175767719745636, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.4198339805006981, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.36666666666666664, + "step": 44 + }, + { + "loss": 0.00037025846540927887, + "grad_norm": 0.74609375, + "learning_rate": 8.930309757836517e-07, + "num_tokens": 312912.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.15625, + "rewards/reward_format/std": 0.7469770908355713, + "rewards/reward_welfare/mean": 0.3125, + "rewards/reward_welfare/std": 0.44403792917728424, + "rewards/reward_fairness/mean": 0.0987030416727066, + "rewards/reward_fairness/std": 0.13824082165956497, + "rewards/reward_stability/mean": 0.9375, + "rewards/reward_stability/std": 0.1767766922712326, + "rewards/reward_composite/mean": 0.0686455499380827, + "rewards/reward_composite/std": 0.11963466554880142, + "reward": 1.2610985934734344, + "reward_std": 1.1380138397216797, + "frac_reward_zero_std": 0.25, + "completion_length": 400.0, + "kl": 0.3702595606446266, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.38333333333333336, + "step": 46 + }, + { + "loss": 0.00037697795778512955, + "grad_norm": 0.68359375, + "learning_rate": 6.815658960673782e-07, + "num_tokens": 326208.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": 0.07954545319080353, + "rewards/reward_format/std": 0.7752929627895355, + "rewards/reward_welfare/mean": 0.4375, + "rewards/reward_welfare/std": 0.49022963643074036, + "rewards/reward_fairness/mean": 0.1642819568514824, + "rewards/reward_fairness/std": 0.2405308187007904, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.12362180650234222, + "rewards/reward_composite/std": 0.16839426010847092, + "reward": 1.8049492835998535, + "reward_std": 1.6678152084350586, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.3769652917981148, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.4, + "step": 48 + }, + { + "loss": 0.00039646029472351074, + "grad_norm": 0.6953125, + "learning_rate": 4.946920181123904e-07, + "num_tokens": 339800.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.06818181276321411, + "rewards/reward_format/std": 0.7570639848709106, + "rewards/reward_welfare/mean": 0.3125, + "rewards/reward_welfare/std": 0.49022963643074036, + "rewards/reward_fairness/mean": 0.12952633947134018, + "rewards/reward_fairness/std": 0.2085839882493019, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.10791970416903496, + "rewards/reward_composite/std": 0.1719568744301796, + "reward": 1.4817642569541931, + "reward_std": 1.4108701944351196, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.39645931124687195, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.4166666666666667, + "step": 50 + }, + { + "loss": 0.000441722571849823, + "grad_norm": 0.7265625, + "learning_rate": 3.3493649053890325e-07, + "num_tokens": 353688.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": 0.03125, + "rewards/reward_format/std": 0.8107390403747559, + "rewards/reward_welfare/mean": 0.375, + "rewards/reward_welfare/std": 0.5175492167472839, + "rewards/reward_fairness/mean": 0.10219378396868706, + "rewards/reward_fairness/std": 0.1520508974790573, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.09825712814927101, + "rewards/reward_composite/std": 0.1404884159564972, + "reward": 1.6067009568214417, + "reward_std": 1.2484731674194336, + "frac_reward_zero_std": 0.25, + "completion_length": 400.0, + "kl": 0.44170165807008743, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.43333333333333335, + "step": 52 + }, + { + "loss": 0.00037848297506570816, + "grad_norm": 0.796875, + "learning_rate": 2.044597327993153e-07, + "num_tokens": 367280.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": 0.23579545319080353, + "rewards/reward_format/std": 0.7612137198448181, + "rewards/reward_welfare/mean": 0.5625, + "rewards/reward_welfare/std": 0.49022963643074036, + "rewards/reward_fairness/mean": 0.2322249710559845, + "rewards/reward_fairness/std": 0.20398348569869995, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.19660773873329163, + "rewards/reward_composite/std": 0.17612425237894058, + "reward": 2.2271281480789185, + "reward_std": 1.4199119210243225, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.3786723464727402, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.45, + "step": 54 + }, + { + "loss": 0.00040780752897262573, + "grad_norm": 0.65234375, + "learning_rate": 1.0502621921127776e-07, + "num_tokens": 380872.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.0625, + "rewards/reward_format/std": 0.7646470665931702, + "rewards/reward_welfare/mean": 0.3125, + "rewards/reward_welfare/std": 0.49022963643074036, + "rewards/reward_fairness/mean": 0.09930047020316124, + "rewards/reward_fairness/std": 0.18247877806425095, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.08683200180530548, + "rewards/reward_composite/std": 0.15039421617984772, + "reward": 1.4361324906349182, + "reward_std": 1.6369856595993042, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.4077882617712021, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.4666666666666667, + "step": 56 + }, + { + "loss": 0.00038760900497436523, + "grad_norm": 0.74609375, + "learning_rate": 3.798061746947995e-08, + "num_tokens": 394168.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.002840910106897354, + "rewards/reward_format/std": 0.8362354636192322, + "rewards/reward_welfare/mean": 0.375, + "rewards/reward_welfare/std": 0.5175492167472839, + "rewards/reward_fairness/mean": 0.11489119380712509, + "rewards/reward_fairness/std": 0.17957812547683716, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.11398349702358246, + "rewards/reward_composite/std": 0.17128486931324005, + "reward": 1.6010336875915527, + "reward_std": 1.725760817527771, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.3876567706465721, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.48333333333333334, + "step": 58 + }, + { + "loss": 0.00037222355604171753, + "grad_norm": 0.703125, + "learning_rate": 4.229604321829561e-09, + "num_tokens": 407760.0, + "completions/mean_length": 400.0, + "completions/min_length": 400.0, + "completions/max_length": 400.0, + "completions/clipped_ratio": 1.0, + "completions/mean_terminated_length": 0.0, + "completions/min_terminated_length": 0.0, + "completions/max_terminated_length": 0.0, + "rewards/reward_format/mean": -0.46875, + "rewards/reward_format/std": 0.6302918791770935, + "rewards/reward_welfare/mean": 0.125, + "rewards/reward_welfare/std": 0.3535533845424652, + "rewards/reward_fairness/mean": 0.015625, + "rewards/reward_fairness/std": 0.04419417306780815, + "rewards/reward_stability/mean": 1.0, + "rewards/reward_stability/std": 0.0, + "rewards/reward_composite/mean": 0.018206155858933926, + "rewards/reward_composite/std": 0.05149478651583195, + "reward": 0.6900811493396759, + "reward_std": 0.873493492603302, + "frac_reward_zero_std": 0.0, + "completion_length": 400.0, + "kl": 0.3722131997346878, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.5, + "step": 60 + }, + { + "train_runtime": 1880.1016, + "train_samples_per_second": 0.255, + "train_steps_per_second": 0.032, + "total_flos": 0.0, + "train_loss": 0.0003991109939912955, + "epoch": 0.5, + "step": 60 + } + ] +} \ No newline at end of file