commit 5ba183ae8d85ba959dd97c2c7725193e2f5e8fd7 Author: ModelHub XC Date: Fri May 29 00:44:31 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: leonMW/DeepSeek-R1-Distill-Qwen-1.5B-GSPO-Basic Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new 
file mode 100644 index 0000000..66c46bf --- /dev/null +++ b/README.md @@ -0,0 +1,68 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: transformers +model_name: DeepSeek-R1-Distill-Qwen-1.5B-GSPO-Basic +tags: +- generated_from_trainer +- grpo +- trl +licence: license +--- + +# Model Card for DeepSeek-R1-Distill-Qwen-1.5B-GSPO-Basic + +This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="leonMW/DeepSeek-R1-Distill-Qwen-1.5B-GSPO-Basic", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/leonwenderoth-tu-darmstadt/huggingface/runs/6bd0c08q) + + +This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). + +### Framework versions + +- TRL: 0.23.1 +- Transformers: 4.57.1 +- Pytorch: 2.8.0 +- Datasets: 4.4.1 +- Tokenizers: 0.22.1 + +## Citations + +Cite GRPO as: + +```bibtex +@article{shao2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. 
Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, +} + +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..c23f709 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": 0.013345611529686698, + "train_runtime": 16054.0136, + "train_samples": 3053, + "train_samples_per_second": 0.951, + "train_steps_per_second": 0.118 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..05417b8 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + 
'<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..141d783 --- /dev/null +++ b/config.json @@ -0,0 +1,60 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151646, + "dtype": "bfloat16", + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + 
"full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "4.57.1", + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..4b37b43 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,10 @@ +{ + "_from_model_config": true, + "bos_token_id": 151646, + "do_sample": true, + "eos_token_id": 151643, + "pad_token_id": 151643, + "temperature": 0.6, + "top_p": 0.95, + "transformers_version": "4.57.1" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..7a8521a --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6bb2017d2ed8f4ba6a59aacafb73dca01f2e50d806c2124948fa7d9366094f2 +size 3554214752 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..1d385d6 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..e7cd2c1 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a +size 11422959 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..d252dd4 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "<think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "</think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "<tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "</tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": 
null, + "use_default_system_prompt": false +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..c23f709 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": 0.013345611529686698, + "train_runtime": 16054.0136, + "train_samples": 3053, + "train_samples_per_second": 0.951, + "train_steps_per_second": 0.118 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..ac82b74 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,36333 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 1900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.197265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16276.0, + "completions/mean_length": 6760.4375, + "completions/mean_terminated_length": 4395.52294921875, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "entropy": 0.2916708439588547, + "epoch": 0.002631578947368421, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.035724807530641556, + "learning_rate": 1e-06, + "loss": 0.1316, + "num_tokens": 3865152.0, + "reward": 0.07896194607019424, + "reward_std": 0.10644184798002243, + "rewards/progression_diversity/mean": -0.009079374372959137, + "rewards/progression_diversity/std": 0.053816426545381546, + "rewards/symbolic_reward_accuracy/mean": 0.009765625, + "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, + "rewards/symbolic_reward_partial_score/mean": 0.30517578125, + "rewards/symbolic_reward_partial_score/std": 0.25365734100341797, + "rewards/tag_count_reward/mean": 
-0.18359375, + "rewards/tag_count_reward/std": 0.3875311613082886, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0618481636047363, + "sampling/importance_sampling_ratio/min": 0.00013467908138409257, + "sampling/sampling_logp_difference/max": 8.912615776062012, + "sampling/sampling_logp_difference/mean": 0.10774467885494232, + "step": 1 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2698264718055725, + "epoch": 0.005263157894736842, + "grad_norm": 0.032932084053754807, + "learning_rate": 1e-06, + "loss": 0.1813, + "step": 2 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.26824212074279785, + "epoch": 0.007894736842105263, + "grad_norm": 0.0307021327316761, + "learning_rate": 1e-06, + "loss": 0.2115, + "step": 3 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2804792523384094, + "epoch": 0.010526315789473684, + "grad_norm": 0.031359054148197174, + "learning_rate": 1e-06, + "loss": 0.1183, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.166015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15442.0, + "completions/mean_length": 6071.873046875, + "completions/mean_terminated_length": 4019.107666015625, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 0.28796158730983734, + "epoch": 0.013157894736842105, + "frac_reward_zero_std": 0.0, + "grad_norm": 
0.032425932586193085, + "learning_rate": 1e-06, + "loss": 0.1857, + "num_tokens": 7368415.0, + "reward": 0.08325499296188354, + "reward_std": 0.0968296229839325, + "rewards/progression_diversity/mean": -0.004579948727041483, + "rewards/progression_diversity/std": 0.0396733395755291, + "rewards/symbolic_reward_accuracy/mean": 0.005859375, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.3193359375, + "rewards/symbolic_reward_partial_score/std": 0.2464565932750702, + "rewards/tag_count_reward/mean": -0.16015625, + "rewards/tag_count_reward/std": 0.3671095669269562, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0649909973144531, + "sampling/importance_sampling_ratio/min": 4.260127752786502e-07, + "sampling/sampling_logp_difference/max": 14.66879653930664, + "sampling/sampling_logp_difference/mean": 0.11339153349399567, + "step": 5 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2807093560695648, + "epoch": 0.015789473684210527, + "grad_norm": 0.02408398687839508, + "learning_rate": 1e-06, + "loss": 0.1071, + "step": 6 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.28542497754096985, + "epoch": 0.018421052631578946, + "grad_norm": 0.0270535945892334, + "learning_rate": 1e-06, + "loss": 0.1096, + "step": 7 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2942824959754944, + "epoch": 0.021052631578947368, + "grad_norm": 0.023121848702430725, + "learning_rate": 1e-06, + "loss": 0.2167, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, 
+ "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.162109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15805.0, + "completions/mean_length": 5919.259765625, + "completions/mean_terminated_length": 3894.613037109375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.28526371717453003, + "epoch": 0.02368421052631579, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.033250465989112854, + "learning_rate": 1e-06, + "loss": 0.1934, + "num_tokens": 10792196.0, + "reward": 0.08689245581626892, + "reward_std": 0.0983029454946518, + "rewards/progression_diversity/mean": -0.007043677382171154, + "rewards/progression_diversity/std": 0.04186973348259926, + "rewards/symbolic_reward_accuracy/mean": 0.0078125, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.32568359375, + "rewards/symbolic_reward_partial_score/std": 0.24731339514255524, + "rewards/tag_count_reward/mean": -0.154296875, + "rewards/tag_count_reward/std": 0.36158639192581177, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0634013414382935, + "sampling/importance_sampling_ratio/min": 0.0004096508782822639, + "sampling/sampling_logp_difference/max": 7.800205230712891, + "sampling/sampling_logp_difference/mean": 0.11012717336416245, + "step": 9 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2955394983291626, + "epoch": 0.02631578947368421, + "grad_norm": 0.02542749233543873, + "learning_rate": 1e-06, + "loss": 0.1529, + "step": 10 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.2734375, + "entropy": 0.28635716438293457, + "epoch": 0.02894736842105263, + "grad_norm": 0.02619725838303566, + "learning_rate": 1e-06, + "loss": 0.1198, + "step": 11 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.28302156925201416, + "epoch": 0.031578947368421054, + "grad_norm": 0.022730253636837006, + "learning_rate": 1e-06, + "loss": 0.1629, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16322.0, + "completions/mean_length": 6270.1875, + "completions/mean_terminated_length": 3692.156982421875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.2731642723083496, + "epoch": 0.034210526315789476, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.030661821365356445, + "learning_rate": 1e-06, + "loss": 0.1548, + "num_tokens": 14396676.0, + "reward": 0.08138585090637207, + "reward_std": 0.10238391160964966, + "rewards/progression_diversity/mean": -0.00594722805544734, + "rewards/progression_diversity/std": 0.03126208111643791, + "rewards/symbolic_reward_accuracy/mean": 0.0078125, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.3177083432674408, + "rewards/symbolic_reward_partial_score/std": 0.24931958317756653, + "rewards/tag_count_reward/mean": -0.185546875, + "rewards/tag_count_reward/std": 0.38912075757980347, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0586175918579102, + "sampling/importance_sampling_ratio/min": 1.9201727354811737e-06, + "sampling/sampling_logp_difference/max": 
13.163095474243164, + "sampling/sampling_logp_difference/mean": 0.10211914777755737, + "step": 13 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.28947339951992035, + "epoch": 0.03684210526315789, + "grad_norm": 0.024929087609052658, + "learning_rate": 1e-06, + "loss": 0.1721, + "step": 14 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.26907335221767426, + "epoch": 0.039473684210526314, + "grad_norm": 0.03776842728257179, + "learning_rate": 1e-06, + "loss": 0.1558, + "step": 15 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.27320994436740875, + "epoch": 0.042105263157894736, + "grad_norm": 0.02968466654419899, + "learning_rate": 1e-06, + "loss": 0.2034, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.189453125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16088.0, + "completions/mean_length": 6075.619140625, + "completions/mean_terminated_length": 3666.1904296875, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "entropy": 0.2953624576330185, + "epoch": 0.04473684210526316, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03018520586192608, + "learning_rate": 1e-06, + "loss": 0.106, + "num_tokens": 17914529.0, + "reward": 0.08459560573101044, + "reward_std": 0.11007633805274963, + "rewards/progression_diversity/mean": -0.007236707955598831, + "rewards/progression_diversity/std": 0.039540816098451614, + 
"rewards/symbolic_reward_accuracy/mean": 0.013671875, + "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, + "rewards/symbolic_reward_partial_score/mean": 0.3193359375, + "rewards/symbolic_reward_partial_score/std": 0.2542729675769806, + "rewards/tag_count_reward/mean": -0.193359375, + "rewards/tag_count_reward/std": 0.39531853795051575, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.059499979019165, + "sampling/importance_sampling_ratio/min": 5.811744358652504e-06, + "sampling/sampling_logp_difference/max": 12.05562973022461, + "sampling/sampling_logp_difference/mean": 0.10546315461397171, + "step": 17 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2861950248479843, + "epoch": 0.04736842105263158, + "grad_norm": 0.03124823048710823, + "learning_rate": 1e-06, + "loss": 0.1526, + "step": 18 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.27276280522346497, + "epoch": 0.05, + "grad_norm": 0.027596795931458473, + "learning_rate": 1e-06, + "loss": 0.1598, + "step": 19 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2472858428955078, + "epoch": 0.05263157894736842, + "grad_norm": 0.023518428206443787, + "learning_rate": 1e-06, + "loss": 0.2783, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.216796875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14747.0, + "completions/mean_length": 6167.828125, + 
"completions/mean_terminated_length": 3339.910400390625, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "entropy": 0.24753674864768982, + "epoch": 0.05526315789473684, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.033617325127124786, + "learning_rate": 1e-06, + "loss": 0.23, + "num_tokens": 21476745.0, + "reward": 0.08549793064594269, + "reward_std": 0.09992466866970062, + "rewards/progression_diversity/mean": -0.009777872823178768, + "rewards/progression_diversity/std": 0.05313728749752045, + "rewards/symbolic_reward_accuracy/mean": 0.0078125, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.33740234375, + "rewards/symbolic_reward_partial_score/std": 0.24341119825839996, + "rewards/tag_count_reward/mean": -0.203125, + "rewards/tag_count_reward/std": 0.4027182459831238, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.055070161819458, + "sampling/importance_sampling_ratio/min": 9.157028398476541e-05, + "sampling/sampling_logp_difference/max": 9.2984037399292, + "sampling/sampling_logp_difference/mean": 0.09585890173912048, + "step": 21 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.26523521542549133, + "epoch": 0.05789473684210526, + "grad_norm": 0.02759598009288311, + "learning_rate": 1e-06, + "loss": 0.1624, + "step": 22 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.28485071659088135, + "epoch": 0.060526315789473685, + "grad_norm": 0.029536111280322075, + "learning_rate": 1e-06, + "loss": 0.1678, + "step": 23 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0703125, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.274502232670784, + "epoch": 0.06315789473684211, + "grad_norm": 0.028490744531154633, + "learning_rate": 1e-06, + "loss": 0.1451, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.208984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14513.0, + "completions/mean_length": 6365.98828125, + "completions/mean_terminated_length": 3719.25439453125, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.25441184639930725, + "epoch": 0.06578947368421052, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03975823521614075, + "learning_rate": 1e-06, + "loss": 0.2797, + "num_tokens": 25147619.0, + "reward": 0.086149662733078, + "reward_std": 0.10140813887119293, + "rewards/progression_diversity/mean": -0.00808083638548851, + "rewards/progression_diversity/std": 0.04241418465971947, + "rewards/symbolic_reward_accuracy/mean": 0.0078125, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.3395182490348816, + "rewards/symbolic_reward_partial_score/std": 0.24165737628936768, + "rewards/tag_count_reward/mean": -0.203125, + "rewards/tag_count_reward/std": 0.4027182459831238, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0544767379760742, + "sampling/importance_sampling_ratio/min": 3.484850741308776e-10, + "sampling/sampling_logp_difference/max": 21.77742576599121, + "sampling/sampling_logp_difference/mean": 0.09544065594673157, + "step": 25 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.24812303483486176, + "epoch": 
0.06842105263157895, + "grad_norm": 0.026934118941426277, + "learning_rate": 1e-06, + "loss": 0.2254, + "step": 26 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2512124478816986, + "epoch": 0.07105263157894737, + "grad_norm": 0.03495849668979645, + "learning_rate": 1e-06, + "loss": 0.1695, + "step": 27 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.28809893131256104, + "epoch": 0.07368421052631578, + "grad_norm": 0.02224835380911827, + "learning_rate": 1e-06, + "loss": 0.0802, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.169921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15402.0, + "completions/mean_length": 5648.064453125, + "completions/mean_terminated_length": 3450.355224609375, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "entropy": 0.268292635679245, + "epoch": 0.07631578947368421, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.032919030636548996, + "learning_rate": 1e-06, + "loss": 0.2178, + "num_tokens": 28440228.0, + "reward": 0.09082336723804474, + "reward_std": 0.10907775163650513, + "rewards/progression_diversity/mean": -0.0045784548856318, + "rewards/progression_diversity/std": 0.02940884418785572, + "rewards/symbolic_reward_accuracy/mean": 0.01171875, + "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, + "rewards/symbolic_reward_partial_score/mean": 0.3361002802848816, + "rewards/symbolic_reward_partial_score/std": 0.24807466566562653, + "rewards/tag_count_reward/mean": -0.169921875, + "rewards/tag_count_reward/std": 
0.3759314715862274, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0602651834487915, + "sampling/importance_sampling_ratio/min": 0.00284669641405344, + "sampling/sampling_logp_difference/max": 5.86159610748291, + "sampling/sampling_logp_difference/mean": 0.10585412383079529, + "step": 29 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2800886482000351, + "epoch": 0.07894736842105263, + "grad_norm": 0.029134223237633705, + "learning_rate": 1e-06, + "loss": 0.1811, + "step": 30 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2816573828458786, + "epoch": 0.08157894736842106, + "grad_norm": 0.033635616302490234, + "learning_rate": 1e-06, + "loss": 0.127, + "step": 31 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.2971540093421936, + "epoch": 0.08421052631578947, + "grad_norm": 0.023819033056497574, + "learning_rate": 1e-06, + "loss": 0.1382, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.173828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13264.0, + "completions/mean_length": 5243.712890625, + "completions/mean_terminated_length": 2899.775390625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.273567333817482, + "epoch": 0.0868421052631579, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029521364718675613, + "learning_rate": 1e-06, + "loss": 0.1954, + 
"num_tokens": 31522481.0, + "reward": 0.10308191180229187, + "reward_std": 0.1121080070734024, + "rewards/progression_diversity/mean": -0.00919200200587511, + "rewards/progression_diversity/std": 0.04991353675723076, + "rewards/symbolic_reward_accuracy/mean": 0.015625, + "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, + "rewards/symbolic_reward_partial_score/mean": 0.3712565004825592, + "rewards/symbolic_reward_partial_score/std": 0.23796787858009338, + "rewards/tag_count_reward/mean": -0.17578125, + "rewards/tag_count_reward/std": 0.3810062110424042, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0566962957382202, + "sampling/importance_sampling_ratio/min": 1.2478832331908052e-06, + "sampling/sampling_logp_difference/max": 13.594061851501465, + "sampling/sampling_logp_difference/mean": 0.09926008433103561, + "step": 33 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2565145939588547, + "epoch": 0.08947368421052632, + "grad_norm": 0.04480559378862381, + "learning_rate": 1e-06, + "loss": 0.2772, + "step": 34 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2905775457620621, + "epoch": 0.09210526315789473, + "grad_norm": 0.02144668437540531, + "learning_rate": 1e-06, + "loss": 0.0548, + "step": 35 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.28261056542396545, + "epoch": 0.09473684210526316, + "grad_norm": 0.01856975071132183, + "learning_rate": 1e-06, + "loss": 0.1399, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 
0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.208984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15877.0, + "completions/mean_length": 5636.087890625, + "completions/mean_terminated_length": 2796.51611328125, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.27043572068214417, + "epoch": 0.09736842105263158, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.026228487491607666, + "learning_rate": 1e-06, + "loss": 0.1468, + "num_tokens": 34785758.0, + "reward": 0.09762196242809296, + "reward_std": 0.12116604298353195, + "rewards/progression_diversity/mean": -0.008311246521770954, + "rewards/progression_diversity/std": 0.046763546764850616, + "rewards/symbolic_reward_accuracy/mean": 0.01953125, + "rewards/symbolic_reward_accuracy/std": 0.1385180652141571, + "rewards/symbolic_reward_partial_score/mean": 0.35302734375, + "rewards/symbolic_reward_partial_score/std": 0.24829283356666565, + "rewards/tag_count_reward/mean": -0.19921875, + "rewards/tag_count_reward/std": 0.39980348944664, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0502464771270752, + "sampling/importance_sampling_ratio/min": 8.39968015498016e-06, + "sampling/sampling_logp_difference/max": 11.68731689453125, + "sampling/sampling_logp_difference/mean": 0.08828973770141602, + "step": 37 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.26759083569049835, + "epoch": 0.1, + "grad_norm": 0.02601916529238224, + "learning_rate": 1e-06, + "loss": 0.1606, + "step": 38 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2611129730939865, + "epoch": 
0.10263157894736842, + "grad_norm": 0.02105417661368847, + "learning_rate": 1e-06, + "loss": 0.2227, + "step": 39 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.23032942414283752, + "epoch": 0.10526315789473684, + "grad_norm": 0.02530878223478794, + "learning_rate": 1e-06, + "loss": 0.1808, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.23828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13911.0, + "completions/mean_length": 5937.931640625, + "completions/mean_terminated_length": 2670.187255859375, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "entropy": 0.24540461599826813, + "epoch": 0.10789473684210527, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02969300001859665, + "learning_rate": 1e-06, + "loss": 0.1987, + "num_tokens": 38220283.0, + "reward": 0.09507475793361664, + "reward_std": 0.11622758209705353, + "rewards/progression_diversity/mean": -0.014009159058332443, + "rewards/progression_diversity/std": 0.06170212849974632, + "rewards/symbolic_reward_accuracy/mean": 0.01953125, + "rewards/symbolic_reward_accuracy/std": 0.1385180652141571, + "rewards/symbolic_reward_partial_score/mean": 0.3531900942325592, + "rewards/symbolic_reward_partial_score/std": 0.2495628297328949, + "rewards/tag_count_reward/mean": -0.224609375, + "rewards/tag_count_reward/std": 0.41773295402526855, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0466715097427368, + "sampling/importance_sampling_ratio/min": 1.8073988030664623e-05, + "sampling/sampling_logp_difference/max": 10.921036720275879, + "sampling/sampling_logp_difference/mean": 0.08196896314620972, + 
"step": 41 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.24679341912269592, + "epoch": 0.11052631578947368, + "grad_norm": 0.028460616245865822, + "learning_rate": 1e-06, + "loss": 0.234, + "step": 42 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.23327426612377167, + "epoch": 0.11315789473684211, + "grad_norm": 0.030783407390117645, + "learning_rate": 1e-06, + "loss": 0.244, + "step": 43 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2536231279373169, + "epoch": 0.11578947368421053, + "grad_norm": 0.023375684395432472, + "learning_rate": 1e-06, + "loss": 0.1264, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.224609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14565.0, + "completions/mean_length": 5743.708984375, + "completions/mean_terminated_length": 2661.5087890625, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "entropy": 0.23496582359075546, + "epoch": 0.11842105263157894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02578672021627426, + "learning_rate": 1e-06, + "loss": 0.2409, + "num_tokens": 41567718.0, + "reward": 0.08303609490394592, + "reward_std": 0.08830304443836212, + "rewards/progression_diversity/mean": -0.011820437386631966, + "rewards/progression_diversity/std": 0.057891786098480225, + "rewards/symbolic_reward_accuracy/mean": 0.001953125, + "rewards/symbolic_reward_accuracy/std": 
0.04419417306780815, + "rewards/symbolic_reward_partial_score/mean": 0.3468424677848816, + "rewards/symbolic_reward_partial_score/std": 0.23540066182613373, + "rewards/tag_count_reward/mean": -0.220703125, + "rewards/tag_count_reward/std": 0.4151262938976288, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0478100776672363, + "sampling/importance_sampling_ratio/min": 1.1134955002489733e-07, + "sampling/sampling_logp_difference/max": 16.010591506958008, + "sampling/sampling_logp_difference/mean": 0.08461372554302216, + "step": 45 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2394789233803749, + "epoch": 0.12105263157894737, + "grad_norm": 0.024003252387046814, + "learning_rate": 1e-06, + "loss": 0.1561, + "step": 46 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.25297604501247406, + "epoch": 0.12368421052631579, + "grad_norm": 0.022317850962281227, + "learning_rate": 1e-06, + "loss": 0.1457, + "step": 47 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2308100312948227, + "epoch": 0.12631578947368421, + "grad_norm": 0.027584845200181007, + "learning_rate": 1e-06, + "loss": 0.252, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16384.0, + "completions/mean_length": 5191.498046875, + "completions/mean_terminated_length": 2474.871337890625, + 
"completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.2666362375020981, + "epoch": 0.12894736842105264, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.025992998853325844, + "learning_rate": 1e-06, + "loss": 0.1962, + "num_tokens": 44623557.0, + "reward": 0.10004402697086334, + "reward_std": 0.10086282342672348, + "rewards/progression_diversity/mean": -0.015129456296563148, + "rewards/progression_diversity/std": 0.06190123409032822, + "rewards/symbolic_reward_accuracy/mean": 0.013671875, + "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, + "rewards/symbolic_reward_partial_score/mean": 0.3704427182674408, + "rewards/symbolic_reward_partial_score/std": 0.23484936356544495, + "rewards/tag_count_reward/mean": -0.19140625, + "rewards/tag_count_reward/std": 0.3937928080558777, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.048860788345337, + "sampling/importance_sampling_ratio/min": 2.264517434014124e-06, + "sampling/sampling_logp_difference/max": 12.998148918151855, + "sampling/sampling_logp_difference/mean": 0.08639080822467804, + "step": 49 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2490471974015236, + "epoch": 0.13157894736842105, + "grad_norm": 0.021342728286981583, + "learning_rate": 1e-06, + "loss": 0.2177, + "step": 50 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2422972321510315, + "epoch": 0.13421052631578947, + "grad_norm": 0.02412317879498005, + "learning_rate": 1e-06, + "loss": 0.2011, + "step": 51 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.234375, + "entropy": 0.25571010261774063, + "epoch": 0.1368421052631579, + "grad_norm": 0.018414035439491272, + "learning_rate": 1e-06, + "loss": 0.1253, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.228515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15860.0, + "completions/mean_length": 5547.751953125, + "completions/mean_terminated_length": 2338.02783203125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.222146138548851, + "epoch": 0.1394736842105263, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02548670582473278, + "learning_rate": 1e-06, + "loss": 0.2067, + "num_tokens": 47871142.0, + "reward": 0.09103752672672272, + "reward_std": 0.108045294880867, + "rewards/progression_diversity/mean": -0.017341770231723785, + "rewards/progression_diversity/std": 0.06780385971069336, + "rewards/symbolic_reward_accuracy/mean": 0.009765625, + "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, + "rewards/symbolic_reward_partial_score/mean": 0.3548177182674408, + "rewards/symbolic_reward_partial_score/std": 0.23816773295402527, + "rewards/tag_count_reward/mean": -0.2109375, + "rewards/tag_count_reward/std": 0.4083731174468994, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.041557788848877, + "sampling/importance_sampling_ratio/min": 3.7289887586666737e-06, + "sampling/sampling_logp_difference/max": 12.499373435974121, + "sampling/sampling_logp_difference/mean": 0.07308885455131531, + "step": 53 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.22613000124692917, + "epoch": 0.14210526315789473, + "grad_norm": 
0.023181065917015076, + "learning_rate": 1e-06, + "loss": 0.2628, + "step": 54 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2536846548318863, + "epoch": 0.14473684210526316, + "grad_norm": 0.018339237198233604, + "learning_rate": 1e-06, + "loss": 0.0959, + "step": 55 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.22903449833393097, + "epoch": 0.14736842105263157, + "grad_norm": 0.026370180770754814, + "learning_rate": 1e-06, + "loss": 0.2495, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.248046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16286.0, + "completions/mean_length": 5735.234375, + "completions/mean_terminated_length": 2222.524658203125, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "entropy": 0.2157709002494812, + "epoch": 0.15, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02145228534936905, + "learning_rate": 1e-06, + "loss": 0.1847, + "num_tokens": 51218910.0, + "reward": 0.09276480972766876, + "reward_std": 0.1126212552189827, + "rewards/progression_diversity/mean": -0.01062830537557602, + "rewards/progression_diversity/std": 0.049262356013059616, + "rewards/symbolic_reward_accuracy/mean": 0.013671875, + "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, + "rewards/symbolic_reward_partial_score/mean": 0.3681640625, + "rewards/symbolic_reward_partial_score/std": 0.23709788918495178, + "rewards/tag_count_reward/mean": -0.2578125, + "rewards/tag_count_reward/std": 0.43785804510116577, + "sampling/importance_sampling_ratio/max": 
2.0, + "sampling/importance_sampling_ratio/mean": 1.038439154624939, + "sampling/importance_sampling_ratio/min": 1.4415817428670152e-09, + "sampling/sampling_logp_difference/max": 20.357524871826172, + "sampling/sampling_logp_difference/mean": 0.0689418762922287, + "step": 57 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2207200676202774, + "epoch": 0.15263157894736842, + "grad_norm": 0.02863519825041294, + "learning_rate": 1e-06, + "loss": 0.2579, + "step": 58 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.21071892976760864, + "epoch": 0.15526315789473685, + "grad_norm": 0.02227974310517311, + "learning_rate": 1e-06, + "loss": 0.2465, + "step": 59 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.23656637221574783, + "epoch": 0.15789473684210525, + "grad_norm": 0.01663924567401409, + "learning_rate": 1e-06, + "loss": 0.2336, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14423.0, + "completions/mean_length": 5544.6953125, + "completions/mean_terminated_length": 2080.587646484375, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.23988273739814758, + "epoch": 0.16052631578947368, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.027361398562788963, + "learning_rate": 1e-06, + "loss": 0.1632, + "num_tokens": 54458434.0, + "reward": 0.09711417555809021, + 
"reward_std": 0.11729248613119125, + "rewards/progression_diversity/mean": -0.010262314230203629, + "rewards/progression_diversity/std": 0.05750845745205879, + "rewards/symbolic_reward_accuracy/mean": 0.015625, + "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, + "rewards/symbolic_reward_partial_score/mean": 0.3767903745174408, + "rewards/symbolic_reward_partial_score/std": 0.2343655377626419, + "rewards/tag_count_reward/mean": -0.251953125, + "rewards/tag_count_reward/std": 0.43455907702445984, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0386930704116821, + "sampling/importance_sampling_ratio/min": 2.470594381520641e-06, + "sampling/sampling_logp_difference/max": 12.911051750183105, + "sampling/sampling_logp_difference/mean": 0.0692630261182785, + "step": 61 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.22496215999126434, + "epoch": 0.1631578947368421, + "grad_norm": 0.01953737623989582, + "learning_rate": 1e-06, + "loss": 0.2318, + "step": 62 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.23897476494312286, + "epoch": 0.16578947368421051, + "grad_norm": 0.018175503239035606, + "learning_rate": 1e-06, + "loss": 0.1639, + "step": 63 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.19628984481096268, + "epoch": 0.16842105263157894, + "grad_norm": 0.02357460930943489, + "learning_rate": 1e-06, + "loss": 0.3235, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
0.0, + "completions/clipped_ratio": 0.259765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15521.0, + "completions/mean_length": 5778.126953125, + "completions/mean_terminated_length": 2056.277099609375, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 0.22985464334487915, + "epoch": 0.17105263157894737, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02823863923549652, + "learning_rate": 1e-06, + "loss": 0.2111, + "num_tokens": 57850179.0, + "reward": 0.08986608684062958, + "reward_std": 0.10897202044725418, + "rewards/progression_diversity/mean": -0.017297808080911636, + "rewards/progression_diversity/std": 0.06803877651691437, + "rewards/symbolic_reward_accuracy/mean": 0.009765625, + "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, + "rewards/symbolic_reward_partial_score/mean": 0.365234375, + "rewards/symbolic_reward_partial_score/std": 0.23282982409000397, + "rewards/tag_count_reward/mean": -0.25390625, + "rewards/tag_count_reward/std": 0.43567025661468506, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0341827869415283, + "sampling/importance_sampling_ratio/min": 3.349011603859253e-05, + "sampling/sampling_logp_difference/max": 10.30426025390625, + "sampling/sampling_logp_difference/mean": 0.06156843155622482, + "step": 65 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.22416777163743973, + "epoch": 0.1736842105263158, + "grad_norm": 0.026787322014570236, + "learning_rate": 1e-06, + "loss": 0.232, + "step": 66 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.1903344914317131, + "epoch": 0.1763157894736842, + "grad_norm": 
0.026732493191957474, + "learning_rate": 1e-06, + "loss": 0.286, + "step": 67 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.1874106228351593, + "epoch": 0.17894736842105263, + "grad_norm": 0.02596643753349781, + "learning_rate": 1e-06, + "loss": 0.2322, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15584.0, + "completions/mean_length": 5451.146484375, + "completions/mean_terminated_length": 1496.7100830078125, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "entropy": 0.1980566829442978, + "epoch": 0.18157894736842106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02380208671092987, + "learning_rate": 1e-06, + "loss": 0.2229, + "num_tokens": 61031118.0, + "reward": 0.09820634126663208, + "reward_std": 0.12823785841464996, + "rewards/progression_diversity/mean": -0.018233338370919228, + "rewards/progression_diversity/std": 0.07207600027322769, + "rewards/symbolic_reward_accuracy/mean": 0.01953125, + "rewards/symbolic_reward_accuracy/std": 0.1385180652141571, + "rewards/symbolic_reward_partial_score/mean": 0.37744140625, + "rewards/symbolic_reward_partial_score/std": 0.23769913613796234, + "rewards/tag_count_reward/mean": -0.265625, + "rewards/tag_count_reward/std": 0.44209739565849304, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.030740737915039, + "sampling/importance_sampling_ratio/min": 5.1614351832540706e-05, + "sampling/sampling_logp_difference/max": 9.871710777282715, + "sampling/sampling_logp_difference/mean": 0.055819302797317505, + "step": 69 + }, + { + "clip_ratio/high_max": 1.0, + 
"clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2236703857779503, + "epoch": 0.18421052631578946, + "grad_norm": 0.018748750910162926, + "learning_rate": 1e-06, + "loss": 0.19, + "step": 70 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.22987084090709686, + "epoch": 0.1868421052631579, + "grad_norm": 0.021655604243278503, + "learning_rate": 1e-06, + "loss": 0.1942, + "step": 71 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.17527379840612411, + "epoch": 0.18947368421052632, + "grad_norm": 0.025045355781912804, + "learning_rate": 1e-06, + "loss": 0.2901, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15275.0, + "completions/mean_length": 5332.80859375, + "completions/mean_terminated_length": 1335.569091796875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.2162635251879692, + "epoch": 0.19210526315789472, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02276972495019436, + "learning_rate": 1e-06, + "loss": 0.2146, + "num_tokens": 64162156.0, + "reward": 0.10593515634536743, + "reward_std": 0.12418779730796814, + "rewards/progression_diversity/mean": -0.021718915551900864, + "rewards/progression_diversity/std": 0.07438741624355316, + "rewards/symbolic_reward_accuracy/mean": 0.025390625, + "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, + 
"rewards/symbolic_reward_partial_score/mean": 0.3896484375, + "rewards/symbolic_reward_partial_score/std": 0.23622576892375946, + "rewards/tag_count_reward/mean": -0.259765625, + "rewards/tag_count_reward/std": 0.4389347732067108, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0349256992340088, + "sampling/importance_sampling_ratio/min": 1.1178142813150771e-05, + "sampling/sampling_logp_difference/max": 11.40155029296875, + "sampling/sampling_logp_difference/mean": 0.06305186450481415, + "step": 73 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.19458654522895813, + "epoch": 0.19473684210526315, + "grad_norm": 0.020284034311771393, + "learning_rate": 1e-06, + "loss": 0.2623, + "step": 74 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.22814790159463882, + "epoch": 0.19736842105263158, + "grad_norm": 0.017509233206510544, + "learning_rate": 1e-06, + "loss": 0.1189, + "step": 75 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.20635511726140976, + "epoch": 0.2, + "grad_norm": 0.017491551116108894, + "learning_rate": 1e-06, + "loss": 0.2417, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.263671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16298.0, + "completions/mean_length": 5354.87109375, + "completions/mean_terminated_length": 1405.4482421875, + "completions/min_length": 248.0, + 
"completions/min_terminated_length": 248.0, + "entropy": 0.20123746991157532, + "epoch": 0.2026315789473684, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.018690288066864014, + "learning_rate": 1e-06, + "loss": 0.1404, + "num_tokens": 67315338.0, + "reward": 0.09713856130838394, + "reward_std": 0.11651948094367981, + "rewards/progression_diversity/mean": -0.022472595795989037, + "rewards/progression_diversity/std": 0.0805739313364029, + "rewards/symbolic_reward_accuracy/mean": 0.015625, + "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, + "rewards/symbolic_reward_partial_score/mean": 0.3818359375, + "rewards/symbolic_reward_partial_score/std": 0.23188191652297974, + "rewards/tag_count_reward/mean": -0.265625, + "rewards/tag_count_reward/std": 0.44209739565849304, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0267584323883057, + "sampling/importance_sampling_ratio/min": 6.460865731305887e-10, + "sampling/sampling_logp_difference/max": 21.16008758544922, + "sampling/sampling_logp_difference/mean": 0.04901731014251709, + "step": 77 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.19783911854028702, + "epoch": 0.20526315789473684, + "grad_norm": 0.019168507307767868, + "learning_rate": 1e-06, + "loss": 0.2772, + "step": 78 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.18744147568941116, + "epoch": 0.20789473684210527, + "grad_norm": 0.021555200219154358, + "learning_rate": 1e-06, + "loss": 0.2715, + "step": 79 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.18239572644233704, 
+ "epoch": 0.21052631578947367, + "grad_norm": 0.015542907640337944, + "learning_rate": 1e-06, + "loss": 0.2586, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.24609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14582.0, + "completions/mean_length": 4839.45703125, + "completions/mean_terminated_length": 1071.031005859375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "entropy": 0.215584859251976, + "epoch": 0.2131578947368421, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.016407115384936333, + "learning_rate": 1e-06, + "loss": 0.1701, + "num_tokens": 70206132.0, + "reward": 0.10850296914577484, + "reward_std": 0.12094779312610626, + "rewards/progression_diversity/mean": -0.018844161182641983, + "rewards/progression_diversity/std": 0.07712793350219727, + "rewards/symbolic_reward_accuracy/mean": 0.0234375, + "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, + "rewards/symbolic_reward_partial_score/mean": 0.3974609375, + "rewards/symbolic_reward_partial_score/std": 0.22982890903949738, + "rewards/tag_count_reward/mean": -0.24609375, + "rewards/tag_count_reward/std": 0.4311550557613373, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0279490947723389, + "sampling/importance_sampling_ratio/min": 0.00020903853874187917, + "sampling/sampling_logp_difference/max": 8.472991943359375, + "sampling/sampling_logp_difference/mean": 0.05078686401247978, + "step": 81 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.1952200084924698, + "epoch": 0.21578947368421053, + "grad_norm": 0.017001083120703697, + "learning_rate": 1e-06, + "loss": 0.267, + "step": 
82 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.1928211748600006, + "epoch": 0.21842105263157896, + "grad_norm": 0.017411494627594948, + "learning_rate": 1e-06, + "loss": 0.2582, + "step": 83 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.22684676200151443, + "epoch": 0.22105263157894736, + "grad_norm": 0.014094403013586998, + "learning_rate": 1e-06, + "loss": 0.2321, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.16796875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14469.0, + "completions/mean_length": 3569.38671875, + "completions/mean_terminated_length": 982.3990478515625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.24647942930459976, + "epoch": 0.2236842105263158, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.016102120280265808, + "learning_rate": 1e-06, + "loss": 0.1723, + "num_tokens": 72421946.0, + "reward": 0.11379130184650421, + "reward_std": 0.10348815470933914, + "rewards/progression_diversity/mean": -0.01735476776957512, + "rewards/progression_diversity/std": 0.07398708909749985, + "rewards/symbolic_reward_accuracy/mean": 0.013671875, + "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, + "rewards/symbolic_reward_partial_score/mean": 0.4085286259651184, + "rewards/symbolic_reward_partial_score/std": 0.21359524130821228, + "rewards/tag_count_reward/mean": -0.16796875, + "rewards/tag_count_reward/std": 0.374204158782959, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0335094928741455, + "sampling/importance_sampling_ratio/min": 0.000267473777057603, + "sampling/sampling_logp_difference/max": 8.226489067077637, + "sampling/sampling_logp_difference/mean": 0.06112096086144447, + "step": 85 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.2344566285610199, + "epoch": 0.22631578947368422, + "grad_norm": 0.019014785066246986, + "learning_rate": 1e-06, + "loss": 0.2905, + "step": 86 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.24500514566898346, + "epoch": 0.22894736842105262, + "grad_norm": 0.015879755839705467, + "learning_rate": 1e-06, + "loss": 0.2251, + "step": 87 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.25136934220790863, + "epoch": 0.23157894736842105, + "grad_norm": 0.01475497055798769, + "learning_rate": 1e-06, + "loss": 0.1835, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.12109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15631.0, + "completions/mean_length": 2819.962890625, + "completions/mean_terminated_length": 951.1400146484375, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.24405641853809357, + "epoch": 0.23421052631578948, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.024288885295391083, + "learning_rate": 1e-06, + "loss": 0.212, + "num_tokens": 74253031.0, + "reward": 0.13387255370616913, + 
"reward_std": 0.12053509056568146, + "rewards/progression_diversity/mean": -0.011182607151567936, + "rewards/progression_diversity/std": 0.06163937970995903, + "rewards/symbolic_reward_accuracy/mean": 0.029296875, + "rewards/symbolic_reward_accuracy/std": 0.16880230605602264, + "rewards/symbolic_reward_partial_score/mean": 0.4270833134651184, + "rewards/symbolic_reward_partial_score/std": 0.21671809256076813, + "rewards/tag_count_reward/mean": -0.1171875, + "rewards/tag_count_reward/std": 0.32195815443992615, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.041147232055664, + "sampling/importance_sampling_ratio/min": 0.0011610162910073996, + "sampling/sampling_logp_difference/max": 6.758459568023682, + "sampling/sampling_logp_difference/mean": 0.0764530748128891, + "step": 89 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2542632222175598, + "epoch": 0.23684210526315788, + "grad_norm": 0.01583160273730755, + "learning_rate": 1e-06, + "loss": 0.1479, + "step": 90 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.27341794967651367, + "epoch": 0.2394736842105263, + "grad_norm": 0.013665684498846531, + "learning_rate": 1e-06, + "loss": 0.1461, + "step": 91 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.24389638006687164, + "epoch": 0.24210526315789474, + "grad_norm": 0.0186525397002697, + "learning_rate": 1e-06, + "loss": 0.1631, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16204.0, + "completions/mean_length": 3517.671875, + "completions/mean_terminated_length": 992.504638671875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.23533198237419128, + "epoch": 0.24473684210526317, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.018597114831209183, + "learning_rate": 1e-06, + "loss": 0.1957, + "num_tokens": 76442207.0, + "reward": 0.1313730925321579, + "reward_std": 0.13323497772216797, + "rewards/progression_diversity/mean": -0.01698843017220497, + "rewards/progression_diversity/std": 0.07352975755929947, + "rewards/symbolic_reward_accuracy/mean": 0.03515625, + "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, + "rewards/symbolic_reward_partial_score/mean": 0.4254557192325592, + "rewards/symbolic_reward_partial_score/std": 0.2250940501689911, + "rewards/tag_count_reward/mean": -0.171875, + "rewards/tag_count_reward/std": 0.3776407241821289, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0345077514648438, + "sampling/importance_sampling_ratio/min": 2.56181897384522e-06, + "sampling/sampling_logp_difference/max": 12.87479305267334, + "sampling/sampling_logp_difference/mean": 0.06349129974842072, + "step": 93 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.24531496316194534, + "epoch": 0.24736842105263157, + "grad_norm": 0.015834983438253403, + "learning_rate": 1e-06, + "loss": 0.1729, + "step": 94 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.21955663710832596, + "epoch": 0.25, + "grad_norm": 
0.01730695739388466, + "learning_rate": 1e-06, + "loss": 0.2377, + "step": 95 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.21611076593399048, + "epoch": 0.25263157894736843, + "grad_norm": 0.015181013382971287, + "learning_rate": 1e-06, + "loss": 0.1776, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15096.0, + "completions/mean_length": 2637.765625, + "completions/mean_terminated_length": 813.0442504882812, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.25482119619846344, + "epoch": 0.25526315789473686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.014081789180636406, + "learning_rate": 1e-06, + "loss": 0.1076, + "num_tokens": 78194887.0, + "reward": 0.14719998836517334, + "reward_std": 0.14902149140834808, + "rewards/progression_diversity/mean": -0.011447252705693245, + "rewards/progression_diversity/std": 0.06131342053413391, + "rewards/symbolic_reward_accuracy/mean": 0.046875, + "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, + "rewards/symbolic_reward_partial_score/mean": 0.4383138120174408, + "rewards/symbolic_reward_partial_score/std": 0.22779904305934906, + "rewards/tag_count_reward/mean": -0.123046875, + "rewards/tag_count_reward/std": 0.32881227135658264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0369794368743896, + "sampling/importance_sampling_ratio/min": 3.3109614605564275e-08, + "sampling/sampling_logp_difference/max": 17.22344207763672, + "sampling/sampling_logp_difference/mean": 0.06858888268470764, + "step": 97 + }, + { + "clip_ratio/high_max": 1.0, 
+ "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.24463753402233124, + "epoch": 0.2578947368421053, + "grad_norm": 0.06137360259890556, + "learning_rate": 1e-06, + "loss": 0.1628, + "step": 98 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.24564764648675919, + "epoch": 0.26052631578947366, + "grad_norm": 0.01690017245709896, + "learning_rate": 1e-06, + "loss": 0.1851, + "step": 99 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.24623940885066986, + "epoch": 0.2631578947368421, + "grad_norm": 0.013457868248224258, + "learning_rate": 1e-06, + "loss": 0.1721, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.10546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15936.0, + "completions/mean_length": 2532.294921875, + "completions/mean_terminated_length": 899.1244506835938, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "entropy": 0.24528837949037552, + "epoch": 0.2657894736842105, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.014075527898967266, + "learning_rate": 1e-06, + "loss": 0.1249, + "num_tokens": 79901246.0, + "reward": 0.1632891744375229, + "reward_std": 0.15703746676445007, + "rewards/progression_diversity/mean": -0.008974803611636162, + "rewards/progression_diversity/std": 0.05761713534593582, + "rewards/symbolic_reward_accuracy/mean": 0.068359375, + "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, + 
"rewards/symbolic_reward_partial_score/mean": 0.4469400942325592, + "rewards/symbolic_reward_partial_score/std": 0.24197350442409515, + "rewards/tag_count_reward/mean": -0.1171875, + "rewards/tag_count_reward/std": 0.32195815443992615, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.041948676109314, + "sampling/importance_sampling_ratio/min": 1.5186431001623646e-09, + "sampling/sampling_logp_difference/max": 20.305448532104492, + "sampling/sampling_logp_difference/mean": 0.07734344899654388, + "step": 101 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.26768821477890015, + "epoch": 0.26842105263157895, + "grad_norm": 0.014358071610331535, + "learning_rate": 1e-06, + "loss": 0.125, + "step": 102 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2417154163122177, + "epoch": 0.2710526315789474, + "grad_norm": 0.019036587327718735, + "learning_rate": 1e-06, + "loss": 0.221, + "step": 103 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.26473505795001984, + "epoch": 0.2736842105263158, + "grad_norm": 0.014921830035746098, + "learning_rate": 1e-06, + "loss": 0.073, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.080078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5375.0, + "completions/mean_length": 1962.900390625, + "completions/mean_terminated_length": 707.560546875, + "completions/min_length": 172.0, 
+ "completions/min_terminated_length": 172.0, + "entropy": 0.2539156824350357, + "epoch": 0.27631578947368424, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.01651758700609207, + "learning_rate": 1e-06, + "loss": 0.1921, + "num_tokens": 81315211.0, + "reward": 0.15442626178264618, + "reward_std": 0.14438994228839874, + "rewards/progression_diversity/mean": -0.011476235464215279, + "rewards/progression_diversity/std": 0.06588800996541977, + "rewards/symbolic_reward_accuracy/mean": 0.044921875, + "rewards/symbolic_reward_accuracy/std": 0.20733514428138733, + "rewards/symbolic_reward_partial_score/mean": 0.4532877802848816, + "rewards/symbolic_reward_partial_score/std": 0.21151751279830933, + "rewards/tag_count_reward/mean": -0.083984375, + "rewards/tag_count_reward/std": 0.2776356339454651, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0463956594467163, + "sampling/importance_sampling_ratio/min": 1.2895628742626286e-07, + "sampling/sampling_logp_difference/max": 15.863792419433594, + "sampling/sampling_logp_difference/mean": 0.08601226657629013, + "step": 105 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.26433275640010834, + "epoch": 0.2789473684210526, + "grad_norm": 0.012559227645397186, + "learning_rate": 1e-06, + "loss": 0.154, + "step": 106 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.27126292884349823, + "epoch": 0.28157894736842104, + "grad_norm": 0.011530612595379353, + "learning_rate": 1e-06, + "loss": 0.0815, + "step": 107 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 
0.28323958814144135, + "epoch": 0.28421052631578947, + "grad_norm": 0.015649767592549324, + "learning_rate": 1e-06, + "loss": 0.115, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.083984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16114.0, + "completions/mean_length": 2091.642578125, + "completions/mean_terminated_length": 781.255859375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.2563259154558182, + "epoch": 0.2868421052631579, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.01467968337237835, + "learning_rate": 1e-06, + "loss": 0.0872, + "num_tokens": 82791284.0, + "reward": 0.17522379755973816, + "reward_std": 0.14781689643859863, + "rewards/progression_diversity/mean": -0.0069169411435723305, + "rewards/progression_diversity/std": 0.05124044418334961, + "rewards/symbolic_reward_accuracy/mean": 0.06640625, + "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, + "rewards/symbolic_reward_partial_score/mean": 0.4807942509651184, + "rewards/symbolic_reward_partial_score/std": 0.20960326492786407, + "rewards/tag_count_reward/mean": -0.087890625, + "rewards/tag_count_reward/std": 0.2834126651287079, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0412476062774658, + "sampling/importance_sampling_ratio/min": 0.0006217172485776246, + "sampling/sampling_logp_difference/max": 7.383025169372559, + "sampling/sampling_logp_difference/mean": 0.07769767940044403, + "step": 109 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.27861388027668, + "epoch": 0.2894736842105263, + "grad_norm": 0.012335779145359993, + "learning_rate": 1e-06, + 
"loss": 0.0808, + "step": 110 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2279139682650566, + "epoch": 0.29210526315789476, + "grad_norm": 0.01729213260114193, + "learning_rate": 1e-06, + "loss": 0.1974, + "step": 111 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.28070947527885437, + "epoch": 0.29473684210526313, + "grad_norm": 0.014035679399967194, + "learning_rate": 1e-06, + "loss": 0.06, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.068359375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16285.0, + "completions/mean_length": 1909.013671875, + "completions/mean_terminated_length": 846.90771484375, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.2596060037612915, + "epoch": 0.29736842105263156, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.011908638291060925, + "learning_rate": 1e-06, + "loss": 0.1146, + "num_tokens": 84171323.0, + "reward": 0.19407010078430176, + "reward_std": 0.17639988660812378, + "rewards/progression_diversity/mean": -0.011936957947909832, + "rewards/progression_diversity/std": 0.07680150121450424, + "rewards/symbolic_reward_accuracy/mean": 0.091796875, + "rewards/symbolic_reward_accuracy/std": 0.289021372795105, + "rewards/symbolic_reward_partial_score/mean": 0.48779296875, + "rewards/symbolic_reward_partial_score/std": 0.23039613664150238, + "rewards/tag_count_reward/mean": -0.072265625, + "rewards/tag_count_reward/std": 0.2591804563999176, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0448040962219238, + "sampling/importance_sampling_ratio/min": 0.0012326554860919714, + "sampling/sampling_logp_difference/max": 6.69858455657959, + "sampling/sampling_logp_difference/mean": 0.08408962190151215, + "step": 113 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.23410920798778534, + "epoch": 0.3, + "grad_norm": 0.017887957394123077, + "learning_rate": 1e-06, + "loss": 0.229, + "step": 114 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2956717312335968, + "epoch": 0.3026315789473684, + "grad_norm": 0.013839812017977238, + "learning_rate": 1e-06, + "loss": 0.0347, + "step": 115 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.27330218255519867, + "epoch": 0.30526315789473685, + "grad_norm": 0.011987395584583282, + "learning_rate": 1e-06, + "loss": 0.0768, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13832.0, + "completions/mean_length": 1699.599609375, + "completions/mean_terminated_length": 720.6396484375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.26409123837947845, + "epoch": 0.3078947368421053, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.017784157767891884, + "learning_rate": 1e-06, + "loss": 0.1191, + "num_tokens": 85467502.0, + "reward": 0.19023296236991882, + "reward_std": 
0.1661926507949829, + "rewards/progression_diversity/mean": -0.005026046186685562, + "rewards/progression_diversity/std": 0.033581074327230453, + "rewards/symbolic_reward_accuracy/mean": 0.0859375, + "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, + "rewards/symbolic_reward_partial_score/mean": 0.4851887822151184, + "rewards/symbolic_reward_partial_score/std": 0.23089143633842468, + "rewards/tag_count_reward/mean": -0.068359375, + "rewards/tag_count_reward/std": 0.25260838866233826, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0457016229629517, + "sampling/importance_sampling_ratio/min": 1.8673212025532848e-06, + "sampling/sampling_logp_difference/max": 13.19100570678711, + "sampling/sampling_logp_difference/mean": 0.0859871432185173, + "step": 117 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2728601396083832, + "epoch": 0.3105263157894737, + "grad_norm": 0.016618477180600166, + "learning_rate": 1e-06, + "loss": 0.1202, + "step": 118 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2710384130477905, + "epoch": 0.3131578947368421, + "grad_norm": 0.015051553957164288, + "learning_rate": 1e-06, + "loss": 0.1286, + "step": 119 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.281009241938591, + "epoch": 0.3157894736842105, + "grad_norm": 0.01171377208083868, + "learning_rate": 1e-06, + "loss": 0.0637, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1757.0, + "completions/mean_length": 1197.052734375, + "completions/mean_terminated_length": 643.6821899414062, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.2854345440864563, + "epoch": 0.31842105263157894, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.016470612958073616, + "learning_rate": 1e-06, + "loss": 0.0497, + "num_tokens": 86488713.0, + "reward": 0.19141869246959686, + "reward_std": 0.158242329955101, + "rewards/progression_diversity/mean": -0.00363965705037117, + "rewards/progression_diversity/std": 0.03781190514564514, + "rewards/symbolic_reward_accuracy/mean": 0.080078125, + "rewards/symbolic_reward_accuracy/std": 0.271679550409317, + "rewards/symbolic_reward_partial_score/mean": 0.4930012822151184, + "rewards/symbolic_reward_partial_score/std": 0.2159450650215149, + "rewards/tag_count_reward/mean": -0.044921875, + "rewards/tag_count_reward/std": 0.20733514428138733, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0579612255096436, + "sampling/importance_sampling_ratio/min": 0.0007034747395664454, + "sampling/sampling_logp_difference/max": 7.259478569030762, + "sampling/sampling_logp_difference/mean": 0.10833683609962463, + "step": 121 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2941920757293701, + "epoch": 0.32105263157894737, + "grad_norm": 0.01042513083666563, + "learning_rate": 1e-06, + "loss": 0.0543, + "step": 122 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.29098397493362427, + "epoch": 0.3236842105263158, + "grad_norm": 
0.012190199457108974, + "learning_rate": 1e-06, + "loss": 0.0446, + "step": 123 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.27491484582424164, + "epoch": 0.3263157894736842, + "grad_norm": 0.011241449974477291, + "learning_rate": 1e-06, + "loss": 0.0765, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9764.0, + "completions/mean_length": 1113.736328125, + "completions/mean_terminated_length": 652.8631591796875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.276974618434906, + "epoch": 0.32894736842105265, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.012477315962314606, + "learning_rate": 1e-06, + "loss": 0.075, + "num_tokens": 87448258.0, + "reward": 0.2307177186012268, + "reward_std": 0.18792694807052612, + "rewards/progression_diversity/mean": -0.004401450511068106, + "rewards/progression_diversity/std": 0.04930144548416138, + "rewards/symbolic_reward_accuracy/mean": 0.1328125, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.5133463740348816, + "rewards/symbolic_reward_partial_score/std": 0.250105082988739, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0609157085418701, + "sampling/importance_sampling_ratio/min": 9.104550713345816e-08, + "sampling/sampling_logp_difference/max": 16.21190643310547, + "sampling/sampling_logp_difference/mean": 0.11256176233291626, + "step": 125 + }, + { + 
"clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2965611666440964, + "epoch": 0.33157894736842103, + "grad_norm": 0.012660843320190907, + "learning_rate": 1e-06, + "loss": 0.0433, + "step": 126 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.2932172417640686, + "epoch": 0.33421052631578946, + "grad_norm": 0.012092087417840958, + "learning_rate": 1e-06, + "loss": 0.0378, + "step": 127 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.30187714099884033, + "epoch": 0.3368421052631579, + "grad_norm": 0.0075751859694719315, + "learning_rate": 1e-06, + "loss": 0.0291, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1831.0, + "completions/mean_length": 945.626953125, + "completions/mean_terminated_length": 606.6607055664062, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.30951452255249023, + "epoch": 0.3394736842105263, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.015834081918001175, + "learning_rate": 1e-06, + "loss": 0.011, + "num_tokens": 88331555.0, + "reward": 0.24679304659366608, + "reward_std": 0.20520557463169098, + "rewards/progression_diversity/mean": -0.003313412657007575, + "rewards/progression_diversity/std": 0.04354095831513405, + "rewards/symbolic_reward_accuracy/mean": 0.150390625, + "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, + 
"rewards/symbolic_reward_partial_score/mean": 0.5291340947151184, + "rewards/symbolic_reward_partial_score/std": 0.25848349928855896, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0643789768218994, + "sampling/importance_sampling_ratio/min": 0.0014044058043509722, + "sampling/sampling_logp_difference/max": 6.568140983581543, + "sampling/sampling_logp_difference/mean": 0.12036062777042389, + "step": 129 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.3004407584667206, + "epoch": 0.34210526315789475, + "grad_norm": 0.008652472868561745, + "learning_rate": 1e-06, + "loss": 0.0179, + "step": 130 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.28511108458042145, + "epoch": 0.3447368421052632, + "grad_norm": 0.011511228047311306, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 131 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2902694195508957, + "epoch": 0.3473684210526316, + "grad_norm": 0.011798321269452572, + "learning_rate": 1e-06, + "loss": 0.0542, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 944.07421875, + "completions/mean_terminated_length": 605.0738525390625, + "completions/min_length": 233.0, + 
"completions/min_terminated_length": 233.0, + "entropy": 0.2803247421979904, + "epoch": 0.35, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.01663512922823429, + "learning_rate": 1e-06, + "loss": 0.0881, + "num_tokens": 89217385.0, + "reward": 0.2798116207122803, + "reward_std": 0.22263681888580322, + "rewards/progression_diversity/mean": -0.002236358355730772, + "rewards/progression_diversity/std": 0.03316602483391762, + "rewards/symbolic_reward_accuracy/mean": 0.1953125, + "rewards/symbolic_reward_accuracy/std": 0.3968288004398346, + "rewards/symbolic_reward_partial_score/mean": 0.5499674081802368, + "rewards/symbolic_reward_partial_score/std": 0.2764817774295807, + "rewards/tag_count_reward/mean": -0.0234375, + "rewards/tag_count_reward/std": 0.15143637359142303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0633987188339233, + "sampling/importance_sampling_ratio/min": 5.007424078939948e-06, + "sampling/sampling_logp_difference/max": 12.204588890075684, + "sampling/sampling_logp_difference/mean": 0.11828067898750305, + "step": 133 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.3119271844625473, + "epoch": 0.3526315789473684, + "grad_norm": 0.014867684803903103, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 134 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.289058193564415, + "epoch": 0.35526315789473684, + "grad_norm": 0.009648671373724937, + "learning_rate": 1e-06, + "loss": 0.0339, + "step": 135 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.29756639897823334, + 
"epoch": 0.35789473684210527, + "grad_norm": 0.011878606863319874, + "learning_rate": 1e-06, + "loss": 0.0322, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15970.0, + "completions/mean_length": 799.517578125, + "completions/mean_terminated_length": 645.824462890625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.3042868822813034, + "epoch": 0.3605263157894737, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.012307998724281788, + "learning_rate": 1e-06, + "loss": 0.0108, + "num_tokens": 90016530.0, + "reward": 0.32415273785591125, + "reward_std": 0.21440261602401733, + "rewards/progression_diversity/mean": -0.0017208305653184652, + "rewards/progression_diversity/std": 0.024005113169550896, + "rewards/symbolic_reward_accuracy/mean": 0.248046875, + "rewards/symbolic_reward_accuracy/std": 0.4323015511035919, + "rewards/symbolic_reward_partial_score/mean": 0.5896810293197632, + "rewards/symbolic_reward_partial_score/std": 0.2770000696182251, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0701053142547607, + "sampling/importance_sampling_ratio/min": 0.000937148230150342, + "sampling/sampling_logp_difference/max": 6.9726691246032715, + "sampling/sampling_logp_difference/mean": 0.13084274530410767, + "step": 137 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.31592294573783875, + "epoch": 0.3631578947368421, + "grad_norm": 0.013102422468364239, + "learning_rate": 1e-06, + "loss": 
-0.0013, + "step": 138 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.2929012179374695, + "epoch": 0.36578947368421055, + "grad_norm": 0.012112054973840714, + "learning_rate": 1e-06, + "loss": 0.0511, + "step": 139 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.30165381729602814, + "epoch": 0.3684210526315789, + "grad_norm": 0.011927537620067596, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14312.0, + "completions/mean_length": 908.201171875, + "completions/mean_terminated_length": 631.2981567382812, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.3008659780025482, + "epoch": 0.37105263157894736, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.012268126010894775, + "learning_rate": 1e-06, + "loss": 0.0135, + "num_tokens": 90879481.0, + "reward": 0.31674230098724365, + "reward_std": 0.2281985729932785, + "rewards/progression_diversity/mean": -0.0005779473576694727, + "rewards/progression_diversity/std": 0.0075342655181884766, + "rewards/symbolic_reward_accuracy/mean": 0.240234375, + "rewards/symbolic_reward_accuracy/std": 0.4276435375213623, + "rewards/symbolic_reward_partial_score/mean": 0.5831705927848816, + "rewards/symbolic_reward_partial_score/std": 0.28232038021087646, + "rewards/tag_count_reward/mean": -0.0234375, + "rewards/tag_count_reward/std": 0.15143637359142303, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0659434795379639, + "sampling/importance_sampling_ratio/min": 0.00015740895469207317, + "sampling/sampling_logp_difference/max": 8.75666332244873, + "sampling/sampling_logp_difference/mean": 0.12366581708192825, + "step": 141 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.2782522588968277, + "epoch": 0.3736842105263158, + "grad_norm": 0.01116311363875866, + "learning_rate": 1e-06, + "loss": 0.0528, + "step": 142 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.3003556579351425, + "epoch": 0.3763157894736842, + "grad_norm": 0.010078194551169872, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 143 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.28932641446590424, + "epoch": 0.37894736842105264, + "grad_norm": 0.007731087971478701, + "learning_rate": 1e-06, + "loss": 0.0039, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1726.0, + "completions/mean_length": 1018.0078125, + "completions/mean_terminated_length": 586.0321044921875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.2875199466943741, + "epoch": 0.3815789473684211, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.015733782202005386, + "learning_rate": 1e-06, + "loss": 0.0633, + "num_tokens": 91793821.0, + "reward": 0.33732670545578003, + 
"reward_std": 0.2401069700717926, + "rewards/progression_diversity/mean": -0.0026801489293575287, + "rewards/progression_diversity/std": 0.03682376444339752, + "rewards/symbolic_reward_accuracy/mean": 0.2734375, + "rewards/symbolic_reward_accuracy/std": 0.4461594223976135, + "rewards/symbolic_reward_partial_score/mean": 0.58935546875, + "rewards/symbolic_reward_partial_score/std": 0.29994940757751465, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0609219074249268, + "sampling/importance_sampling_ratio/min": 0.0008631742675788701, + "sampling/sampling_logp_difference/max": 7.054893970489502, + "sampling/sampling_logp_difference/mean": 0.11308446526527405, + "step": 145 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.28733527660369873, + "epoch": 0.38421052631578945, + "grad_norm": 0.012077763676643372, + "learning_rate": 1e-06, + "loss": 0.0592, + "step": 146 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.29585976898670197, + "epoch": 0.3868421052631579, + "grad_norm": 0.013086505234241486, + "learning_rate": 1e-06, + "loss": 0.0457, + "step": 147 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.3003746271133423, + "epoch": 0.3894736842105263, + "grad_norm": 0.010912450030446053, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2308.0, + "completions/mean_length": 811.55078125, + "completions/mean_terminated_length": 595.695068359375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.3026224821805954, + "epoch": 0.39210526315789473, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.012824556790292263, + "learning_rate": 1e-06, + "loss": 0.0151, + "num_tokens": 92619159.0, + "reward": 0.3143312335014343, + "reward_std": 0.18259009718894958, + "rewards/progression_diversity/mean": -0.0024227979592978954, + "rewards/progression_diversity/std": 0.03060256503522396, + "rewards/symbolic_reward_accuracy/mean": 0.232421875, + "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, + "rewards/symbolic_reward_partial_score/mean": 0.5882161259651184, + "rewards/symbolic_reward_partial_score/std": 0.2912585139274597, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.069498062133789, + "sampling/importance_sampling_ratio/min": 0.0012500947341322899, + "sampling/sampling_logp_difference/max": 6.684535980224609, + "sampling/sampling_logp_difference/mean": 0.1314665824174881, + "step": 149 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.2950987070798874, + "epoch": 0.39473684210526316, + "grad_norm": 0.010317061096429825, + "learning_rate": 1e-06, + "loss": 0.0318, + "step": 150 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2954738140106201, + "epoch": 0.3973684210526316, + "grad_norm": 
0.014257904142141342, + "learning_rate": 1e-06, + "loss": 0.0424, + "step": 151 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.3058134615421295, + "epoch": 0.4, + "grad_norm": 0.00726959016174078, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2396.0, + "completions/mean_length": 805.603515625, + "completions/mean_terminated_length": 589.6653442382812, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.30552275478839874, + "epoch": 0.4026315789473684, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03944050893187523, + "learning_rate": 1e-06, + "loss": 0.0249, + "num_tokens": 93433612.0, + "reward": 0.3065668046474457, + "reward_std": 0.22543519735336304, + "rewards/progression_diversity/mean": -0.0025011475663632154, + "rewards/progression_diversity/std": 0.029947273433208466, + "rewards/symbolic_reward_accuracy/mean": 0.220703125, + "rewards/symbolic_reward_accuracy/std": 0.4151262938976288, + "rewards/symbolic_reward_partial_score/mean": 0.5870768427848816, + "rewards/symbolic_reward_partial_score/std": 0.2680220305919647, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.070122241973877, + "sampling/importance_sampling_ratio/min": 3.465719419182278e-05, + "sampling/sampling_logp_difference/max": 10.270005226135254, + "sampling/sampling_logp_difference/mean": 0.1315053105354309, + "step": 153 + }, + { + "clip_ratio/high_max": 1.0, + 
"clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.3053535670042038, + "epoch": 0.4052631578947368, + "grad_norm": 0.011440174654126167, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 154 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.3068731129169464, + "epoch": 0.40789473684210525, + "grad_norm": 0.009822564199566841, + "learning_rate": 1e-06, + "loss": 0.0338, + "step": 155 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.2979227900505066, + "epoch": 0.4105263157894737, + "grad_norm": 0.01283957902342081, + "learning_rate": 1e-06, + "loss": 0.0337, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1515.0, + "completions/mean_length": 763.703125, + "completions/mean_terminated_length": 578.4822387695312, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.31284308433532715, + "epoch": 0.4131578947368421, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.015819117426872253, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 94233588.0, + "reward": 0.3703927993774414, + "reward_std": 0.24595896899700165, + "rewards/progression_diversity/mean": -0.0017375880852341652, + "rewards/progression_diversity/std": 0.024503301829099655, + "rewards/symbolic_reward_accuracy/mean": 0.30859375, + "rewards/symbolic_reward_accuracy/std": 0.4623647928237915, + 
"rewards/symbolic_reward_partial_score/mean": 0.6233723759651184, + "rewards/symbolic_reward_partial_score/std": 0.3047916889190674, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0701302289962769, + "sampling/importance_sampling_ratio/min": 8.33201596606159e-08, + "sampling/sampling_logp_difference/max": 16.300575256347656, + "sampling/sampling_logp_difference/mean": 0.13046219944953918, + "step": 157 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.29947347939014435, + "epoch": 0.41578947368421054, + "grad_norm": 0.012457345612347126, + "learning_rate": 1e-06, + "loss": 0.0385, + "step": 158 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.2942707985639572, + "epoch": 0.41842105263157897, + "grad_norm": 0.007380845490843058, + "learning_rate": 1e-06, + "loss": 0.0188, + "step": 159 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.2961069345474243, + "epoch": 0.42105263157894735, + "grad_norm": 0.009989511221647263, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1410.0, + "completions/mean_length": 733.279296875, + "completions/mean_terminated_length": 578.9329223632812, + "completions/min_length": 
255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.29683390259742737, + "epoch": 0.4236842105263158, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.012312391772866249, + "learning_rate": 1e-06, + "loss": 0.0078, + "num_tokens": 95011011.0, + "reward": 0.3097507655620575, + "reward_std": 0.21708126366138458, + "rewards/progression_diversity/mean": -0.0014869628939777613, + "rewards/progression_diversity/std": 0.020555421710014343, + "rewards/symbolic_reward_accuracy/mean": 0.224609375, + "rewards/symbolic_reward_accuracy/std": 0.41773295402526855, + "rewards/symbolic_reward_partial_score/mean": 0.5891926884651184, + "rewards/symbolic_reward_partial_score/std": 0.28653624653816223, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0722882747650146, + "sampling/importance_sampling_ratio/min": 0.0011683765333145857, + "sampling/sampling_logp_difference/max": 6.752140045166016, + "sampling/sampling_logp_difference/mean": 0.1345098316669464, + "step": 161 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.29501038789749146, + "epoch": 0.4263157894736842, + "grad_norm": 0.011208836920559406, + "learning_rate": 1e-06, + "loss": 0.0448, + "step": 162 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.31285424530506134, + "epoch": 0.42894736842105263, + "grad_norm": 0.01194053515791893, + "learning_rate": 1e-06, + "loss": 0.0047, + "step": 163 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 
0.2996787875890732, + "epoch": 0.43157894736842106, + "grad_norm": 0.0087357759475708, + "learning_rate": 1e-06, + "loss": 0.0212, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12065.0, + "completions/mean_length": 683.697265625, + "completions/mean_terminated_length": 591.1611328125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.31017276644706726, + "epoch": 0.4342105263157895, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.014013568870723248, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 95761384.0, + "reward": 0.3427630066871643, + "reward_std": 0.21610036492347717, + "rewards/progression_diversity/mean": -0.0010415659053251147, + "rewards/progression_diversity/std": 0.01706579513847828, + "rewards/symbolic_reward_accuracy/mean": 0.263671875, + "rewards/symbolic_reward_accuracy/std": 0.4410543739795685, + "rewards/symbolic_reward_partial_score/mean": 0.6165364980697632, + "rewards/symbolic_reward_partial_score/std": 0.2920151948928833, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0745972394943237, + "sampling/importance_sampling_ratio/min": 0.000583315675612539, + "sampling/sampling_logp_difference/max": 7.446782112121582, + "sampling/sampling_logp_difference/mean": 0.1369236409664154, + "step": 165 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.29873988032341003, + "epoch": 0.4368421052631579, + "grad_norm": 0.012282563373446465, + "learning_rate": 1e-06, + 
"loss": 0.0157, + "step": 166 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.3020479381084442, + "epoch": 0.4394736842105263, + "grad_norm": 0.00869888998568058, + "learning_rate": 1e-06, + "loss": 0.018, + "step": 167 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.3060583770275116, + "epoch": 0.4421052631578947, + "grad_norm": 0.008922635577619076, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1920.0, + "completions/mean_length": 705.625, + "completions/mean_terminated_length": 551.0059204101562, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.3060384541749954, + "epoch": 0.44473684210526315, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.018640100955963135, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 96506920.0, + "reward": 0.44208988547325134, + "reward_std": 0.26640045642852783, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.3984375, + "rewards/symbolic_reward_accuracy/std": 0.4900552034378052, + "rewards/symbolic_reward_partial_score/mean": 0.6800130009651184, + "rewards/symbolic_reward_partial_score/std": 0.3060167729854584, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0716135501861572, + 
"sampling/importance_sampling_ratio/min": 0.0021636190358549356, + "sampling/sampling_logp_difference/max": 6.13597297668457, + "sampling/sampling_logp_difference/mean": 0.1341308355331421, + "step": 169 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.29913921654224396, + "epoch": 0.4473684210526316, + "grad_norm": 0.007306138519197702, + "learning_rate": 1e-06, + "loss": 0.0337, + "step": 170 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.30154837667942047, + "epoch": 0.45, + "grad_norm": 0.016574641689658165, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 171 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.30604830384254456, + "epoch": 0.45263157894736844, + "grad_norm": 0.013307604938745499, + "learning_rate": 1e-06, + "loss": -0.0038, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1516.0, + "completions/mean_length": 626.103515625, + "completions/mean_terminated_length": 533.2279052734375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.30806519091129303, + "epoch": 0.45526315789473687, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.015151865780353546, + "learning_rate": 1e-06, + "loss": 0.0198, + "num_tokens": 97206429.0, + "reward": 0.41551151871681213, + "reward_std": 0.19223448634147644, + "rewards/progression_diversity/mean": 
-0.0015838092658668756, + "rewards/progression_diversity/std": 0.0340169332921505, + "rewards/symbolic_reward_accuracy/mean": 0.35546875, + "rewards/symbolic_reward_accuracy/std": 0.47912323474884033, + "rewards/symbolic_reward_partial_score/mean": 0.6767578125, + "rewards/symbolic_reward_partial_score/std": 0.2906571328639984, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0748977661132812, + "sampling/importance_sampling_ratio/min": 0.00040466117206960917, + "sampling/sampling_logp_difference/max": 7.812460422515869, + "sampling/sampling_logp_difference/mean": 0.1398119330406189, + "step": 173 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3104834258556366, + "epoch": 0.45789473684210524, + "grad_norm": 0.01459193229675293, + "learning_rate": 1e-06, + "loss": 0.0219, + "step": 174 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.30411429703235626, + "epoch": 0.4605263157894737, + "grad_norm": 0.010530464351177216, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 175 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.3101528435945511, + "epoch": 0.4631578947368421, + "grad_norm": 0.01191193237900734, + "learning_rate": 1e-06, + "loss": 0.014, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 
16384.0, + "completions/max_terminated_length": 1359.0, + "completions/mean_length": 706.248046875, + "completions/mean_terminated_length": 551.6351318359375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.3114381581544876, + "epoch": 0.46578947368421053, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.016358286142349243, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 97972860.0, + "reward": 0.3487420082092285, + "reward_std": 0.22794780135154724, + "rewards/progression_diversity/mean": -0.003728086594492197, + "rewards/progression_diversity/std": 0.046275150030851364, + "rewards/symbolic_reward_accuracy/mean": 0.271484375, + "rewards/symbolic_reward_accuracy/std": 0.44516023993492126, + "rewards/symbolic_reward_partial_score/mean": 0.6222330331802368, + "rewards/symbolic_reward_partial_score/std": 0.2960527837276459, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0715268850326538, + "sampling/importance_sampling_ratio/min": 2.043495624093339e-05, + "sampling/sampling_logp_difference/max": 10.798263549804688, + "sampling/sampling_logp_difference/mean": 0.1340111345052719, + "step": 177 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.29386352002620697, + "epoch": 0.46842105263157896, + "grad_norm": 0.013762143440544605, + "learning_rate": 1e-06, + "loss": 0.0507, + "step": 178 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.3153984248638153, + "epoch": 0.4710526315789474, + "grad_norm": 0.007509378716349602, + "learning_rate": 1e-06, + "loss": 0.0019, + "step": 179 + }, + { + 
"clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3007982224225998, + "epoch": 0.47368421052631576, + "grad_norm": 0.012465434148907661, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1206.0, + "completions/mean_length": 710.966796875, + "completions/mean_terminated_length": 525.1205444335938, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "entropy": 0.31515413522720337, + "epoch": 0.4763157894736842, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.014978190883994102, + "learning_rate": 1e-06, + "loss": 0.0238, + "num_tokens": 98750187.0, + "reward": 0.40842536091804504, + "reward_std": 0.22665637731552124, + "rewards/progression_diversity/mean": -0.002191566862165928, + "rewards/progression_diversity/std": 0.033938221633434296, + "rewards/symbolic_reward_accuracy/mean": 0.35546875, + "rewards/symbolic_reward_accuracy/std": 0.47912323474884033, + "rewards/symbolic_reward_partial_score/mean": 0.6531575918197632, + "rewards/symbolic_reward_partial_score/std": 0.31162047386169434, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0704665184020996, + "sampling/importance_sampling_ratio/min": 0.000455824047094211, + "sampling/sampling_logp_difference/max": 7.693403720855713, + "sampling/sampling_logp_difference/mean": 0.13239729404449463, + "step": 181 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1796875, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.30028870701789856, + "epoch": 0.4789473684210526, + "grad_norm": 0.008311889134347439, + "learning_rate": 1e-06, + "loss": 0.0028, + "step": 182 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.3118121027946472, + "epoch": 0.48157894736842105, + "grad_norm": 0.009793510660529137, + "learning_rate": 1e-06, + "loss": 0.0052, + "step": 183 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.3031664490699768, + "epoch": 0.4842105263157895, + "grad_norm": 0.010256760753691196, + "learning_rate": 1e-06, + "loss": 0.0198, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1673.0, + "completions/mean_length": 563.029296875, + "completions/mean_terminated_length": 532.0684814453125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.3078819364309311, + "epoch": 0.4868421052631579, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.01614123024046421, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 99453114.0, + "reward": 0.39682406187057495, + "reward_std": 0.22970899939537048, + "rewards/progression_diversity/mean": -0.00021465322060976177, + "rewards/progression_diversity/std": 0.00485704792663455, + "rewards/symbolic_reward_accuracy/mean": 0.330078125, + "rewards/symbolic_reward_accuracy/std": 0.47070086002349854, + "rewards/symbolic_reward_partial_score/mean": 0.66259765625, + 
"rewards/symbolic_reward_partial_score/std": 0.28932827711105347, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.078159213066101, + "sampling/importance_sampling_ratio/min": 5.380706716096029e-05, + "sampling/sampling_logp_difference/max": 9.830105781555176, + "sampling/sampling_logp_difference/mean": 0.14422446489334106, + "step": 185 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.3147430270910263, + "epoch": 0.48947368421052634, + "grad_norm": 0.011492928490042686, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 186 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.3014184534549713, + "epoch": 0.4921052631578947, + "grad_norm": 0.009395534172654152, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 187 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.30802275240421295, + "epoch": 0.49473684210526314, + "grad_norm": 0.010077573359012604, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1267.0, + "completions/max_terminated_length": 1267.0, + "completions/mean_length": 525.791015625, + "completions/mean_terminated_length": 525.791015625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.30758512020111084, + "epoch": 0.49736842105263157, 
+ "frac_reward_zero_std": 0.0, + "grad_norm": 0.014145960099995136, + "learning_rate": 1e-06, + "loss": -0.008, + "num_tokens": 100124943.0, + "reward": 0.44975587725639343, + "reward_std": 0.23507720232009888, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.396484375, + "rewards/symbolic_reward_accuracy/std": 0.4896455705165863, + "rewards/symbolic_reward_partial_score/mean": 0.7062174677848816, + "rewards/symbolic_reward_partial_score/std": 0.28074365854263306, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0785939693450928, + "sampling/importance_sampling_ratio/min": 6.271932943491265e-05, + "sampling/sampling_logp_difference/max": 9.676840782165527, + "sampling/sampling_logp_difference/mean": 0.1481301635503769, + "step": 189 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.3067742586135864, + "epoch": 0.5, + "grad_norm": 0.00825838465243578, + "learning_rate": 1e-06, + "loss": 0.0034, + "step": 190 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.31375977396965027, + "epoch": 0.5026315789473684, + "grad_norm": 0.014494653791189194, + "learning_rate": 1e-06, + "loss": 0.0033, + "step": 191 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.3070768415927887, + "epoch": 0.5052631578947369, + "grad_norm": 0.00879518873989582, + "learning_rate": 1e-06, + "loss": 0.0032, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 
0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1541.0, + "completions/mean_length": 595.572265625, + "completions/mean_terminated_length": 533.6569213867188, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.3034219741821289, + "epoch": 0.5078947368421053, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.013428542762994766, + "learning_rate": 1e-06, + "loss": 0.0122, + "num_tokens": 100829172.0, + "reward": 0.4348069429397583, + "reward_std": 0.23271231353282928, + "rewards/progression_diversity/mean": -0.000752636871766299, + "rewards/progression_diversity/std": 0.017030227929353714, + "rewards/symbolic_reward_accuracy/mean": 0.380859375, + "rewards/symbolic_reward_accuracy/std": 0.48607301712036133, + "rewards/symbolic_reward_partial_score/mean": 0.6896159052848816, + "rewards/symbolic_reward_partial_score/std": 0.3018878102302551, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0762717723846436, + "sampling/importance_sampling_ratio/min": 1.2637668987736106e-05, + "sampling/sampling_logp_difference/max": 11.278828620910645, + "sampling/sampling_logp_difference/mean": 0.1418013572692871, + "step": 193 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.3115580826997757, + "epoch": 0.5105263157894737, + "grad_norm": 0.010667307302355766, + "learning_rate": 1e-06, + "loss": -0.0013, + "step": 194 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
0.40625, + "entropy": 0.307052806019783, + "epoch": 0.5131578947368421, + "grad_norm": 0.013530107215046883, + "learning_rate": 1e-06, + "loss": -0.0067, + "step": 195 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.3012283742427826, + "epoch": 0.5157894736842106, + "grad_norm": 0.01216200552880764, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1240.0, + "completions/mean_length": 580.47265625, + "completions/mean_terminated_length": 518.498046875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.3057010918855667, + "epoch": 0.5184210526315789, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.014998500235378742, + "learning_rate": 1e-06, + "loss": 0.0221, + "num_tokens": 101530182.0, + "reward": 0.415574848651886, + "reward_std": 0.25428831577301025, + "rewards/progression_diversity/mean": -0.00013565561675932258, + "rewards/progression_diversity/std": 0.003069536527618766, + "rewards/symbolic_reward_accuracy/mean": 0.3515625, + "rewards/symbolic_reward_accuracy/std": 0.4779251217842102, + "rewards/symbolic_reward_partial_score/mean": 0.68408203125, + "rewards/symbolic_reward_partial_score/std": 0.2947002053260803, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0763626098632812, + "sampling/importance_sampling_ratio/min": 0.0009694536565802991, + "sampling/sampling_logp_difference/max": 6.938777923583984, + 
"sampling/sampling_logp_difference/mean": 0.14132827520370483, + "step": 197 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.30432167649269104, + "epoch": 0.5210526315789473, + "grad_norm": 0.009974686428904533, + "learning_rate": 1e-06, + "loss": -0.0018, + "step": 198 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.3102835714817047, + "epoch": 0.5236842105263158, + "grad_norm": 0.013394089415669441, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 199 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.3013733774423599, + "epoch": 0.5263157894736842, + "grad_norm": 0.007236282341182232, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15202.0, + "completions/mean_length": 670.62109375, + "completions/mean_terminated_length": 546.8936767578125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.295871302485466, + "epoch": 0.5289473684210526, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.013714026659727097, + "learning_rate": 1e-06, + "loss": 0.053, + "num_tokens": 102291044.0, + "reward": 0.40374255180358887, + "reward_std": 0.2347850203514099, + "rewards/progression_diversity/mean": -0.0017245247727259994, + "rewards/progression_diversity/std": 0.027910111472010612, + 
"rewards/symbolic_reward_accuracy/mean": 0.337890625, + "rewards/symbolic_reward_accuracy/std": 0.4734536409378052, + "rewards/symbolic_reward_partial_score/mean": 0.6739909052848816, + "rewards/symbolic_reward_partial_score/std": 0.28590893745422363, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0722577571868896, + "sampling/importance_sampling_ratio/min": 0.0002544128510635346, + "sampling/sampling_logp_difference/max": 8.276552200317383, + "sampling/sampling_logp_difference/mean": 0.13411201536655426, + "step": 201 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.31462037563323975, + "epoch": 0.531578947368421, + "grad_norm": 0.011020858772099018, + "learning_rate": 1e-06, + "loss": -0.0013, + "step": 202 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.3070591688156128, + "epoch": 0.5342105263157895, + "grad_norm": 0.008980306796729565, + "learning_rate": 1e-06, + "loss": 0.0029, + "step": 203 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.30502383410930634, + "epoch": 0.5368421052631579, + "grad_norm": 0.012147264555096626, + "learning_rate": 1e-06, + "loss": 0.0231, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1157.0, + 
"completions/mean_length": 577.3671875, + "completions/mean_terminated_length": 515.3804321289062, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.30741482973098755, + "epoch": 0.5394736842105263, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.010803978890180588, + "learning_rate": 1e-06, + "loss": 0.0152, + "num_tokens": 102999648.0, + "reward": 0.4042305052280426, + "reward_std": 0.18462583422660828, + "rewards/progression_diversity/mean": -0.0017548013711348176, + "rewards/progression_diversity/std": 0.034413598477840424, + "rewards/symbolic_reward_accuracy/mean": 0.33203125, + "rewards/symbolic_reward_accuracy/std": 0.47140273451805115, + "rewards/symbolic_reward_partial_score/mean": 0.6847330927848816, + "rewards/symbolic_reward_partial_score/std": 0.27975600957870483, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.077283263206482, + "sampling/importance_sampling_ratio/min": 0.0026723169721663, + "sampling/sampling_logp_difference/max": 5.924809455871582, + "sampling/sampling_logp_difference/mean": 0.14251382648944855, + "step": 205 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.3081301152706146, + "epoch": 0.5421052631578948, + "grad_norm": 0.007183319889008999, + "learning_rate": 1e-06, + "loss": -0.0016, + "step": 206 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.3076689690351486, + "epoch": 0.5447368421052632, + "grad_norm": 0.01192345842719078, + "learning_rate": 1e-06, + "loss": -0.0043, + "step": 207 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 
0.1796875, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.3079235255718231, + "epoch": 0.5473684210526316, + "grad_norm": 0.007967538200318813, + "learning_rate": 1e-06, + "loss": 0.0191, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 527.55859375, + "completions/mean_terminated_length": 496.52838134765625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.316678062081337, + "epoch": 0.55, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.012876125983893871, + "learning_rate": 1e-06, + "loss": 0.0032, + "num_tokens": 103652030.0, + "reward": 0.5482417941093445, + "reward_std": 0.24300527572631836, + "rewards/progression_diversity/mean": -4.2574582039378583e-05, + "rewards/progression_diversity/std": 0.0009633527952246368, + "rewards/symbolic_reward_accuracy/mean": 0.52734375, + "rewards/symbolic_reward_accuracy/std": 0.49974003434181213, + "rewards/symbolic_reward_partial_score/mean": 0.7734375, + "rewards/symbolic_reward_partial_score/std": 0.28300556540489197, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.078352689743042, + "sampling/importance_sampling_ratio/min": 0.000436106463894248, + "sampling/sampling_logp_difference/max": 7.737624168395996, + "sampling/sampling_logp_difference/mean": 0.1448683738708496, + "step": 209 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 
0.31302228569984436, + "epoch": 0.5526315789473685, + "grad_norm": 0.011758224107325077, + "learning_rate": 1e-06, + "loss": 0.0025, + "step": 210 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.30750422179698944, + "epoch": 0.5552631578947368, + "grad_norm": 0.007878407835960388, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 211 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2995268553495407, + "epoch": 0.5578947368421052, + "grad_norm": 0.010879729874432087, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1264.0, + "completions/mean_length": 612.904296875, + "completions/mean_terminated_length": 519.950927734375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "entropy": 0.2970995306968689, + "epoch": 0.5605263157894737, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.013787680305540562, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 104361645.0, + "reward": 0.44565433263778687, + "reward_std": 0.21747955679893494, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.388671875, + "rewards/symbolic_reward_accuracy/std": 0.4879252314567566, + "rewards/symbolic_reward_partial_score/mean": 0.7120768427848816, + "rewards/symbolic_reward_partial_score/std": 0.2797081768512726, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 
0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0732554197311401, + "sampling/importance_sampling_ratio/min": 6.989557732595131e-05, + "sampling/sampling_logp_difference/max": 9.56850814819336, + "sampling/sampling_logp_difference/mean": 0.13793891668319702, + "step": 213 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.31382423639297485, + "epoch": 0.5631578947368421, + "grad_norm": 0.011133058927953243, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 214 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.3052579313516617, + "epoch": 0.5657894736842105, + "grad_norm": 0.01070347335189581, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 215 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.2916347533464432, + "epoch": 0.5684210526315789, + "grad_norm": 0.008687763474881649, + "learning_rate": 1e-06, + "loss": 0.0397, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3699.0, + "completions/max_terminated_length": 3699.0, + "completions/mean_length": 499.142578125, + "completions/mean_terminated_length": 499.142578125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.3144657015800476, + "epoch": 0.5710526315789474, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.012763739563524723, + "learning_rate": 1e-06, + "loss": 0.0021, + 
"num_tokens": 105019350.0, + "reward": 0.44731447100639343, + "reward_std": 0.19935737550258636, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.380859375, + "rewards/symbolic_reward_accuracy/std": 0.48607301712036133, + "rewards/symbolic_reward_partial_score/mean": 0.7293294668197632, + "rewards/symbolic_reward_partial_score/std": 0.2697790563106537, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.07834792137146, + "sampling/importance_sampling_ratio/min": 0.001173157594166696, + "sampling/sampling_logp_difference/max": 6.748056411743164, + "sampling/sampling_logp_difference/mean": 0.14621981978416443, + "step": 217 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3075670599937439, + "epoch": 0.5736842105263158, + "grad_norm": 0.006303088273853064, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 218 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.3094196021556854, + "epoch": 0.5763157894736842, + "grad_norm": 0.011260257102549076, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 219 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3103173077106476, + "epoch": 0.5789473684210527, + "grad_norm": 0.010579501278698444, + "learning_rate": 1e-06, + "loss": 0.0026, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1140.0, + "completions/mean_length": 595.380859375, + "completions/mean_terminated_length": 502.3241882324219, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.30067962408065796, + "epoch": 0.5815789473684211, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.014474834315478802, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 105736857.0, + "reward": 0.38732582330703735, + "reward_std": 0.18756034970283508, + "rewards/progression_diversity/mean": -0.0027724693063646555, + "rewards/progression_diversity/std": 0.04055400565266609, + "rewards/symbolic_reward_accuracy/mean": 0.30078125, + "rewards/symbolic_reward_accuracy/std": 0.45904624462127686, + "rewards/symbolic_reward_partial_score/mean": 0.69091796875, + "rewards/symbolic_reward_partial_score/std": 0.26963144540786743, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0751022100448608, + "sampling/importance_sampling_ratio/min": 0.00019873835844919086, + "sampling/sampling_logp_difference/max": 8.523521423339844, + "sampling/sampling_logp_difference/mean": 0.13958622515201569, + "step": 221 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.3127920478582382, + "epoch": 0.5842105263157895, + "grad_norm": 0.010545584373176098, + "learning_rate": 1e-06, + "loss": 0.006, + "step": 222 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.3065568804740906, + "epoch": 
0.5868421052631579, + "grad_norm": 0.008446205407381058, + "learning_rate": 1e-06, + "loss": -0.0092, + "step": 223 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.31316129863262177, + "epoch": 0.5894736842105263, + "grad_norm": 0.006929857190698385, + "learning_rate": 1e-06, + "loss": 0.0128, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 950.0, + "completions/mean_length": 520.384765625, + "completions/mean_terminated_length": 489.34051513671875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.3118240088224411, + "epoch": 0.5921052631578947, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.013699988834559917, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 106365054.0, + "reward": 0.5598512887954712, + "reward_std": 0.22764000296592712, + "rewards/progression_diversity/mean": -0.0012044012546539307, + "rewards/progression_diversity/std": 0.02724718675017357, + "rewards/symbolic_reward_accuracy/mean": 0.5390625, + "rewards/symbolic_reward_accuracy/std": 0.4989593029022217, + "rewards/symbolic_reward_partial_score/mean": 0.7893880605697632, + "rewards/symbolic_reward_partial_score/std": 0.2728375494480133, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0779380798339844, + "sampling/importance_sampling_ratio/min": 0.0002490824554115534, + "sampling/sampling_logp_difference/max": 8.29772663116455, + "sampling/sampling_logp_difference/mean": 0.1458706259727478, + 
"step": 225 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.310558557510376, + "epoch": 0.5947368421052631, + "grad_norm": 0.008256916888058186, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 226 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.3150336593389511, + "epoch": 0.5973684210526315, + "grad_norm": 0.007905205711722374, + "learning_rate": 1e-06, + "loss": 0.0038, + "step": 227 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.3053455650806427, + "epoch": 0.6, + "grad_norm": 0.011195844039320946, + "learning_rate": 1e-06, + "loss": -0.0038, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1189.0, + "completions/max_terminated_length": 1189.0, + "completions/mean_length": 490.607421875, + "completions/mean_terminated_length": 490.607421875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.3094610720872879, + "epoch": 0.6026315789473684, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.015274910256266594, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 107001365.0, + "reward": 0.535107433795929, + "reward_std": 0.21647313237190247, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.5, + "rewards/symbolic_reward_accuracy/std": 0.5004889965057373, + "rewards/symbolic_reward_partial_score/mean": 
0.7843424677848816, + "rewards/symbolic_reward_partial_score/std": 0.2582632303237915, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0789885520935059, + "sampling/importance_sampling_ratio/min": 3.823011411441257e-06, + "sampling/sampling_logp_difference/max": 12.474472045898438, + "sampling/sampling_logp_difference/mean": 0.14612674713134766, + "step": 229 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.310358390212059, + "epoch": 0.6052631578947368, + "grad_norm": 0.009558123536407948, + "learning_rate": 1e-06, + "loss": 0.0028, + "step": 230 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.3124067932367325, + "epoch": 0.6078947368421053, + "grad_norm": 0.007349732331931591, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 231 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.3077686280012131, + "epoch": 0.6105263157894737, + "grad_norm": 0.010673035867512226, + "learning_rate": 1e-06, + "loss": -0.0025, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1112.0, + "completions/max_terminated_length": 1112.0, + "completions/mean_length": 490.693359375, + "completions/mean_terminated_length": 490.693359375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 
0.3138282001018524, + "epoch": 0.6131578947368421, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.012326201424002647, + "learning_rate": 1e-06, + "loss": -0.0036, + "num_tokens": 107653080.0, + "reward": 0.42856448888778687, + "reward_std": 0.21210803091526031, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.357421875, + "rewards/symbolic_reward_accuracy/std": 0.4797092080116272, + "rewards/symbolic_reward_partial_score/mean": 0.71435546875, + "rewards/symbolic_reward_partial_score/std": 0.26391804218292236, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.079890489578247, + "sampling/importance_sampling_ratio/min": 7.049969281069934e-05, + "sampling/sampling_logp_difference/max": 9.55990219116211, + "sampling/sampling_logp_difference/mean": 0.14701983332633972, + "step": 233 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.30451372265815735, + "epoch": 0.6157894736842106, + "grad_norm": 0.009723720140755177, + "learning_rate": 1e-06, + "loss": 0.0015, + "step": 234 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.30988383293151855, + "epoch": 0.618421052631579, + "grad_norm": 0.008871919475495815, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 235 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.3084511011838913, + "epoch": 0.6210526315789474, + "grad_norm": 0.006157966796308756, + 
"learning_rate": 1e-06, + "loss": 0.0038, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 527.09375, + "completions/mean_terminated_length": 496.0626220703125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.3075174540281296, + "epoch": 0.6236842105263158, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.014929900877177715, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 108329928.0, + "reward": 0.47631800174713135, + "reward_std": 0.18285810947418213, + "rewards/progression_diversity/mean": -3.653179373941384e-05, + "rewards/progression_diversity/std": 0.0008266201475635171, + "rewards/symbolic_reward_accuracy/mean": 0.416015625, + "rewards/symbolic_reward_accuracy/std": 0.493378221988678, + "rewards/symbolic_reward_partial_score/mean": 0.75634765625, + "rewards/symbolic_reward_partial_score/std": 0.2513831555843353, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0761387348175049, + "sampling/importance_sampling_ratio/min": 0.00018604067736305296, + "sampling/sampling_logp_difference/max": 8.589545249938965, + "sampling/sampling_logp_difference/mean": 0.14305877685546875, + "step": 237 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.30407415330410004, + "epoch": 0.6263157894736842, + "grad_norm": 0.010024651885032654, + "learning_rate": 1e-06, + "loss": 0.0205, + "step": 238 + }, + { + "clip_ratio/high_max": 1.0, + 
"clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.29691192507743835, + "epoch": 0.6289473684210526, + "grad_norm": 0.008623898029327393, + "learning_rate": 1e-06, + "loss": 0.0025, + "step": 239 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.3079955577850342, + "epoch": 0.631578947368421, + "grad_norm": 0.012140064500272274, + "learning_rate": 1e-06, + "loss": 0.0027, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12821.0, + "completions/mean_length": 544.341796875, + "completions/mean_terminated_length": 513.3444213867188, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.3042522668838501, + "epoch": 0.6342105263157894, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.014113095588982105, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 108989399.0, + "reward": 0.5777234435081482, + "reward_std": 0.18902507424354553, + "rewards/progression_diversity/mean": -0.0010965826222673059, + "rewards/progression_diversity/std": 0.024812830612063408, + "rewards/symbolic_reward_accuracy/mean": 0.560546875, + "rewards/symbolic_reward_accuracy/std": 0.49680593609809875, + "rewards/symbolic_reward_partial_score/mean": 0.8059896230697632, + "rewards/symbolic_reward_partial_score/std": 0.2615269124507904, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0745680332183838, + 
"sampling/importance_sampling_ratio/min": 4.572784291667631e-06, + "sampling/sampling_logp_difference/max": 12.295388221740723, + "sampling/sampling_logp_difference/mean": 0.14019736647605896, + "step": 241 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.3048563450574875, + "epoch": 0.6368421052631579, + "grad_norm": 0.006459313910454512, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 242 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.30121709406375885, + "epoch": 0.6394736842105263, + "grad_norm": 0.016183165833353996, + "learning_rate": 1e-06, + "loss": 0.0208, + "step": 243 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.29656779766082764, + "epoch": 0.6421052631578947, + "grad_norm": 0.010587197728455067, + "learning_rate": 1e-06, + "loss": 0.0169, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13638.0, + "completions/max_terminated_length": 13638.0, + "completions/mean_length": 520.732421875, + "completions/mean_terminated_length": 520.732421875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.3021259307861328, + "epoch": 0.6447368421052632, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.013214373961091042, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 109673678.0, + "reward": 0.42620930075645447, + "reward_std": 0.1578913778066635, + "rewards/progression_diversity/mean": 
-0.0011426578275859356, + "rewards/progression_diversity/std": 0.02516518160700798, + "rewards/symbolic_reward_accuracy/mean": 0.341796875, + "rewards/symbolic_reward_accuracy/std": 0.4747757613658905, + "rewards/symbolic_reward_partial_score/mean": 0.73779296875, + "rewards/symbolic_reward_partial_score/std": 0.23799775540828705, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.075998306274414, + "sampling/importance_sampling_ratio/min": 0.0007776974816806614, + "sampling/sampling_logp_difference/max": 7.159173011779785, + "sampling/sampling_logp_difference/mean": 0.14280164241790771, + "step": 245 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.3004751205444336, + "epoch": 0.6473684210526316, + "grad_norm": 0.00852108933031559, + "learning_rate": 1e-06, + "loss": 0.0024, + "step": 246 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3045208901166916, + "epoch": 0.65, + "grad_norm": 0.01242540031671524, + "learning_rate": 1e-06, + "loss": 0.0037, + "step": 247 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.2974863350391388, + "epoch": 0.6526315789473685, + "grad_norm": 0.011423774063587189, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, 
+ "completions/max_terminated_length": 1103.0, + "completions/mean_length": 529.646484375, + "completions/mean_terminated_length": 498.620361328125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.3006265312433243, + "epoch": 0.6552631578947369, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.013644593767821789, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 110364185.0, + "reward": 0.4431152641773224, + "reward_std": 0.1808246374130249, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.365234375, + "rewards/symbolic_reward_accuracy/std": 0.4819667339324951, + "rewards/symbolic_reward_partial_score/mean": 0.7472330927848816, + "rewards/symbolic_reward_partial_score/std": 0.22773893177509308, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0767557621002197, + "sampling/importance_sampling_ratio/min": 0.0005200820742174983, + "sampling/sampling_logp_difference/max": 7.561523914337158, + "sampling/sampling_logp_difference/mean": 0.1415756642818451, + "step": 249 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.30361469089984894, + "epoch": 0.6578947368421053, + "grad_norm": 0.007451191544532776, + "learning_rate": 1e-06, + "loss": 0.0067, + "step": 250 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.30292457342147827, + "epoch": 0.6605263157894737, + "grad_norm": 0.011732818558812141, + "learning_rate": 1e-06, + "loss": -0.002, + "step": 251 + }, + { + "clip_ratio/high_max": 0.5, + 
"clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.30056828260421753, + "epoch": 0.6631578947368421, + "grad_norm": 0.006881546229124069, + "learning_rate": 1e-06, + "loss": 0.0062, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1639.0, + "completions/mean_length": 606.2890625, + "completions/mean_terminated_length": 513.2966918945312, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.29490475356578827, + "epoch": 0.6657894736842105, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.017659462988376617, + "learning_rate": 1e-06, + "loss": 0.0046, + "num_tokens": 111082765.0, + "reward": 0.4332265555858612, + "reward_std": 0.18931256234645844, + "rewards/progression_diversity/mean": -0.0025423369370400906, + "rewards/progression_diversity/std": 0.03626888990402222, + "rewards/symbolic_reward_accuracy/mean": 0.357421875, + "rewards/symbolic_reward_accuracy/std": 0.4797092080116272, + "rewards/symbolic_reward_partial_score/mean": 0.7325845956802368, + "rewards/symbolic_reward_partial_score/std": 0.2550489902496338, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0724674463272095, + "sampling/importance_sampling_ratio/min": 6.315225618891418e-05, + "sampling/sampling_logp_difference/max": 9.669961929321289, + "sampling/sampling_logp_difference/mean": 0.13574518263339996, + "step": 253 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.4140625, + "entropy": 0.292779803276062, + "epoch": 0.6684210526315789, + "grad_norm": 0.008213960565626621, + "learning_rate": 1e-06, + "loss": 0.0173, + "step": 254 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.29811522364616394, + "epoch": 0.6710526315789473, + "grad_norm": 0.01159645989537239, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 255 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.3002711534500122, + "epoch": 0.6736842105263158, + "grad_norm": 0.01329131331294775, + "learning_rate": 1e-06, + "loss": 0.0049, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1511.0, + "completions/mean_length": 602.314453125, + "completions/mean_terminated_length": 509.29864501953125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.2865429222583771, + "epoch": 0.6763157894736842, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.012785503640770912, + "learning_rate": 1e-06, + "loss": 0.0296, + "num_tokens": 111793454.0, + "reward": 0.5003780126571655, + "reward_std": 0.21868066489696503, + "rewards/progression_diversity/mean": -0.0012658978812396526, + "rewards/progression_diversity/std": 0.02685299515724182, + "rewards/symbolic_reward_accuracy/mean": 0.453125, + "rewards/symbolic_reward_accuracy/std": 0.4982847273349762, + "rewards/symbolic_reward_partial_score/mean": 0.7630208730697632, + "rewards/symbolic_reward_partial_score/std": 0.26299402117729187, + 
"rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0711028575897217, + "sampling/importance_sampling_ratio/min": 0.00016783220053184777, + "sampling/sampling_logp_difference/max": 8.692545890808105, + "sampling/sampling_logp_difference/mean": 0.13469204306602478, + "step": 257 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2916171997785568, + "epoch": 0.6789473684210526, + "grad_norm": 0.00957430712878704, + "learning_rate": 1e-06, + "loss": 0.0051, + "step": 258 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.29992710053920746, + "epoch": 0.6815789473684211, + "grad_norm": 0.009682898409664631, + "learning_rate": 1e-06, + "loss": -0.0027, + "step": 259 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.3045227527618408, + "epoch": 0.6842105263157895, + "grad_norm": 0.010915652848780155, + "learning_rate": 1e-06, + "loss": -0.0018, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1019.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 499.568359375, + "completions/mean_terminated_length": 499.568359375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.29767170548439026, + "epoch": 0.6868421052631579, + "frac_reward_zero_std": 0.09375, 
+ "grad_norm": 0.013380862772464752, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 112431025.0, + "reward": 0.503369152545929, + "reward_std": 0.20027483999729156, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.455078125, + "rewards/symbolic_reward_accuracy/std": 0.4984649419784546, + "rewards/symbolic_reward_partial_score/mean": 0.7683919072151184, + "rewards/symbolic_reward_partial_score/std": 0.2547670304775238, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.077869176864624, + "sampling/importance_sampling_ratio/min": 8.587339107180014e-05, + "sampling/sampling_logp_difference/max": 9.36263656616211, + "sampling/sampling_logp_difference/mean": 0.14571282267570496, + "step": 261 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.30793944001197815, + "epoch": 0.6894736842105263, + "grad_norm": 0.009117713198065758, + "learning_rate": 1e-06, + "loss": -0.0024, + "step": 262 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.3103470057249069, + "epoch": 0.6921052631578948, + "grad_norm": 0.008808186277747154, + "learning_rate": 1e-06, + "loss": -0.002, + "step": 263 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.29520297050476074, + "epoch": 0.6947368421052632, + "grad_norm": 0.012514298781752586, + "learning_rate": 1e-06, + "loss": 0.0058, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1385.0, + "completions/mean_length": 547.052734375, + "completions/mean_terminated_length": 484.94708251953125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.29729560017585754, + "epoch": 0.6973684210526315, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.01363955345004797, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 113121260.0, + "reward": 0.5068749189376831, + "reward_std": 0.15882600843906403, + "rewards/progression_diversity/mean": -0.000983412959612906, + "rewards/progression_diversity/std": 0.021069984883069992, + "rewards/symbolic_reward_accuracy/mean": 0.447265625, + "rewards/symbolic_reward_accuracy/std": 0.4976975917816162, + "rewards/symbolic_reward_partial_score/mean": 0.7957357168197632, + "rewards/symbolic_reward_partial_score/std": 0.22917680442333221, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0737191438674927, + "sampling/importance_sampling_ratio/min": 0.0028329859487712383, + "sampling/sampling_logp_difference/max": 5.866424083709717, + "sampling/sampling_logp_difference/mean": 0.1374402940273285, + "step": 265 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.3016761392354965, + "epoch": 0.7, + "grad_norm": 0.00674022501334548, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 266 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.375, + "entropy": 0.30257420241832733, + "epoch": 0.7026315789473684, + "grad_norm": 0.0077163465321063995, + "learning_rate": 1e-06, + "loss": 0.0044, + "step": 267 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2949662506580353, + "epoch": 0.7052631578947368, + "grad_norm": 0.008304405026137829, + "learning_rate": 1e-06, + "loss": 0.0184, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 940.0, + "completions/max_terminated_length": 940.0, + "completions/mean_length": 497.12890625, + "completions/mean_terminated_length": 497.12890625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.2958240211009979, + "epoch": 0.7078947368421052, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.01024286262691021, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 113745070.0, + "reward": 0.5867675542831421, + "reward_std": 0.1438578963279724, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.5703125, + "rewards/symbolic_reward_accuracy/std": 0.4955156147480011, + "rewards/symbolic_reward_partial_score/mean": 0.8152669668197632, + "rewards/symbolic_reward_partial_score/std": 0.25280916690826416, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0769691467285156, + "sampling/importance_sampling_ratio/min": 4.805782373296097e-05, + "sampling/sampling_logp_difference/max": 9.943105697631836, + "sampling/sampling_logp_difference/mean": 0.1453056037425995, + 
"step": 269 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.3069736659526825, + "epoch": 0.7105263157894737, + "grad_norm": 0.010676093399524689, + "learning_rate": 1e-06, + "loss": 0.0034, + "step": 270 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2979844808578491, + "epoch": 0.7131578947368421, + "grad_norm": 0.007469307165592909, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 271 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2992859333753586, + "epoch": 0.7157894736842105, + "grad_norm": 0.005593322217464447, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1640.0, + "completions/mean_length": 567.21484375, + "completions/mean_terminated_length": 505.1882629394531, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.2983289957046509, + "epoch": 0.718421052631579, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.014190707355737686, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 114454972.0, + "reward": 0.4660017192363739, + "reward_std": 0.15957707166671753, + "rewards/progression_diversity/mean": -0.001394656952470541, + "rewards/progression_diversity/std": 0.022416062653064728, + "rewards/symbolic_reward_accuracy/mean": 0.400390625, + "rewards/symbolic_reward_accuracy/std": 
0.4904567301273346, + "rewards/symbolic_reward_partial_score/mean": 0.75390625, + "rewards/symbolic_reward_partial_score/std": 0.24977906048297882, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.07451593875885, + "sampling/importance_sampling_ratio/min": 0.0007800398161634803, + "sampling/sampling_logp_difference/max": 7.156165599822998, + "sampling/sampling_logp_difference/mean": 0.13903775811195374, + "step": 273 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.3065921664237976, + "epoch": 0.7210526315789474, + "grad_norm": 0.005344115197658539, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 274 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.299715131521225, + "epoch": 0.7236842105263158, + "grad_norm": 0.007578641176223755, + "learning_rate": 1e-06, + "loss": -0.0033, + "step": 275 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2972417175769806, + "epoch": 0.7263157894736842, + "grad_norm": 0.00895113404840231, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13534.0, + "completions/mean_length": 611.857421875, + "completions/mean_terminated_length": 518.8978271484375, + 
"completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.2999434769153595, + "epoch": 0.7289473684210527, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.011687182821333408, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 115203251.0, + "reward": 0.5210317373275757, + "reward_std": 0.19981878995895386, + "rewards/progression_diversity/mean": -0.001316323410719633, + "rewards/progression_diversity/std": 0.021824264898896217, + "rewards/symbolic_reward_accuracy/mean": 0.47265625, + "rewards/symbolic_reward_accuracy/std": 0.49974003434181213, + "rewards/symbolic_reward_partial_score/mean": 0.7921549677848816, + "rewards/symbolic_reward_partial_score/std": 0.24679899215698242, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.070167899131775, + "sampling/importance_sampling_ratio/min": 0.001091643120162189, + "sampling/sampling_logp_difference/max": 6.820071220397949, + "sampling/sampling_logp_difference/mean": 0.1327057182788849, + "step": 277 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.2990414947271347, + "epoch": 0.7315789473684211, + "grad_norm": 0.0071852910332381725, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 278 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2953525185585022, + "epoch": 0.7342105263157894, + "grad_norm": 0.006853134371340275, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 279 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
0.359375, + "entropy": 0.29329265654087067, + "epoch": 0.7368421052631579, + "grad_norm": 0.012452336959540844, + "learning_rate": 1e-06, + "loss": 0.0192, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1474.0, + "completions/max_terminated_length": 1474.0, + "completions/mean_length": 508.8125, + "completions/mean_terminated_length": 508.8125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.298539400100708, + "epoch": 0.7394736842105263, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.015085075981914997, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 115887923.0, + "reward": 0.4686523675918579, + "reward_std": 0.20140579342842102, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.3984375, + "rewards/symbolic_reward_accuracy/std": 0.4900552034378052, + "rewards/symbolic_reward_partial_score/mean": 0.7652994394302368, + "rewards/symbolic_reward_partial_score/std": 0.2437170445919037, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0764374732971191, + "sampling/importance_sampling_ratio/min": 0.0012120020110160112, + "sampling/sampling_logp_difference/max": 6.715481758117676, + "sampling/sampling_logp_difference/mean": 0.14303654432296753, + "step": 281 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3021211475133896, + "epoch": 0.7421052631578947, + "grad_norm": 0.011208699084818363, + "learning_rate": 1e-06, + "loss": 0.0015, + "step": 282 + }, + { + 
"clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.3050876259803772, + "epoch": 0.7447368421052631, + "grad_norm": 0.010669737122952938, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 283 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.2993925213813782, + "epoch": 0.7473684210526316, + "grad_norm": 0.00823670532554388, + "learning_rate": 1e-06, + "loss": 0.0015, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1084.0, + "completions/mean_length": 520.505859375, + "completions/mean_terminated_length": 489.4618225097656, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.31090834736824036, + "epoch": 0.75, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.010960198938846588, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 116568758.0, + "reward": 0.4624403119087219, + "reward_std": 0.16659079492092133, + "rewards/progression_diversity/mean": -0.0010909016709774733, + "rewards/progression_diversity/std": 0.02468428760766983, + "rewards/symbolic_reward_accuracy/mean": 0.39453125, + "rewards/symbolic_reward_accuracy/std": 0.4892277717590332, + "rewards/symbolic_reward_partial_score/mean": 0.7530924677848816, + "rewards/symbolic_reward_partial_score/std": 0.25413239002227783, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.077139139175415, + 
"sampling/importance_sampling_ratio/min": 0.0007212276686914265, + "sampling/sampling_logp_difference/max": 7.234555721282959, + "sampling/sampling_logp_difference/mean": 0.14307495951652527, + "step": 285 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.30131247639656067, + "epoch": 0.7526315789473684, + "grad_norm": 0.008542075753211975, + "learning_rate": 1e-06, + "loss": -0.0027, + "step": 286 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.30619797110557556, + "epoch": 0.7552631578947369, + "grad_norm": 0.010217522270977497, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 287 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2946513742208481, + "epoch": 0.7578947368421053, + "grad_norm": 0.008310631848871708, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 567.96875, + "completions/mean_terminated_length": 474.7505187988281, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.305755078792572, + "epoch": 0.7605263157894737, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.012845570221543312, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 117272710.0, + "reward": 0.5138547420501709, + "reward_std": 0.18507125973701477, + 
"rewards/progression_diversity/mean": -0.0012514127884060144, + "rewards/progression_diversity/std": 0.020418958738446236, + "rewards/symbolic_reward_accuracy/mean": 0.462890625, + "rewards/symbolic_reward_accuracy/std": 0.4991086423397064, + "rewards/symbolic_reward_partial_score/mean": 0.7884114980697632, + "rewards/symbolic_reward_partial_score/std": 0.24423162639141083, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.073378086090088, + "sampling/importance_sampling_ratio/min": 0.0019662980921566486, + "sampling/sampling_logp_difference/max": 6.231602668762207, + "sampling/sampling_logp_difference/mean": 0.13790735602378845, + "step": 289 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2955671697854996, + "epoch": 0.7631578947368421, + "grad_norm": 0.007032850757241249, + "learning_rate": 1e-06, + "loss": 0.0306, + "step": 290 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.3038811683654785, + "epoch": 0.7657894736842106, + "grad_norm": 0.012003585696220398, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 291 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3101692795753479, + "epoch": 0.7684210526315789, + "grad_norm": 0.006113457027822733, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 1499.0, + "completions/max_terminated_length": 1499.0, + "completions/mean_length": 483.70703125, + "completions/mean_terminated_length": 483.70703125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.3106113076210022, + "epoch": 0.7710526315789473, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.011551190167665482, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 117922192.0, + "reward": 0.4702148735523224, + "reward_std": 0.16088829934597015, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.39453125, + "rewards/symbolic_reward_accuracy/std": 0.4892277717590332, + "rewards/symbolic_reward_partial_score/mean": 0.7783203125, + "rewards/symbolic_reward_partial_score/std": 0.23153917491436005, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0782867670059204, + "sampling/importance_sampling_ratio/min": 5.172830242372584e-06, + "sampling/sampling_logp_difference/max": 12.172090530395508, + "sampling/sampling_logp_difference/mean": 0.14682671427726746, + "step": 293 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.3070777803659439, + "epoch": 0.7736842105263158, + "grad_norm": 0.014432739466428757, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 294 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.30722641944885254, + "epoch": 0.7763157894736842, + "grad_norm": 0.007676142267882824, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 295 + }, + { + 
"clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3001379519701004, + "epoch": 0.7789473684210526, + "grad_norm": 0.007182000204920769, + "learning_rate": 1e-06, + "loss": 0.0028, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 536.740234375, + "completions/mean_terminated_length": 474.5941467285156, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.3064696937799454, + "epoch": 0.781578947368421, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.011220471933484077, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 118605643.0, + "reward": 0.5557116270065308, + "reward_std": 0.202279195189476, + "rewards/progression_diversity/mean": -0.00012732444156426936, + "rewards/progression_diversity/std": 0.002881023334339261, + "rewards/symbolic_reward_accuracy/mean": 0.51953125, + "rewards/symbolic_reward_accuracy/std": 0.5001069903373718, + "rewards/symbolic_reward_partial_score/mean": 0.8146158456802368, + "rewards/symbolic_reward_partial_score/std": 0.24289114773273468, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.074270248413086, + "sampling/importance_sampling_ratio/min": 0.0009082350297830999, + "sampling/sampling_logp_difference/max": 7.004007339477539, + "sampling/sampling_logp_difference/mean": 0.13980567455291748, + "step": 297 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1953125, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3012381047010422, + "epoch": 0.7842105263157895, + "grad_norm": 0.006437814328819513, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 298 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.3034280389547348, + "epoch": 0.7868421052631579, + "grad_norm": 0.009115688502788544, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 299 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.2999880909919739, + "epoch": 0.7894736842105263, + "grad_norm": 0.008743058890104294, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 982.0, + "completions/mean_length": 504.83984375, + "completions/mean_terminated_length": 473.7651672363281, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.30167509615421295, + "epoch": 0.7921052631578948, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.013987814076244831, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 119287961.0, + "reward": 0.5052167177200317, + "reward_std": 0.21000981330871582, + "rewards/progression_diversity/mean": -0.0007912339060567319, + "rewards/progression_diversity/std": 0.017903579398989677, + "rewards/symbolic_reward_accuracy/mean": 0.44921875, + "rewards/symbolic_reward_accuracy/std": 0.497901052236557, + "rewards/symbolic_reward_partial_score/mean": 0.7862955927848816, + 
"rewards/symbolic_reward_partial_score/std": 0.24358125030994415, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0761265754699707, + "sampling/importance_sampling_ratio/min": 0.0016627967124804854, + "sampling/sampling_logp_difference/max": 6.399254322052002, + "sampling/sampling_logp_difference/mean": 0.14250260591506958, + "step": 301 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.30729566514492035, + "epoch": 0.7947368421052632, + "grad_norm": 0.011272015050053596, + "learning_rate": 1e-06, + "loss": -0.0021, + "step": 302 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.30154599249362946, + "epoch": 0.7973684210526316, + "grad_norm": 0.007906567305326462, + "learning_rate": 1e-06, + "loss": 0.0129, + "step": 303 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.30861087143421173, + "epoch": 0.8, + "grad_norm": 0.009575615637004375, + "learning_rate": 1e-06, + "loss": 0.0031, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 961.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 475.01171875, + "completions/mean_terminated_length": 475.01171875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.3128325790166855, + "epoch": 
0.8026315789473685, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.011371531523764133, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 119939647.0, + "reward": 0.5709472894668579, + "reward_std": 0.1911071240901947, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.541015625, + "rewards/symbolic_reward_accuracy/std": 0.49880221486091614, + "rewards/symbolic_reward_partial_score/mean": 0.8211263418197632, + "rewards/symbolic_reward_partial_score/std": 0.22490404546260834, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0794188976287842, + "sampling/importance_sampling_ratio/min": 0.0021937647834420204, + "sampling/sampling_logp_difference/max": 6.122136116027832, + "sampling/sampling_logp_difference/mean": 0.14725090563297272, + "step": 305 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.30579154193401337, + "epoch": 0.8052631578947368, + "grad_norm": 0.007889126427471638, + "learning_rate": 1e-06, + "loss": 0.0028, + "step": 306 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.3117325156927109, + "epoch": 0.8078947368421052, + "grad_norm": 0.008403713814914227, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 307 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3067573755979538, + "epoch": 0.8105263157894737, + "grad_norm": 0.011215373873710632, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 308 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1003.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 470.318359375, + "completions/mean_terminated_length": 470.318359375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.3116874247789383, + "epoch": 0.8131578947368421, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.010915424674749374, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 120571586.0, + "reward": 0.59326171875, + "reward_std": 0.15734408795833588, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.5703125, + "rewards/symbolic_reward_accuracy/std": 0.4955156147480011, + "rewards/symbolic_reward_partial_score/mean": 0.8369140625, + "rewards/symbolic_reward_partial_score/std": 0.21129031479358673, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0776973962783813, + "sampling/importance_sampling_ratio/min": 7.561699021607637e-06, + "sampling/sampling_logp_difference/max": 11.792414665222168, + "sampling/sampling_logp_difference/mean": 0.1471216082572937, + "step": 309 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3030817359685898, + "epoch": 0.8157894736842105, + "grad_norm": 0.005430555436760187, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 310 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 
0.3055662661790848, + "epoch": 0.8184210526315789, + "grad_norm": 0.007991395890712738, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 311 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3136454224586487, + "epoch": 0.8210526315789474, + "grad_norm": 0.008103788830339909, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 502.697265625, + "completions/mean_terminated_length": 471.6183776855469, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.3072304129600525, + "epoch": 0.8236842105263158, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.017745062708854675, + "learning_rate": 1e-06, + "loss": 0.0085, + "num_tokens": 121238951.0, + "reward": 0.5021483898162842, + "reward_std": 0.20283925533294678, + "rewards/progression_diversity/mean": -5.9837475419044495e-06, + "rewards/progression_diversity/std": 0.00013539673818740994, + "rewards/symbolic_reward_accuracy/mean": 0.443359375, + "rewards/symbolic_reward_accuracy/std": 0.49726733565330505, + "rewards/symbolic_reward_partial_score/mean": 0.7877604365348816, + "rewards/symbolic_reward_partial_score/std": 0.2305394411087036, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0763132572174072, + "sampling/importance_sampling_ratio/min": 0.00018456965335644782, + "sampling/sampling_logp_difference/max": 8.59748363494873, + 
"sampling/sampling_logp_difference/mean": 0.14485155045986176, + "step": 313 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.30788813531398773, + "epoch": 0.8263157894736842, + "grad_norm": 0.008959567174315453, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 314 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.30733516812324524, + "epoch": 0.8289473684210527, + "grad_norm": 0.008335716091096401, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 315 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.30802886188030243, + "epoch": 0.8315789473684211, + "grad_norm": 0.007674569729715586, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 902.0, + "completions/max_terminated_length": 902.0, + "completions/mean_length": 470.5078125, + "completions/mean_terminated_length": 470.5078125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.3161182999610901, + "epoch": 0.8342105263157895, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.015065652318298817, + "learning_rate": 1e-06, + "loss": -0.0028, + "num_tokens": 121902347.0, + "reward": 0.4985351860523224, + "reward_std": 0.1782730519771576, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.43359375, + 
"rewards/symbolic_reward_accuracy/std": 0.4960552453994751, + "rewards/symbolic_reward_partial_score/mean": 0.7945963144302368, + "rewards/symbolic_reward_partial_score/std": 0.22836259007453918, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0780564546585083, + "sampling/importance_sampling_ratio/min": 3.911755629815161e-05, + "sampling/sampling_logp_difference/max": 10.14893913269043, + "sampling/sampling_logp_difference/mean": 0.14812889695167542, + "step": 317 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.3078267425298691, + "epoch": 0.8368421052631579, + "grad_norm": 0.006529625505208969, + "learning_rate": 1e-06, + "loss": 0.0029, + "step": 318 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.3183140605688095, + "epoch": 0.8394736842105263, + "grad_norm": 0.005689030978828669, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 319 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.30473969876766205, + "epoch": 0.8421052631578947, + "grad_norm": 0.013739674352109432, + "learning_rate": 1e-06, + "loss": 0.0024, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1085.0, + "completions/mean_length": 588.193359375, + "completions/mean_terminated_length": 463.8169250488281, + 
"completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "entropy": 0.3014228492975235, + "epoch": 0.8447368421052631, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.011865752749145031, + "learning_rate": 1e-06, + "loss": 0.0318, + "num_tokens": 122591950.0, + "reward": 0.631678581237793, + "reward_std": 0.19821570813655853, + "rewards/progression_diversity/mean": -0.001093248138204217, + "rewards/progression_diversity/std": 0.02473738044500351, + "rewards/symbolic_reward_accuracy/mean": 0.625, + "rewards/symbolic_reward_accuracy/std": 0.4845963716506958, + "rewards/symbolic_reward_partial_score/mean": 0.8582357168197632, + "rewards/symbolic_reward_partial_score/std": 0.21694619953632355, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0716766119003296, + "sampling/importance_sampling_ratio/min": 0.0003383133444003761, + "sampling/sampling_logp_difference/max": 7.991538047790527, + "sampling/sampling_logp_difference/mean": 0.13535773754119873, + "step": 321 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.3073635548353195, + "epoch": 0.8473684210526315, + "grad_norm": 0.011415022425353527, + "learning_rate": 1e-06, + "loss": 0.0218, + "step": 322 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.31290285289287567, + "epoch": 0.85, + "grad_norm": 0.007157693617045879, + "learning_rate": 1e-06, + "loss": -0.0026, + "step": 323 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + 
"entropy": 0.3036612421274185, + "epoch": 0.8526315789473684, + "grad_norm": 0.01231673825532198, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1180.0, + "completions/mean_length": 493.33984375, + "completions/mean_terminated_length": 462.2426452636719, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.3125806152820587, + "epoch": 0.8552631578947368, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.013022052124142647, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 123257532.0, + "reward": 0.5332888960838318, + "reward_std": 0.1853906512260437, + "rewards/progression_diversity/mean": -0.0011929721804335713, + "rewards/progression_diversity/std": 0.026993878185749054, + "rewards/symbolic_reward_accuracy/mean": 0.4921875, + "rewards/symbolic_reward_accuracy/std": 0.5004279017448425, + "rewards/symbolic_reward_partial_score/mean": 0.7939453125, + "rewards/symbolic_reward_partial_score/std": 0.2435741126537323, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0774052143096924, + "sampling/importance_sampling_ratio/min": 0.0012375212972983718, + "sampling/sampling_logp_difference/max": 6.694644927978516, + "sampling/sampling_logp_difference/mean": 0.14501085877418518, + "step": 325 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.3106594383716583, + "epoch": 0.8578947368421053, + "grad_norm": 0.005496948957443237, + "learning_rate": 
1e-06, + "loss": -0.0007, + "step": 326 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3180868923664093, + "epoch": 0.8605263157894737, + "grad_norm": 0.007096852641552687, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 327 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.30386729538440704, + "epoch": 0.8631578947368421, + "grad_norm": 0.009813301265239716, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1200.0, + "completions/mean_length": 504.208984375, + "completions/mean_terminated_length": 473.133056640625, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.319959819316864, + "epoch": 0.8657894736842106, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.012554957531392574, + "learning_rate": 1e-06, + "loss": -0.0027, + "num_tokens": 123914823.0, + "reward": 0.5417969226837158, + "reward_std": 0.17694982886314392, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.5, + "rewards/symbolic_reward_accuracy/std": 0.5004889965057373, + "rewards/symbolic_reward_partial_score/mean": 0.806640625, + "rewards/symbolic_reward_partial_score/std": 0.24059468507766724, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
1.078836441040039, + "sampling/importance_sampling_ratio/min": 5.7318637118441984e-05, + "sampling/sampling_logp_difference/max": 9.766884803771973, + "sampling/sampling_logp_difference/mean": 0.14754198491573334, + "step": 329 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.3119961619377136, + "epoch": 0.868421052631579, + "grad_norm": 0.012830966152250767, + "learning_rate": 1e-06, + "loss": 0.0273, + "step": 330 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.32057128846645355, + "epoch": 0.8710526315789474, + "grad_norm": 0.008462558500468731, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 331 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.3068248778581619, + "epoch": 0.8736842105263158, + "grad_norm": 0.009940098971128464, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1060.0, + "completions/mean_length": 487.529296875, + "completions/mean_terminated_length": 456.4207458496094, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.314155712723732, + "epoch": 0.8763157894736842, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.015469806268811226, + "learning_rate": 1e-06, + "loss": 0.0046, + "num_tokens": 124568246.0, + "reward": 0.5467255115509033, + "reward_std": 0.22133252024650574, + 
"rewards/progression_diversity/mean": -0.0002992129884660244, + "rewards/progression_diversity/std": 0.0067704166285693645, + "rewards/symbolic_reward_accuracy/mean": 0.50390625, + "rewards/symbolic_reward_accuracy/std": 0.5004737377166748, + "rewards/symbolic_reward_partial_score/mean": 0.8152669668197632, + "rewards/symbolic_reward_partial_score/std": 0.22244098782539368, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.076984167098999, + "sampling/importance_sampling_ratio/min": 0.0020317258313298225, + "sampling/sampling_logp_difference/max": 6.198869705200195, + "sampling/sampling_logp_difference/mean": 0.14370480179786682, + "step": 333 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.31249886751174927, + "epoch": 0.8789473684210526, + "grad_norm": 0.009735578671097755, + "learning_rate": 1e-06, + "loss": 0.0035, + "step": 334 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.30799588561058044, + "epoch": 0.881578947368421, + "grad_norm": 0.008018133230507374, + "learning_rate": 1e-06, + "loss": 0.0031, + "step": 335 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.30934616923332214, + "epoch": 0.8842105263157894, + "grad_norm": 0.00663616880774498, + "learning_rate": 1e-06, + "loss": -0.002, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 1090.0, + "completions/max_terminated_length": 1090.0, + "completions/mean_length": 471.818359375, + "completions/mean_terminated_length": 471.818359375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.3062121123075485, + "epoch": 0.8868421052631579, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.009202565997838974, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 125208089.0, + "reward": 0.6447754502296448, + "reward_std": 0.17742466926574707, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.640625, + "rewards/symbolic_reward_accuracy/std": 0.48028653860092163, + "rewards/symbolic_reward_partial_score/mean": 0.8680013418197632, + "rewards/symbolic_reward_partial_score/std": 0.21553559601306915, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0774188041687012, + "sampling/importance_sampling_ratio/min": 2.0920735551044345e-05, + "sampling/sampling_logp_difference/max": 10.77476978302002, + "sampling/sampling_logp_difference/mean": 0.1450258493423462, + "step": 337 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3057921230792999, + "epoch": 0.8894736842105263, + "grad_norm": 0.008431348949670792, + "learning_rate": 1e-06, + "loss": 0.0051, + "step": 338 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3095565140247345, + "epoch": 0.8921052631578947, + "grad_norm": 0.005716415587812662, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 339 + }, + { + 
"clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.3105488568544388, + "epoch": 0.8947368421052632, + "grad_norm": 0.0054482058621943, + "learning_rate": 1e-06, + "loss": -0.0037, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1476.0, + "completions/mean_length": 529.880859375, + "completions/mean_terminated_length": 467.7078857421875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.3118719011545181, + "epoch": 0.8973684210526316, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.014669405296444893, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 125877020.0, + "reward": 0.533341646194458, + "reward_std": 0.16296130418777466, + "rewards/progression_diversity/mean": -0.000796740350779146, + "rewards/progression_diversity/std": 0.018028177320957184, + "rewards/symbolic_reward_accuracy/mean": 0.482421875, + "rewards/symbolic_reward_accuracy/std": 0.5001795887947083, + "rewards/symbolic_reward_partial_score/mean": 0.81494140625, + "rewards/symbolic_reward_partial_score/std": 0.22069676220417023, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0750058889389038, + "sampling/importance_sampling_ratio/min": 0.000702597841154784, + "sampling/sampling_logp_difference/max": 7.260725975036621, + "sampling/sampling_logp_difference/mean": 0.14171919226646423, + "step": 341 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.171875, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.307451456785202, + "epoch": 0.9, + "grad_norm": 0.01112651638686657, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 342 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.31406456232070923, + "epoch": 0.9026315789473685, + "grad_norm": 0.010442834347486496, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 343 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.30461055040359497, + "epoch": 0.9052631578947369, + "grad_norm": 0.011064223945140839, + "learning_rate": 1e-06, + "loss": 0.0225, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 937.0, + "completions/mean_length": 537.166015625, + "completions/mean_terminated_length": 475.0216064453125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.3110172599554062, + "epoch": 0.9078947368421053, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.011030428111553192, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 126580721.0, + "reward": 0.48596975207328796, + "reward_std": 0.14998123049736023, + "rewards/progression_diversity/mean": -0.0016597331268712878, + "rewards/progression_diversity/std": 0.03166636824607849, + "rewards/symbolic_reward_accuracy/mean": 0.41015625, + "rewards/symbolic_reward_accuracy/std": 0.49234291911125183, + "rewards/symbolic_reward_partial_score/mean": 0.8009440302848816, + "rewards/symbolic_reward_partial_score/std": 
0.20570434629917145, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.075448751449585, + "sampling/importance_sampling_ratio/min": 2.118588236044161e-05, + "sampling/sampling_logp_difference/max": 10.762175559997559, + "sampling/sampling_logp_difference/mean": 0.14029094576835632, + "step": 345 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3086661100387573, + "epoch": 0.9105263157894737, + "grad_norm": 0.011540532112121582, + "learning_rate": 1e-06, + "loss": 0.005, + "step": 346 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2966995984315872, + "epoch": 0.9131578947368421, + "grad_norm": 0.01433405838906765, + "learning_rate": 1e-06, + "loss": 0.0187, + "step": 347 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.31083013117313385, + "epoch": 0.9157894736842105, + "grad_norm": 0.008436622098088264, + "learning_rate": 1e-06, + "loss": -0.0028, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1030.0, + "completions/max_terminated_length": 1030.0, + "completions/mean_length": 480.43359375, + "completions/mean_terminated_length": 480.43359375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "entropy": 0.309502974152565, + "epoch": 0.9184210526315789, + 
"frac_reward_zero_std": 0.21875, + "grad_norm": 0.011041209101676941, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 127224495.0, + "reward": 0.5547363758087158, + "reward_std": 0.1895267367362976, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.5078125, + "rewards/symbolic_reward_accuracy/std": 0.5004279017448425, + "rewards/symbolic_reward_partial_score/mean": 0.83349609375, + "rewards/symbolic_reward_partial_score/std": 0.2128904014825821, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0771770477294922, + "sampling/importance_sampling_ratio/min": 3.5030089406973275e-07, + "sampling/sampling_logp_difference/max": 14.864473342895508, + "sampling/sampling_logp_difference/mean": 0.1465146243572235, + "step": 349 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.31064388155937195, + "epoch": 0.9210526315789473, + "grad_norm": 0.00748514523729682, + "learning_rate": 1e-06, + "loss": 0.002, + "step": 350 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.3066399097442627, + "epoch": 0.9236842105263158, + "grad_norm": 0.010583535768091679, + "learning_rate": 1e-06, + "loss": -0.0031, + "step": 351 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.3035551458597183, + "epoch": 0.9263157894736842, + "grad_norm": 0.006878511048853397, + "learning_rate": 1e-06, + "loss": 0.0023, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, 
+ "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1075.0, + "completions/max_terminated_length": 1075.0, + "completions/mean_length": 486.05078125, + "completions/mean_terminated_length": 486.05078125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.3168664276599884, + "epoch": 0.9289473684210526, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.01184882689267397, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 127881993.0, + "reward": 0.5418945550918579, + "reward_std": 0.17704787850379944, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.490234375, + "rewards/symbolic_reward_accuracy/std": 0.5003935098648071, + "rewards/symbolic_reward_partial_score/mean": 0.8264973759651184, + "rewards/symbolic_reward_partial_score/std": 0.21585585176944733, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0782511234283447, + "sampling/importance_sampling_ratio/min": 0.00025796302361413836, + "sampling/sampling_logp_difference/max": 8.262694358825684, + "sampling/sampling_logp_difference/mean": 0.14865058660507202, + "step": 353 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.30700162053108215, + "epoch": 0.9315789473684211, + "grad_norm": 0.005886297207325697, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 354 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 
0.3129795789718628, + "epoch": 0.9342105263157895, + "grad_norm": 0.01028487179428339, + "learning_rate": 1e-06, + "loss": 0.0019, + "step": 355 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3170662522315979, + "epoch": 0.9368421052631579, + "grad_norm": 0.007761337794363499, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1110.0, + "completions/mean_length": 545.998046875, + "completions/mean_terminated_length": 483.8882751464844, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.30055612325668335, + "epoch": 0.9394736842105263, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.012874174863100052, + "learning_rate": 1e-06, + "loss": 0.0112, + "num_tokens": 128565512.0, + "reward": 0.6208984851837158, + "reward_std": 0.15273459255695343, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.609375, + "rewards/symbolic_reward_accuracy/std": 0.48836761713027954, + "rewards/symbolic_reward_partial_score/mean": 0.8522135615348816, + "rewards/symbolic_reward_partial_score/std": 0.22056303918361664, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.075897216796875, + "sampling/importance_sampling_ratio/min": 0.0001214630319736898, + "sampling/sampling_logp_difference/max": 9.015900611877441, + "sampling/sampling_logp_difference/mean": 0.14393819868564606, + "step": 357 + 
}, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.31270015239715576, + "epoch": 0.9421052631578948, + "grad_norm": 0.007534582633525133, + "learning_rate": 1e-06, + "loss": -0.0021, + "step": 358 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.3103422075510025, + "epoch": 0.9447368421052632, + "grad_norm": 0.008300635032355785, + "learning_rate": 1e-06, + "loss": 0.0024, + "step": 359 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.31079424917697906, + "epoch": 0.9473684210526315, + "grad_norm": 0.007545569911599159, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1127.0, + "completions/mean_length": 517.212890625, + "completions/mean_terminated_length": 486.16241455078125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "entropy": 0.30875363945961, + "epoch": 0.95, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.007635221816599369, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 129216469.0, + "reward": 0.5335408449172974, + "reward_std": 0.1456352025270462, + "rewards/progression_diversity/mean": -0.00040900660678744316, + "rewards/progression_diversity/std": 0.009254762902855873, + "rewards/symbolic_reward_accuracy/mean": 0.474609375, + "rewards/symbolic_reward_accuracy/std": 0.4998432695865631, + 
"rewards/symbolic_reward_partial_score/mean": 0.8299154043197632, + "rewards/symbolic_reward_partial_score/std": 0.20124566555023193, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.076637864112854, + "sampling/importance_sampling_ratio/min": 1.8904121361629223e-06, + "sampling/sampling_logp_difference/max": 13.178715705871582, + "sampling/sampling_logp_difference/mean": 0.14335882663726807, + "step": 361 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.30543169379234314, + "epoch": 0.9526315789473684, + "grad_norm": 0.010882689617574215, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 362 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3083820194005966, + "epoch": 0.9552631578947368, + "grad_norm": 0.01051326934248209, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 363 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.30124035477638245, + "epoch": 0.9578947368421052, + "grad_norm": 0.006582474801689386, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1003.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 473.248046875, + "completions/mean_terminated_length": 473.248046875, + "completions/min_length": 185.0, + 
"completions/min_terminated_length": 185.0, + "entropy": 0.30113451182842255, + "epoch": 0.9605263157894737, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.01765936240553856, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 129851252.0, + "reward": 0.562792956829071, + "reward_std": 0.1252935379743576, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.5234375, + "rewards/symbolic_reward_accuracy/std": 0.49993884563446045, + "rewards/symbolic_reward_partial_score/mean": 0.8291015625, + "rewards/symbolic_reward_partial_score/std": 0.22126927971839905, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0775682926177979, + "sampling/importance_sampling_ratio/min": 0.0008246903889812529, + "sampling/sampling_logp_difference/max": 7.1005024909973145, + "sampling/sampling_logp_difference/mean": 0.14582598209381104, + "step": 365 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.3071126341819763, + "epoch": 0.9631578947368421, + "grad_norm": 0.009068193845450878, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 366 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.30289457738399506, + "epoch": 0.9657894736842105, + "grad_norm": 0.004140099510550499, + "learning_rate": 1e-06, + "loss": -0.0028, + "step": 367 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3062494695186615, + "epoch": 0.968421052631579, + "grad_norm": 
0.007102786097675562, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1151.0, + "completions/max_terminated_length": 1151.0, + "completions/mean_length": 469.71484375, + "completions/mean_terminated_length": 469.71484375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.3085251748561859, + "epoch": 0.9710526315789474, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.010728326626121998, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 130498562.0, + "reward": 0.5833496451377869, + "reward_std": 0.17793656885623932, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.546875, + "rewards/symbolic_reward_accuracy/std": 0.4982847273349762, + "rewards/symbolic_reward_partial_score/mean": 0.8507487177848816, + "rewards/symbolic_reward_partial_score/std": 0.19178888201713562, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0768487453460693, + "sampling/importance_sampling_ratio/min": 0.0007275677635334432, + "sampling/sampling_logp_difference/max": 7.225803375244141, + "sampling/sampling_logp_difference/mean": 0.1458495855331421, + "step": 369 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.30653318762779236, + "epoch": 0.9736842105263158, + "grad_norm": 0.008490893058478832, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 370 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 
0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.30405572056770325, + "epoch": 0.9763157894736842, + "grad_norm": 0.0068202815018594265, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 371 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.30216623842716217, + "epoch": 0.9789473684210527, + "grad_norm": 0.013265615329146385, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1051.0, + "completions/mean_length": 503.1953125, + "completions/mean_terminated_length": 472.1174011230469, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.30290573835372925, + "epoch": 0.9815789473684211, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.014201073907315731, + "learning_rate": 1e-06, + "loss": 0.0024, + "num_tokens": 131153990.0, + "reward": 0.6204589605331421, + "reward_std": 0.204176664352417, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.6015625, + "rewards/symbolic_reward_accuracy/std": 0.4900552034378052, + "rewards/symbolic_reward_partial_score/mean": 0.86572265625, + "rewards/symbolic_reward_partial_score/std": 0.19912473857402802, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0761308670043945, + "sampling/importance_sampling_ratio/min": 0.0017201791051775217, + "sampling/sampling_logp_difference/max": 
6.365326881408691, + "sampling/sampling_logp_difference/mean": 0.1428317129611969, + "step": 373 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3077033460140228, + "epoch": 0.9842105263157894, + "grad_norm": 0.007973029278218746, + "learning_rate": 1e-06, + "loss": 0.0029, + "step": 374 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2994297742843628, + "epoch": 0.9868421052631579, + "grad_norm": 0.006763116456568241, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 375 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.304544135928154, + "epoch": 0.9894736842105263, + "grad_norm": 0.007952687330543995, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 949.0, + "completions/max_terminated_length": 949.0, + "completions/mean_length": 471.927734375, + "completions/mean_terminated_length": 471.927734375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.3087266832590103, + "epoch": 0.9921052631578947, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.011059445329010487, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 131803233.0, + "reward": 0.6175293326377869, + "reward_std": 0.15380731225013733, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.59765625, + 
"rewards/symbolic_reward_accuracy/std": 0.4908501207828522, + "rewards/symbolic_reward_partial_score/mean": 0.8631184697151184, + "rewards/symbolic_reward_partial_score/std": 0.20576836168766022, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.076733112335205, + "sampling/importance_sampling_ratio/min": 0.00020886259153485298, + "sampling/sampling_logp_difference/max": 8.473834037780762, + "sampling/sampling_logp_difference/mean": 0.14535820484161377, + "step": 377 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.30368588864803314, + "epoch": 0.9947368421052631, + "grad_norm": 0.007541055791079998, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 378 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.30102120339870453, + "epoch": 0.9973684210526316, + "grad_norm": 0.008830598555505276, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 379 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.30921386182308197, + "epoch": 1.0, + "grad_norm": 0.008218697272241116, + "learning_rate": 1e-06, + "loss": 0.0027, + "step": 380 + }, + { + "epoch": 1.0, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.002685546875, + "eval_completions/max_length": 4941.03125, + "eval_completions/max_terminated_length": 1225.71875, + "eval_completions/mean_length": 
509.00732421875, + "eval_completions/mean_terminated_length": 466.2677412033081, + "eval_completions/min_length": 191.28125, + "eval_completions/min_terminated_length": 191.28125, + "eval_entropy": 0.30122990906238556, + "eval_frac_reward_zero_std": 0.265625, + "eval_loss": 0.00210479530505836, + "eval_num_tokens": 131803233.0, + "eval_reward": 0.6341025996953249, + "eval_reward_std": 0.17670447006821632, + "eval_rewards/progression_diversity/mean": -0.0003870166310662171, + "eval_rewards/progression_diversity/std": 0.003330339086460299, + "eval_rewards/symbolic_reward_accuracy/mean": 0.62890625, + "eval_rewards/symbolic_reward_accuracy/std": 0.46045348327606916, + "eval_rewards/symbolic_reward_partial_score/mean": 0.8587239664047956, + "eval_rewards/symbolic_reward_partial_score/std": 0.20982443122193217, + "eval_rewards/tag_count_reward/mean": -0.008544921875, + "eval_rewards/tag_count_reward/std": 0.05361618706956506, + "eval_runtime": 573.6785, + "eval_samples_per_second": 0.436, + "eval_sampling/importance_sampling_ratio/max": 2.0, + "eval_sampling/importance_sampling_ratio/mean": 1.0764745026826859, + "eval_sampling/importance_sampling_ratio/min": 0.0018949521987123776, + "eval_sampling/sampling_logp_difference/max": 19.122781857848167, + "eval_sampling/sampling_logp_difference/mean": 0.14713718881830573, + "eval_steps_per_second": 0.003, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 897.0, + "completions/max_terminated_length": 897.0, + "completions/mean_length": 453.083984375, + "completions/mean_terminated_length": 453.083984375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.31483590602874756, + "epoch": 1.0026315789473683, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.011226167902350426, + "learning_rate": 
1e-06, + "loss": 0.0017, + "num_tokens": 132457548.0, + "reward": 0.536572277545929, + "reward_std": 0.17151208221912384, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.484375, + "rewards/symbolic_reward_accuracy/std": 0.5002445578575134, + "rewards/symbolic_reward_partial_score/mean": 0.81982421875, + "rewards/symbolic_reward_partial_score/std": 0.2147509604692459, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0785313844680786, + "sampling/importance_sampling_ratio/min": 4.000367334811017e-05, + "sampling/sampling_logp_difference/max": 10.12653923034668, + "sampling/sampling_logp_difference/mean": 0.1462843418121338, + "step": 381 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.30385373532772064, + "epoch": 1.0052631578947369, + "grad_norm": 0.007102385628968477, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 382 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.3114987909793854, + "epoch": 1.0078947368421052, + "grad_norm": 0.010028541088104248, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 383 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.30750955641269684, + "epoch": 1.0105263157894737, + "grad_norm": 0.007849895395338535, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, 
+ "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 487.294921875, + "completions/mean_terminated_length": 456.1859130859375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "entropy": 0.30419932305812836, + "epoch": 1.013157894736842, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.00997752696275711, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 133121859.0, + "reward": 0.5177246332168579, + "reward_std": 0.16577917337417603, + "rewards/progression_diversity/mean": -1.0128132998943329e-07, + "rewards/progression_diversity/std": 2.2917349724593805e-06, + "rewards/symbolic_reward_accuracy/mean": 0.453125, + "rewards/symbolic_reward_accuracy/std": 0.4982847273349762, + "rewards/symbolic_reward_partial_score/mean": 0.8201497793197632, + "rewards/symbolic_reward_partial_score/std": 0.19332556426525116, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0750830173492432, + "sampling/importance_sampling_ratio/min": 3.3023450669134036e-05, + "sampling/sampling_logp_difference/max": 10.318292617797852, + "sampling/sampling_logp_difference/mean": 0.14270050823688507, + "step": 385 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.301358163356781, + "epoch": 1.0157894736842106, + "grad_norm": 0.007478445768356323, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 386 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.3112320601940155, + "epoch": 
1.018421052631579, + "grad_norm": 0.01002897322177887, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 387 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2982504069805145, + "epoch": 1.0210526315789474, + "grad_norm": 0.008274735882878304, + "learning_rate": 1e-06, + "loss": 0.03, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 891.0, + "completions/max_terminated_length": 891.0, + "completions/mean_length": 446.642578125, + "completions/mean_terminated_length": 446.642578125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.30985428392887115, + "epoch": 1.0236842105263158, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.007278556935489178, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 133741996.0, + "reward": 0.6395508050918579, + "reward_std": 0.09756526350975037, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.625, + "rewards/symbolic_reward_accuracy/std": 0.4845963716506958, + "rewards/symbolic_reward_partial_score/mean": 0.8818359375, + "rewards/symbolic_reward_partial_score/std": 0.18144503235816956, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0790255069732666, + "sampling/importance_sampling_ratio/min": 0.0008311926503665745, + "sampling/sampling_logp_difference/max": 7.092648983001709, + "sampling/sampling_logp_difference/mean": 0.1477079838514328, + "step": 389 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + 
"clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3042980283498764, + "epoch": 1.0263157894736843, + "grad_norm": 0.009012778289616108, + "learning_rate": 1e-06, + "loss": 0.0034, + "step": 390 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.313108429312706, + "epoch": 1.0289473684210526, + "grad_norm": 0.00844503566622734, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 391 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3174958974123001, + "epoch": 1.0315789473684212, + "grad_norm": 0.01085878349840641, + "learning_rate": 1e-06, + "loss": -0.0024, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 441.478515625, + "completions/mean_terminated_length": 441.478515625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.31333790719509125, + "epoch": 1.0342105263157895, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.008390745148062706, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 134371841.0, + "reward": 0.5450682640075684, + "reward_std": 0.11910407245159149, + "rewards/progression_diversity/mean": -7.797831131028943e-06, + "rewards/progression_diversity/std": 0.00017644476611167192, + "rewards/symbolic_reward_accuracy/mean": 0.490234375, + "rewards/symbolic_reward_accuracy/std": 0.5003935098648071, + "rewards/symbolic_reward_partial_score/mean": 0.83642578125, + 
"rewards/symbolic_reward_partial_score/std": 0.18248093128204346, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.078395128250122, + "sampling/importance_sampling_ratio/min": 7.312164962058887e-05, + "sampling/sampling_logp_difference/max": 9.523386001586914, + "sampling/sampling_logp_difference/mean": 0.14865905046463013, + "step": 393 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.3087403327226639, + "epoch": 1.0368421052631578, + "grad_norm": 0.007888459600508213, + "learning_rate": 1e-06, + "loss": -0.0029, + "step": 394 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3131715953350067, + "epoch": 1.0394736842105263, + "grad_norm": 0.008880356326699257, + "learning_rate": 1e-06, + "loss": 0.0029, + "step": 395 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.3134143352508545, + "epoch": 1.0421052631578946, + "grad_norm": 0.006940594408661127, + "learning_rate": 1e-06, + "loss": 0.0036, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 829.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 442.345703125, + "completions/mean_terminated_length": 442.345703125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.3102358728647232, + "epoch": 1.0447368421052632, + 
"frac_reward_zero_std": 0.28125, + "grad_norm": 0.007839059457182884, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 134999282.0, + "reward": 0.622314453125, + "reward_std": 0.1633392572402954, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.60546875, + "rewards/symbolic_reward_accuracy/std": 0.4892277717590332, + "rewards/symbolic_reward_partial_score/mean": 0.8634440302848816, + "rewards/symbolic_reward_partial_score/std": 0.20192034542560577, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.07792067527771, + "sampling/importance_sampling_ratio/min": 6.452879915741505e-06, + "sampling/sampling_logp_difference/max": 11.950984001159668, + "sampling/sampling_logp_difference/mean": 0.14813551306724548, + "step": 397 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.31489740312099457, + "epoch": 1.0473684210526315, + "grad_norm": 0.009207435883581638, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 398 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.31071890890598297, + "epoch": 1.05, + "grad_norm": 0.006323720328509808, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 399 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.31255054473876953, + "epoch": 1.0526315789473684, + "grad_norm": 0.015162119641900063, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 425.5390625, + "completions/mean_terminated_length": 425.5390625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3116391748189926, + "epoch": 1.055263157894737, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.011498549953103065, + "learning_rate": 1e-06, + "loss": 0.0045, + "num_tokens": 135611462.0, + "reward": 0.6023924350738525, + "reward_std": 0.1811559945344925, + "rewards/progression_diversity/mean": -1.2093556506442837e-05, + "rewards/progression_diversity/std": 0.00027364594279788435, + "rewards/symbolic_reward_accuracy/mean": 0.578125, + "rewards/symbolic_reward_accuracy/std": 0.49434176087379456, + "rewards/symbolic_reward_partial_score/mean": 0.8517252802848816, + "rewards/symbolic_reward_partial_score/std": 0.20265617966651917, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.078771710395813, + "sampling/importance_sampling_ratio/min": 8.55166106816796e-08, + "sampling/sampling_logp_difference/max": 16.274555206298828, + "sampling/sampling_logp_difference/mean": 0.14964525401592255, + "step": 401 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.31484755873680115, + "epoch": 1.0578947368421052, + "grad_norm": 0.007245294749736786, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 402 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 
0.31309810280799866, + "epoch": 1.0605263157894738, + "grad_norm": 0.005314267706125975, + "learning_rate": 1e-06, + "loss": -0.0028, + "step": 403 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3142850995063782, + "epoch": 1.063157894736842, + "grad_norm": 0.012064156122505665, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1077.0, + "completions/max_terminated_length": 1077.0, + "completions/mean_length": 444.736328125, + "completions/mean_terminated_length": 444.736328125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.31040342152118683, + "epoch": 1.0657894736842106, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.014743371866643429, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 136253823.0, + "reward": 0.571484386920929, + "reward_std": 0.1715823858976364, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.537109375, + "rewards/symbolic_reward_accuracy/std": 0.4991086423397064, + "rewards/symbolic_reward_partial_score/mean": 0.8307291865348816, + "rewards/symbolic_reward_partial_score/std": 0.21525520086288452, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.07864248752594, + "sampling/importance_sampling_ratio/min": 2.170778117260852e-07, + "sampling/sampling_logp_difference/max": 15.343009948730469, + "sampling/sampling_logp_difference/mean": 0.1498403698205948, + "step": 405 + }, + { + "clip_ratio/high_max": 1.0, + 
"clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3129070848226547, + "epoch": 1.068421052631579, + "grad_norm": 0.008034562692046165, + "learning_rate": 1e-06, + "loss": -0.0028, + "step": 406 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.3151528090238571, + "epoch": 1.0710526315789473, + "grad_norm": 0.007343663834035397, + "learning_rate": 1e-06, + "loss": -0.0021, + "step": 407 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.31475599110126495, + "epoch": 1.0736842105263158, + "grad_norm": 0.009991384111344814, + "learning_rate": 1e-06, + "loss": 0.0044, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 439.943359375, + "completions/mean_terminated_length": 439.943359375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.31397488713264465, + "epoch": 1.0763157894736841, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.009094743058085442, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 136884386.0, + "reward": 0.552441418170929, + "reward_std": 0.1292879283428192, + "rewards/progression_diversity/mean": -7.611899377479858e-07, + "rewards/progression_diversity/std": 1.722376146062743e-05, + "rewards/symbolic_reward_accuracy/mean": 0.5, + "rewards/symbolic_reward_accuracy/std": 0.5004889965057373, + "rewards/symbolic_reward_partial_score/mean": 
0.8414713144302368, + "rewards/symbolic_reward_partial_score/std": 0.19328801333904266, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0790287256240845, + "sampling/importance_sampling_ratio/min": 0.0020058066584169865, + "sampling/sampling_logp_difference/max": 6.211709022521973, + "sampling/sampling_logp_difference/mean": 0.14997223019599915, + "step": 409 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.3111269772052765, + "epoch": 1.0789473684210527, + "grad_norm": 0.010494858026504517, + "learning_rate": 1e-06, + "loss": -0.0016, + "step": 410 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.31543155014514923, + "epoch": 1.081578947368421, + "grad_norm": 0.006056911777704954, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 411 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3128467947244644, + "epoch": 1.0842105263157895, + "grad_norm": 0.0047110323794186115, + "learning_rate": 1e-06, + "loss": 0.0043, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 458.69921875, + "completions/mean_terminated_length": 427.53424072265625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.3135870397090912, + 
"epoch": 1.0868421052631578, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.008776719681918621, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 137529544.0, + "reward": 0.5720604062080383, + "reward_std": 0.18176168203353882, + "rewards/progression_diversity/mean": -0.0009923388715833426, + "rewards/progression_diversity/std": 0.022347548976540565, + "rewards/symbolic_reward_accuracy/mean": 0.53515625, + "rewards/symbolic_reward_accuracy/std": 0.49925029277801514, + "rewards/symbolic_reward_partial_score/mean": 0.8372396230697632, + "rewards/symbolic_reward_partial_score/std": 0.20521557331085205, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0783798694610596, + "sampling/importance_sampling_ratio/min": 3.016177743120352e-06, + "sampling/sampling_logp_difference/max": 12.711520195007324, + "sampling/sampling_logp_difference/mean": 0.14703714847564697, + "step": 413 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.315027192234993, + "epoch": 1.0894736842105264, + "grad_norm": 0.007484138943254948, + "learning_rate": 1e-06, + "loss": 0.0056, + "step": 414 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.31333404779434204, + "epoch": 1.0921052631578947, + "grad_norm": 0.006227640900760889, + "learning_rate": 1e-06, + "loss": -0.004, + "step": 415 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.30919110774993896, + "epoch": 1.0947368421052632, + "grad_norm": 0.04769471660256386, + 
"learning_rate": 1e-06, + "loss": 0.0281, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 436.810546875, + "completions/mean_terminated_length": 436.810546875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.31040582060813904, + "epoch": 1.0973684210526315, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.011968264356255531, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 138151143.0, + "reward": 0.645214855670929, + "reward_std": 0.16506317257881165, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.6328125, + "rewards/symbolic_reward_accuracy/std": 0.48250964283943176, + "rewards/symbolic_reward_partial_score/mean": 0.8850911855697632, + "rewards/symbolic_reward_partial_score/std": 0.1865391731262207, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0785300731658936, + "sampling/importance_sampling_ratio/min": 0.0006420343997888267, + "sampling/sampling_logp_difference/max": 7.3508687019348145, + "sampling/sampling_logp_difference/mean": 0.1492883265018463, + "step": 417 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.30660849809646606, + "epoch": 1.1, + "grad_norm": 0.00733218202367425, + "learning_rate": 1e-06, + "loss": -0.0016, + "step": 418 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.25, + "entropy": 0.3156313896179199, + "epoch": 1.1026315789473684, + "grad_norm": 0.008495202288031578, + "learning_rate": 1e-06, + "loss": 0.0032, + "step": 419 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.3160051256418228, + "epoch": 1.1052631578947367, + "grad_norm": 0.004335207864642143, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1032.0, + "completions/max_terminated_length": 1032.0, + "completions/mean_length": 418.0078125, + "completions/mean_terminated_length": 418.0078125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.3174561411142349, + "epoch": 1.1078947368421053, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.009130661375820637, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 138748619.0, + "reward": 0.5871574282646179, + "reward_std": 0.10197652876377106, + "rewards/progression_diversity/mean": -8.063457789830863e-05, + "rewards/progression_diversity/std": 0.0018245523096993566, + "rewards/symbolic_reward_accuracy/mean": 0.552734375, + "rewards/symbolic_reward_accuracy/std": 0.4976975917816162, + "rewards/symbolic_reward_partial_score/mean": 0.8517252802848816, + "rewards/symbolic_reward_partial_score/std": 0.1851339489221573, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0805712938308716, + "sampling/importance_sampling_ratio/min": 1.1332810384567793e-10, + "sampling/sampling_logp_difference/max": 22.900733947753906, + 
"sampling/sampling_logp_difference/mean": 0.15244132280349731, + "step": 421 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.31834374368190765, + "epoch": 1.1105263157894736, + "grad_norm": 0.00669381208717823, + "learning_rate": 1e-06, + "loss": -0.0022, + "step": 422 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.31199371814727783, + "epoch": 1.1131578947368421, + "grad_norm": 0.006369912531226873, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 423 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.31869344413280487, + "epoch": 1.1157894736842104, + "grad_norm": 0.009942461736500263, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 919.0, + "completions/max_terminated_length": 919.0, + "completions/mean_length": 424.611328125, + "completions/mean_terminated_length": 424.611328125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.3098357766866684, + "epoch": 1.118421052631579, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.01318982895463705, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 139371172.0, + "reward": 0.5094720125198364, + "reward_std": 0.14669474959373474, + "rewards/progression_diversity/mean": -6.337450759019703e-05, + "rewards/progression_diversity/std": 0.0014340013731271029, + "rewards/symbolic_reward_accuracy/mean": 0.4375, 
+ "rewards/symbolic_reward_accuracy/std": 0.49656352400779724, + "rewards/symbolic_reward_partial_score/mean": 0.8238931894302368, + "rewards/symbolic_reward_partial_score/std": 0.17723464965820312, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0803987979888916, + "sampling/importance_sampling_ratio/min": 0.00013676800881512463, + "sampling/sampling_logp_difference/max": 8.897224426269531, + "sampling/sampling_logp_difference/mean": 0.15104670822620392, + "step": 425 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.32109367847442627, + "epoch": 1.1210526315789473, + "grad_norm": 0.010008157230913639, + "learning_rate": 1e-06, + "loss": -0.0015, + "step": 426 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.3111288249492645, + "epoch": 1.1236842105263158, + "grad_norm": 0.009902669116854668, + "learning_rate": 1e-06, + "loss": 0.0032, + "step": 427 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.3151933252811432, + "epoch": 1.1263157894736842, + "grad_norm": 0.006997853983193636, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 924.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 411.9609375, + 
"completions/mean_terminated_length": 411.9609375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.31707698106765747, + "epoch": 1.1289473684210527, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.009430482983589172, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 139965712.0, + "reward": 0.5824218988418579, + "reward_std": 0.09653446823358536, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.5390625, + "rewards/symbolic_reward_accuracy/std": 0.4989593029022217, + "rewards/symbolic_reward_partial_score/mean": 0.86328125, + "rewards/symbolic_reward_partial_score/std": 0.1686072200536728, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0779767036437988, + "sampling/importance_sampling_ratio/min": 0.0005325896781869233, + "sampling/sampling_logp_difference/max": 7.537759304046631, + "sampling/sampling_logp_difference/mean": 0.1509750634431839, + "step": 429 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.31451140344142914, + "epoch": 1.131578947368421, + "grad_norm": 0.006913443561643362, + "learning_rate": 1e-06, + "loss": 0.0036, + "step": 430 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.3125956207513809, + "epoch": 1.1342105263157896, + "grad_norm": 0.005642498843371868, + "learning_rate": 1e-06, + "loss": -0.0028, + "step": 431 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + 
"entropy": 0.30931124091148376, + "epoch": 1.1368421052631579, + "grad_norm": 0.0037342640571296215, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 416.34375, + "completions/mean_terminated_length": 416.34375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.31373219192028046, + "epoch": 1.1394736842105262, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.009242606349289417, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 140564160.0, + "reward": 0.6995117664337158, + "reward_std": 0.17368769645690918, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.712890625, + "rewards/symbolic_reward_accuracy/std": 0.45285552740097046, + "rewards/symbolic_reward_partial_score/mean": 0.9059244394302368, + "rewards/symbolic_reward_partial_score/std": 0.18118035793304443, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.078127384185791, + "sampling/importance_sampling_ratio/min": 0.00010586700227577239, + "sampling/sampling_logp_difference/max": 9.153326988220215, + "sampling/sampling_logp_difference/mean": 0.1500268280506134, + "step": 433 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3079152852296829, + "epoch": 1.1421052631578947, + "grad_norm": 0.010234912857413292, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 434 + }, + { + 
"clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.31362414360046387, + "epoch": 1.1447368421052633, + "grad_norm": 0.009198206476867199, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 435 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.313403844833374, + "epoch": 1.1473684210526316, + "grad_norm": 0.007888988591730595, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 875.0, + "completions/mean_length": 455.685546875, + "completions/mean_terminated_length": 424.5146789550781, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.30758020281791687, + "epoch": 1.15, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.012641419656574726, + "learning_rate": 1e-06, + "loss": 0.0032, + "num_tokens": 141187103.0, + "reward": 0.5866211652755737, + "reward_std": 0.12764082849025726, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.544921875, + "rewards/symbolic_reward_accuracy/std": 0.4984649419784546, + "rewards/symbolic_reward_partial_score/mean": 0.8662109375, + "rewards/symbolic_reward_partial_score/std": 0.1738623082637787, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0769283771514893, + "sampling/importance_sampling_ratio/min": 
6.12766743870452e-05, + "sampling/sampling_logp_difference/max": 9.700111389160156, + "sampling/sampling_logp_difference/mean": 0.14595070481300354, + "step": 437 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.31858476996421814, + "epoch": 1.1526315789473685, + "grad_norm": 0.006982941180467606, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 438 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.30900435149669647, + "epoch": 1.1552631578947368, + "grad_norm": 0.005663620308041573, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 439 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3072082996368408, + "epoch": 1.1578947368421053, + "grad_norm": 0.006159190554171801, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 411.267578125, + "completions/mean_terminated_length": 411.267578125, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.3167288303375244, + "epoch": 1.1605263157894736, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.008326681330800056, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 141824520.0, + "reward": 0.5193359851837158, + "reward_std": 0.1523609459400177, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, 
+ "rewards/symbolic_reward_accuracy/mean": 0.44921875, + "rewards/symbolic_reward_accuracy/std": 0.497901052236557, + "rewards/symbolic_reward_partial_score/mean": 0.8326823115348816, + "rewards/symbolic_reward_partial_score/std": 0.18424758315086365, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0789525508880615, + "sampling/importance_sampling_ratio/min": 0.0006212457665242255, + "sampling/sampling_logp_difference/max": 7.38378381729126, + "sampling/sampling_logp_difference/mean": 0.15032032132148743, + "step": 441 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.3107949197292328, + "epoch": 1.1631578947368422, + "grad_norm": 0.009470357559621334, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 442 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.31048038601875305, + "epoch": 1.1657894736842105, + "grad_norm": 0.008164659142494202, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 443 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.31193624436855316, + "epoch": 1.168421052631579, + "grad_norm": 0.006771499291062355, + "learning_rate": 1e-06, + "loss": 0.0025, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1159.0, + "completions/mean_length": 
454.470703125, + "completions/mean_terminated_length": 423.2974548339844, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.3074225187301636, + "epoch": 1.1710526315789473, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.012199649587273598, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 142451673.0, + "reward": 0.6461914777755737, + "reward_std": 0.17145656049251556, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.638671875, + "rewards/symbolic_reward_accuracy/std": 0.48085519671440125, + "rewards/symbolic_reward_partial_score/mean": 0.8772786259651184, + "rewards/symbolic_reward_partial_score/std": 0.19054405391216278, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0755971670150757, + "sampling/importance_sampling_ratio/min": 0.0009448478813283145, + "sampling/sampling_logp_difference/max": 6.964486598968506, + "sampling/sampling_logp_difference/mean": 0.14415866136550903, + "step": 445 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3090481609106064, + "epoch": 1.1736842105263159, + "grad_norm": 0.010675223544239998, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 446 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.2992829978466034, + "epoch": 1.1763157894736842, + "grad_norm": 0.009602191857993603, + "learning_rate": 1e-06, + "loss": 0.0072, + "step": 447 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.171875, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3019806146621704, + "epoch": 1.1789473684210527, + "grad_norm": 0.0074586388655006886, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1698.0, + "completions/max_terminated_length": 1698.0, + "completions/mean_length": 416.384765625, + "completions/mean_terminated_length": 416.384765625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.3051687926054001, + "epoch": 1.181578947368421, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.009870451875030994, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 143072158.0, + "reward": 0.6162108182907104, + "reward_std": 0.16109126806259155, + "rewards/progression_diversity/mean": -1.2670810974668711e-05, + "rewards/progression_diversity/std": 0.0002688794629648328, + "rewards/symbolic_reward_accuracy/mean": 0.59375, + "rewards/symbolic_reward_accuracy/std": 0.49161264300346375, + "rewards/symbolic_reward_partial_score/mean": 0.8665364384651184, + "rewards/symbolic_reward_partial_score/std": 0.20193934440612793, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0762814283370972, + "sampling/importance_sampling_ratio/min": 0.00018196065502706915, + "sampling/sampling_logp_difference/max": 8.611720085144043, + "sampling/sampling_logp_difference/mean": 0.14643414318561554, + "step": 449 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.31105637550354004, + "epoch": 1.1842105263157894, + "grad_norm": 
0.007723489310592413, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 450 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.30340468883514404, + "epoch": 1.186842105263158, + "grad_norm": 0.007095393259078264, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 451 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.30283382534980774, + "epoch": 1.1894736842105262, + "grad_norm": 0.003884904785081744, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 853.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 421.1328125, + "completions/mean_terminated_length": 421.1328125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.3055991977453232, + "epoch": 1.1921052631578948, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.00943301897495985, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 143696098.0, + "reward": 0.5283684730529785, + "reward_std": 0.1500403881072998, + "rewards/progression_diversity/mean": -6.903627945575863e-05, + "rewards/progression_diversity/std": 0.001343548996374011, + "rewards/symbolic_reward_accuracy/mean": 0.462890625, + "rewards/symbolic_reward_accuracy/std": 0.4991086423397064, + "rewards/symbolic_reward_partial_score/mean": 0.83544921875, + "rewards/symbolic_reward_partial_score/std": 0.181224524974823, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0770254135131836, + "sampling/importance_sampling_ratio/min": 0.0003639268397819251, + "sampling/sampling_logp_difference/max": 7.918557643890381, + "sampling/sampling_logp_difference/mean": 0.14773014187812805, + "step": 453 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.30224618315696716, + "epoch": 1.194736842105263, + "grad_norm": 0.010618505999445915, + "learning_rate": 1e-06, + "loss": 0.0026, + "step": 454 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.31181102991104126, + "epoch": 1.1973684210526316, + "grad_norm": 0.008554063737392426, + "learning_rate": 1e-06, + "loss": -0.0029, + "step": 455 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.31251056492328644, + "epoch": 1.2, + "grad_norm": 0.00930565595626831, + "learning_rate": 1e-06, + "loss": 0.0019, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 893.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 411.087890625, + "completions/mean_terminated_length": 411.087890625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.3041108399629593, + "epoch": 1.2026315789473685, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.010307530872523785, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 144323215.0, + "reward": 0.6019531488418579, + "reward_std": 0.15361054241657257, 
+ "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.5703125, + "rewards/symbolic_reward_accuracy/std": 0.4955156147480011, + "rewards/symbolic_reward_partial_score/mean": 0.8658853769302368, + "rewards/symbolic_reward_partial_score/std": 0.1798396110534668, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0760828256607056, + "sampling/importance_sampling_ratio/min": 0.00027435572701506317, + "sampling/sampling_logp_difference/max": 8.201085090637207, + "sampling/sampling_logp_difference/mean": 0.14550653100013733, + "step": 457 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.30855709314346313, + "epoch": 1.2052631578947368, + "grad_norm": 0.006558986846357584, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 458 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.30384236574172974, + "epoch": 1.2078947368421054, + "grad_norm": 0.005990568548440933, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 459 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.30504827201366425, + "epoch": 1.2105263157894737, + "grad_norm": 0.011308073066174984, + "learning_rate": 1e-06, + "loss": -0.0021, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + 
"completions/max_terminated_length": 831.0, + "completions/mean_length": 413.00390625, + "completions/mean_terminated_length": 413.00390625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.3013734370470047, + "epoch": 1.2131578947368422, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.008770092390477657, + "learning_rate": 1e-06, + "loss": 0.0024, + "num_tokens": 144952657.0, + "reward": 0.5407714247703552, + "reward_std": 0.12260206788778305, + "rewards/progression_diversity/mean": -6.970949470996857e-06, + "rewards/progression_diversity/std": 0.00015773458289913833, + "rewards/symbolic_reward_accuracy/mean": 0.4921875, + "rewards/symbolic_reward_accuracy/std": 0.5004279017448425, + "rewards/symbolic_reward_partial_score/mean": 0.8181966543197632, + "rewards/symbolic_reward_partial_score/std": 0.21273426711559296, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.076730728149414, + "sampling/importance_sampling_ratio/min": 7.727334013907239e-05, + "sampling/sampling_logp_difference/max": 9.468161582946777, + "sampling/sampling_logp_difference/mean": 0.14597457647323608, + "step": 461 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.30145081877708435, + "epoch": 1.2157894736842105, + "grad_norm": 0.005171489901840687, + "learning_rate": 1e-06, + "loss": -0.0021, + "step": 462 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.3117906451225281, + "epoch": 1.2184210526315788, + "grad_norm": 0.00654226541519165, + "learning_rate": 1e-06, + "loss": 0.0038, + "step": 463 + }, + { + "clip_ratio/high_max": 0.75, + 
"clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3086640387773514, + "epoch": 1.2210526315789474, + "grad_norm": 0.007939969189465046, + "learning_rate": 1e-06, + "loss": -0.0043, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 445.212890625, + "completions/mean_terminated_length": 414.0215148925781, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.31280216574668884, + "epoch": 1.2236842105263157, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.009629838168621063, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 145584574.0, + "reward": 0.6421382427215576, + "reward_std": 0.18600882589817047, + "rewards/progression_diversity/mean": -3.9178747101686895e-05, + "rewards/progression_diversity/std": 0.000886513851583004, + "rewards/symbolic_reward_accuracy/mean": 0.626953125, + "rewards/symbolic_reward_accuracy/std": 0.48408737778663635, + "rewards/symbolic_reward_partial_score/mean": 0.88720703125, + "rewards/symbolic_reward_partial_score/std": 0.17482532560825348, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0746833086013794, + "sampling/importance_sampling_ratio/min": 0.0003272001340519637, + "sampling/sampling_logp_difference/max": 8.024938583374023, + "sampling/sampling_logp_difference/mean": 0.13991132378578186, + "step": 465 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.328125, + "entropy": 0.2979233115911484, + "epoch": 1.2263157894736842, + "grad_norm": 0.009606428444385529, + "learning_rate": 1e-06, + "loss": 0.0284, + "step": 466 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.3038392812013626, + "epoch": 1.2289473684210526, + "grad_norm": 0.006687372922897339, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 467 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.30388760566711426, + "epoch": 1.231578947368421, + "grad_norm": 0.010134851559996605, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 409.5859375, + "completions/mean_terminated_length": 409.5859375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.3190210312604904, + "epoch": 1.2342105263157894, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.010196955874562263, + "learning_rate": 1e-06, + "loss": -0.0025, + "num_tokens": 146227466.0, + "reward": 0.5007807016372681, + "reward_std": 0.19226862490177155, + "rewards/progression_diversity/mean": -5.4633375839330256e-05, + "rewards/progression_diversity/std": 0.0007276704418472946, + "rewards/symbolic_reward_accuracy/mean": 0.4296875, + "rewards/symbolic_reward_accuracy/std": 0.4955156147480011, + "rewards/symbolic_reward_partial_score/mean": 0.8098958730697632, + "rewards/symbolic_reward_partial_score/std": 0.19706134498119354, + 
"rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0778709650039673, + "sampling/importance_sampling_ratio/min": 0.001399537082761526, + "sampling/sampling_logp_difference/max": 6.571613788604736, + "sampling/sampling_logp_difference/mean": 0.14782477915287018, + "step": 469 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.3139819949865341, + "epoch": 1.236842105263158, + "grad_norm": 0.007627170532941818, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 470 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.30873872339725494, + "epoch": 1.2394736842105263, + "grad_norm": 0.006808551959693432, + "learning_rate": 1e-06, + "loss": 0.0016, + "step": 471 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.3088305741548538, + "epoch": 1.2421052631578948, + "grad_norm": 0.011055199429392815, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 915.0, + "completions/max_terminated_length": 915.0, + "completions/mean_length": 410.4921875, + "completions/mean_terminated_length": 410.4921875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.31317228078842163, + "epoch": 1.2447368421052631, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.009969279170036316, 
+ "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 146844454.0, + "reward": 0.566699206829071, + "reward_std": 0.13038089871406555, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.513671875, + "rewards/symbolic_reward_accuracy/std": 0.5003018379211426, + "rewards/symbolic_reward_partial_score/mean": 0.8616536259651184, + "rewards/symbolic_reward_partial_score/std": 0.16637560725212097, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0773735046386719, + "sampling/importance_sampling_ratio/min": 2.6251354938722216e-06, + "sampling/sampling_logp_difference/max": 12.850378036499023, + "sampling/sampling_logp_difference/mean": 0.1465354859828949, + "step": 473 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.31050053238868713, + "epoch": 1.2473684210526317, + "grad_norm": 0.005133951548486948, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 474 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.3095279932022095, + "epoch": 1.25, + "grad_norm": 0.009616038762032986, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 475 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.31143562495708466, + "epoch": 1.2526315789473683, + "grad_norm": 0.008402734994888306, + "learning_rate": 1e-06, + "loss": 0.0029, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1008.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 410.126953125, + "completions/mean_terminated_length": 410.126953125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.31026244163513184, + "epoch": 1.2552631578947369, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00997492577880621, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 147446215.0, + "reward": 0.6174317002296448, + "reward_std": 0.0901901125907898, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.59765625, + "rewards/symbolic_reward_accuracy/std": 0.4908501207828522, + "rewards/symbolic_reward_partial_score/mean": 0.8634440302848816, + "rewards/symbolic_reward_partial_score/std": 0.19049188494682312, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0762457847595215, + "sampling/importance_sampling_ratio/min": 2.626519199111499e-05, + "sampling/sampling_logp_difference/max": 10.547266006469727, + "sampling/sampling_logp_difference/mean": 0.14433935284614563, + "step": 477 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.30593959987163544, + "epoch": 1.2578947368421054, + "grad_norm": 0.007015667390078306, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 478 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.30408360064029694, + "epoch": 1.2605263157894737, + 
"grad_norm": 0.005526754539459944, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 479 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3031175583600998, + "epoch": 1.263157894736842, + "grad_norm": 0.006289470940828323, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 407.314453125, + "completions/mean_terminated_length": 407.314453125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.3031823933124542, + "epoch": 1.2657894736842106, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.011879026889801025, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 148058728.0, + "reward": 0.547900378704071, + "reward_std": 0.1337689757347107, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.490234375, + "rewards/symbolic_reward_accuracy/std": 0.5003935098648071, + "rewards/symbolic_reward_partial_score/mean": 0.8458659052848816, + "rewards/symbolic_reward_partial_score/std": 0.1819261908531189, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0763696432113647, + "sampling/importance_sampling_ratio/min": 0.0010056419996544719, + "sampling/sampling_logp_difference/max": 6.902129173278809, + "sampling/sampling_logp_difference/mean": 0.1454847753047943, + "step": 481 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + 
"clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.31013526022434235, + "epoch": 1.268421052631579, + "grad_norm": 0.005811288487166166, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 482 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.30912819504737854, + "epoch": 1.2710526315789474, + "grad_norm": 0.006225419230759144, + "learning_rate": 1e-06, + "loss": 0.0035, + "step": 483 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.30452604591846466, + "epoch": 1.2736842105263158, + "grad_norm": 0.007433480583131313, + "learning_rate": 1e-06, + "loss": -0.0034, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 418.41796875, + "completions/mean_terminated_length": 418.41796875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.30341316759586334, + "epoch": 1.2763157894736843, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.011153457686305046, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 148655230.0, + "reward": 0.686572253704071, + "reward_std": 0.11991460621356964, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.705078125, + "rewards/symbolic_reward_accuracy/std": 0.4564536213874817, + "rewards/symbolic_reward_partial_score/mean": 0.87841796875, + "rewards/symbolic_reward_partial_score/std": 
0.21000272035598755, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.075927972793579, + "sampling/importance_sampling_ratio/min": 0.0007375699933618307, + "sampling/sampling_logp_difference/max": 7.212149620056152, + "sampling/sampling_logp_difference/mean": 0.14460578560829163, + "step": 485 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2982192039489746, + "epoch": 1.2789473684210526, + "grad_norm": 0.005498748738318682, + "learning_rate": 1e-06, + "loss": -0.0034, + "step": 486 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.30715058743953705, + "epoch": 1.2815789473684212, + "grad_norm": 0.00860719196498394, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 487 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.30552174150943756, + "epoch": 1.2842105263157895, + "grad_norm": 0.007835413329303265, + "learning_rate": 1e-06, + "loss": 0.0026, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 403.826171875, + "completions/mean_terminated_length": 403.826171875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.3113265782594681, + "epoch": 1.2868421052631578, + "frac_reward_zero_std": 0.40625, + 
"grad_norm": 0.010389966890215874, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 149262469.0, + "reward": 0.5675293207168579, + "reward_std": 0.10506439208984375, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.515625, + "rewards/symbolic_reward_accuracy/std": 0.5002445578575134, + "rewards/symbolic_reward_partial_score/mean": 0.8605142831802368, + "rewards/symbolic_reward_partial_score/std": 0.16847233474254608, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.07625412940979, + "sampling/importance_sampling_ratio/min": 0.002913564909249544, + "sampling/sampling_logp_difference/max": 5.838377952575684, + "sampling/sampling_logp_difference/mean": 0.14658384025096893, + "step": 489 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.30906206369400024, + "epoch": 1.2894736842105263, + "grad_norm": 0.006328464951366186, + "learning_rate": 1e-06, + "loss": -0.0022, + "step": 490 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.30755680799484253, + "epoch": 1.2921052631578949, + "grad_norm": 0.009180142544209957, + "learning_rate": 1e-06, + "loss": 0.0017, + "step": 491 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.3033202886581421, + "epoch": 1.2947368421052632, + "grad_norm": 0.0056364513002336025, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 405.30078125, + "completions/mean_terminated_length": 405.30078125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.30497485399246216, + "epoch": 1.2973684210526315, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.008342030458152294, + "learning_rate": 1e-06, + "loss": -0.0025, + "num_tokens": 149876959.0, + "reward": 0.6874990463256836, + "reward_std": 0.1433926224708557, + "rewards/progression_diversity/mean": -0.00010337447747588158, + "rewards/progression_diversity/std": 0.0016614391934126616, + "rewards/symbolic_reward_accuracy/mean": 0.69140625, + "rewards/symbolic_reward_accuracy/std": 0.4623647928237915, + "rewards/symbolic_reward_partial_score/mean": 0.9088541865348816, + "rewards/symbolic_reward_partial_score/std": 0.14890244603157043, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0752720832824707, + "sampling/importance_sampling_ratio/min": 0.0008119558915495872, + "sampling/sampling_logp_difference/max": 7.116064548492432, + "sampling/sampling_logp_difference/mean": 0.14508485794067383, + "step": 493 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3033030182123184, + "epoch": 1.3, + "grad_norm": 0.004780566319823265, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 494 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.30343642830848694, + "epoch": 
1.3026315789473684, + "grad_norm": 0.006665179040282965, + "learning_rate": 1e-06, + "loss": 0.0025, + "step": 495 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.3080981373786926, + "epoch": 1.305263157894737, + "grad_norm": 0.00797088909894228, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1081.0, + "completions/max_terminated_length": 1081.0, + "completions/mean_length": 407.87890625, + "completions/mean_terminated_length": 407.87890625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.30483171343803406, + "epoch": 1.3078947368421052, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.010050629265606403, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 150488097.0, + "reward": 0.545116662979126, + "reward_std": 0.13066868484020233, + "rewards/progression_diversity/mean": -5.45380862604361e-05, + "rewards/progression_diversity/std": 0.0008203862817026675, + "rewards/symbolic_reward_accuracy/mean": 0.494140625, + "rewards/symbolic_reward_accuracy/std": 0.5004546642303467, + "rewards/symbolic_reward_partial_score/mean": 0.8287760615348816, + "rewards/symbolic_reward_partial_score/std": 0.19000308215618134, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0742123126983643, + "sampling/importance_sampling_ratio/min": 9.160504851024598e-05, + "sampling/sampling_logp_difference/max": 9.29802417755127, + "sampling/sampling_logp_difference/mean": 0.14309212565422058, + "step": 497 + }, + { + "clip_ratio/high_max": 
1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.29966841638088226, + "epoch": 1.3105263157894738, + "grad_norm": 0.004824103321880102, + "learning_rate": 1e-06, + "loss": -0.002, + "step": 498 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.30102767050266266, + "epoch": 1.313157894736842, + "grad_norm": 0.007378766778856516, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 499 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.30113108456134796, + "epoch": 1.3157894736842106, + "grad_norm": 0.009306096471846104, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 399.66796875, + "completions/mean_terminated_length": 399.66796875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.30337396264076233, + "epoch": 1.318421052631579, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.011074943467974663, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 151096535.0, + "reward": 0.6620602011680603, + "reward_std": 0.17205195128917694, + "rewards/progression_diversity/mean": -3.9536142139695585e-05, + "rewards/progression_diversity/std": 0.0008946007583290339, + "rewards/symbolic_reward_accuracy/mean": 0.6640625, + "rewards/symbolic_reward_accuracy/std": 0.4727790653705597, + 
"rewards/symbolic_reward_partial_score/mean": 0.8787435293197632, + "rewards/symbolic_reward_partial_score/std": 0.19655956327915192, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.075458288192749, + "sampling/importance_sampling_ratio/min": 3.9075985114322975e-05, + "sampling/sampling_logp_difference/max": 10.150002479553223, + "sampling/sampling_logp_difference/mean": 0.14344847202301025, + "step": 501 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3023105710744858, + "epoch": 1.3210526315789473, + "grad_norm": 0.006497818976640701, + "learning_rate": 1e-06, + "loss": -0.0023, + "step": 502 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.30219969153404236, + "epoch": 1.3236842105263158, + "grad_norm": 0.004322134889662266, + "learning_rate": 1e-06, + "loss": -0.0024, + "step": 503 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3050485998392105, + "epoch": 1.3263157894736843, + "grad_norm": 0.008752789348363876, + "learning_rate": 1e-06, + "loss": 0.006, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 402.416015625, + "completions/mean_terminated_length": 402.416015625, + "completions/min_length": 151.0, + 
"completions/min_terminated_length": 151.0, + "entropy": 0.30552002787590027, + "epoch": 1.3289473684210527, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.00773452315479517, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 151739244.0, + "reward": 0.596386730670929, + "reward_std": 0.1162668839097023, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.564453125, + "rewards/symbolic_reward_accuracy/std": 0.49631330370903015, + "rewards/symbolic_reward_partial_score/mean": 0.8590494394302368, + "rewards/symbolic_reward_partial_score/std": 0.1787545084953308, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.077009916305542, + "sampling/importance_sampling_ratio/min": 4.0180704672820866e-05, + "sampling/sampling_logp_difference/max": 10.122123718261719, + "sampling/sampling_logp_difference/mean": 0.1449548900127411, + "step": 505 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.3068659007549286, + "epoch": 1.331578947368421, + "grad_norm": 0.004809284582734108, + "learning_rate": 1e-06, + "loss": -0.0015, + "step": 506 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.30974075198173523, + "epoch": 1.3342105263157895, + "grad_norm": 0.008824328891932964, + "learning_rate": 1e-06, + "loss": 0.0025, + "step": 507 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.30546213686466217, + "epoch": 1.3368421052631578, + "grad_norm": 
0.007738140411674976, + "learning_rate": 1e-06, + "loss": 0.0023, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 904.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 403.423828125, + "completions/mean_terminated_length": 403.423828125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.3024378567934036, + "epoch": 1.3394736842105264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.008510293439030647, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 152346597.0, + "reward": 0.6561523675918579, + "reward_std": 0.13507352769374847, + "rewards/progression_diversity/mean": -1.180451363325119e-06, + "rewards/progression_diversity/std": 2.6710564270615578e-05, + "rewards/symbolic_reward_accuracy/mean": 0.654296875, + "rewards/symbolic_reward_accuracy/std": 0.4760620892047882, + "rewards/symbolic_reward_partial_score/mean": 0.8785806894302368, + "rewards/symbolic_reward_partial_score/std": 0.19002403318881989, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0751402378082275, + "sampling/importance_sampling_ratio/min": 4.670919224736281e-06, + "sampling/sampling_logp_difference/max": 12.274154663085938, + "sampling/sampling_logp_difference/mean": 0.14365223050117493, + "step": 509 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.3060772866010666, + "epoch": 1.3421052631578947, + "grad_norm": 0.006542941089719534, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 510 + }, + { + "clip_ratio/high_max": 1.0, + 
"clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.30383458733558655, + "epoch": 1.3447368421052632, + "grad_norm": 0.005349365528672934, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 511 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.3082137107849121, + "epoch": 1.3473684210526315, + "grad_norm": 0.009062383323907852, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 950.0, + "completions/max_terminated_length": 950.0, + "completions/mean_length": 409.34375, + "completions/mean_terminated_length": 409.34375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.3136248141527176, + "epoch": 1.35, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.008149274624884129, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 152938453.0, + "reward": 0.6833982467651367, + "reward_std": 0.1238265112042427, + "rewards/progression_diversity/mean": -2.053233765764162e-05, + "rewards/progression_diversity/std": 0.0004645937879104167, + "rewards/symbolic_reward_accuracy/mean": 0.689453125, + "rewards/symbolic_reward_accuracy/std": 0.46317005157470703, + "rewards/symbolic_reward_partial_score/mean": 0.8990885019302368, + "rewards/symbolic_reward_partial_score/std": 0.18109507858753204, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0766186714172363, + "sampling/importance_sampling_ratio/min": 1.2683102795563173e-05, + 
"sampling/sampling_logp_difference/max": 11.275239944458008, + "sampling/sampling_logp_difference/mean": 0.14412665367126465, + "step": 513 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3056807965040207, + "epoch": 1.3526315789473684, + "grad_norm": 0.003623334923759103, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 514 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.30095237493515015, + "epoch": 1.3552631578947367, + "grad_norm": 0.0060382746160030365, + "learning_rate": 1e-06, + "loss": 0.0019, + "step": 515 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.3073163330554962, + "epoch": 1.3578947368421053, + "grad_norm": 0.008139624260365963, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 396.921875, + "completions/mean_terminated_length": 396.921875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.3063454031944275, + "epoch": 1.3605263157894738, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.01082697045058012, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 153535981.0, + "reward": 0.5885741710662842, + "reward_std": 0.13205640017986298, + "rewards/progression_diversity/mean": -1.0338511856389232e-05, + "rewards/progression_diversity/std": 
0.00017001591913867742, + "rewards/symbolic_reward_accuracy/mean": 0.556640625, + "rewards/symbolic_reward_accuracy/std": 0.49726733565330505, + "rewards/symbolic_reward_partial_score/mean": 0.8486328125, + "rewards/symbolic_reward_partial_score/std": 0.20038633048534393, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0767133235931396, + "sampling/importance_sampling_ratio/min": 0.0021863149013370275, + "sampling/sampling_logp_difference/max": 6.125537872314453, + "sampling/sampling_logp_difference/mean": 0.1458137333393097, + "step": 517 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3054617643356323, + "epoch": 1.3631578947368421, + "grad_norm": 0.006087815389037132, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 518 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.309252068400383, + "epoch": 1.3657894736842104, + "grad_norm": 0.0069716330617666245, + "learning_rate": 1e-06, + "loss": 0.0027, + "step": 519 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.31125877797603607, + "epoch": 1.368421052631579, + "grad_norm": 0.006700367201119661, + "learning_rate": 1e-06, + "loss": -0.0025, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 
394.095703125, + "completions/mean_terminated_length": 394.095703125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.30801936984062195, + "epoch": 1.3710526315789473, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.007649766281247139, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 154135390.0, + "reward": 0.5628417730331421, + "reward_std": 0.1588105857372284, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.517578125, + "rewards/symbolic_reward_accuracy/std": 0.5001795887947083, + "rewards/symbolic_reward_partial_score/mean": 0.8416340947151184, + "rewards/symbolic_reward_partial_score/std": 0.21291983127593994, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0771160125732422, + "sampling/importance_sampling_ratio/min": 8.427595275861677e-06, + "sampling/sampling_logp_difference/max": 11.683999061584473, + "sampling/sampling_logp_difference/mean": 0.14557112753391266, + "step": 521 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.30574391782283783, + "epoch": 1.3736842105263158, + "grad_norm": 0.006504260469228029, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 522 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.31414419412612915, + "epoch": 1.3763157894736842, + "grad_norm": 0.004781621042639017, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 523 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.140625, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.3113695830106735, + "epoch": 1.3789473684210527, + "grad_norm": 0.00859212689101696, + "learning_rate": 1e-06, + "loss": 0.0019, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 424.533203125, + "completions/mean_terminated_length": 393.3013610839844, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.3044796884059906, + "epoch": 1.381578947368421, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.008099560625851154, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 154777071.0, + "reward": 0.6270015239715576, + "reward_std": 0.1343921273946762, + "rewards/progression_diversity/mean": -4.158555020694621e-05, + "rewards/progression_diversity/std": 0.0009409735794179142, + "rewards/symbolic_reward_accuracy/mean": 0.609375, + "rewards/symbolic_reward_accuracy/std": 0.48836761713027954, + "rewards/symbolic_reward_partial_score/mean": 0.8719075918197632, + "rewards/symbolic_reward_partial_score/std": 0.1844439059495926, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0758286714553833, + "sampling/importance_sampling_ratio/min": 5.585740655078553e-05, + "sampling/sampling_logp_difference/max": 9.792708396911621, + "sampling/sampling_logp_difference/mean": 0.1419658362865448, + "step": 525 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.3106999695301056, + "epoch": 
1.3842105263157896, + "grad_norm": 0.008930054493248463, + "learning_rate": 1e-06, + "loss": 0.0297, + "step": 526 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3158520311117172, + "epoch": 1.3868421052631579, + "grad_norm": 0.005683277267962694, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 527 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.30958664417266846, + "epoch": 1.3894736842105262, + "grad_norm": 0.006956308148801327, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1008.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 396.740234375, + "completions/mean_terminated_length": 396.740234375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.3058925271034241, + "epoch": 1.3921052631578947, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.009263314306735992, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 155396042.0, + "reward": 0.6233398914337158, + "reward_std": 0.1278683841228485, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.607421875, + "rewards/symbolic_reward_accuracy/std": 0.4888018071651459, + "rewards/symbolic_reward_partial_score/mean": 0.8629557490348816, + "rewards/symbolic_reward_partial_score/std": 0.19650208950042725, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0787575244903564, + "sampling/importance_sampling_ratio/min": 0.003360233036801219, + "sampling/sampling_logp_difference/max": 5.69574499130249, + "sampling/sampling_logp_difference/mean": 0.14666804671287537, + "step": 529 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3156265914440155, + "epoch": 1.3947368421052633, + "grad_norm": 0.005127849522978067, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 530 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.31428514420986176, + "epoch": 1.3973684210526316, + "grad_norm": 0.005728641990572214, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 531 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.31455811858177185, + "epoch": 1.4, + "grad_norm": 0.00949255283921957, + "learning_rate": 1e-06, + "loss": -0.0013, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 380.107421875, + "completions/mean_terminated_length": 380.107421875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.31324976682662964, + "epoch": 1.4026315789473685, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.007790201343595982, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 156002465.0, + "reward": 0.6195793151855469, + "reward_std": 
0.1195153146982193, + "rewards/progression_diversity/mean": -8.090149640338495e-05, + "rewards/progression_diversity/std": 0.001830591820180416, + "rewards/symbolic_reward_accuracy/mean": 0.59375, + "rewards/symbolic_reward_accuracy/std": 0.49161264300346375, + "rewards/symbolic_reward_partial_score/mean": 0.8777669072151184, + "rewards/symbolic_reward_partial_score/std": 0.17154613137245178, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.078112006187439, + "sampling/importance_sampling_ratio/min": 3.216441427866812e-06, + "sampling/sampling_logp_difference/max": 12.647234916687012, + "sampling/sampling_logp_difference/mean": 0.14644742012023926, + "step": 533 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.31220632791519165, + "epoch": 1.4052631578947368, + "grad_norm": 0.006627189461141825, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 534 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.313899427652359, + "epoch": 1.4078947368421053, + "grad_norm": 0.0068327574990689754, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 535 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3095439076423645, + "epoch": 1.4105263157894736, + "grad_norm": 0.006293828133493662, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, 
+ "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 384.59375, + "completions/mean_terminated_length": 384.59375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.310128778219223, + "epoch": 1.4131578947368422, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.008397966623306274, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 156591153.0, + "reward": 0.6722656488418579, + "reward_std": 0.13169080018997192, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.67578125, + "rewards/symbolic_reward_accuracy/std": 0.4685399830341339, + "rewards/symbolic_reward_partial_score/mean": 0.8893228769302368, + "rewards/symbolic_reward_partial_score/std": 0.18464936316013336, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0768916606903076, + "sampling/importance_sampling_ratio/min": 0.00040862703463062644, + "sampling/sampling_logp_difference/max": 7.802707672119141, + "sampling/sampling_logp_difference/mean": 0.1446777880191803, + "step": 537 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.30385565757751465, + "epoch": 1.4157894736842105, + "grad_norm": 0.00654578348621726, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 538 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.30842697620391846, + "epoch": 1.418421052631579, + "grad_norm": 0.005931271240115166, + "learning_rate": 1e-06, + "loss": -0.0015, + "step": 539 + }, + { + "clip_ratio/high_max": 0.75, + 
"clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.307214692234993, + "epoch": 1.4210526315789473, + "grad_norm": 0.004985845647752285, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 729.0, + "completions/max_terminated_length": 729.0, + "completions/mean_length": 385.11328125, + "completions/mean_terminated_length": 385.11328125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.31230244040489197, + "epoch": 1.4236842105263157, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.007098773028701544, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 157183115.0, + "reward": 0.67724609375, + "reward_std": 0.11101227253675461, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.685546875, + "rewards/symbolic_reward_accuracy/std": 0.4647517800331116, + "rewards/symbolic_reward_partial_score/mean": 0.8863932490348816, + "rewards/symbolic_reward_partial_score/std": 0.19459529221057892, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0779294967651367, + "sampling/importance_sampling_ratio/min": 0.0003273308975622058, + "sampling/sampling_logp_difference/max": 8.02453899383545, + "sampling/sampling_logp_difference/mean": 0.1470160037279129, + "step": 541 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.31138667464256287, + "epoch": 
1.4263157894736842, + "grad_norm": 0.006334090139716864, + "learning_rate": 1e-06, + "loss": 0.0023, + "step": 542 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3078685700893402, + "epoch": 1.4289473684210527, + "grad_norm": 0.009027614258229733, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 543 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.30706819891929626, + "epoch": 1.431578947368421, + "grad_norm": 0.006638620514422655, + "learning_rate": 1e-06, + "loss": -0.0019, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 388.3984375, + "completions/mean_terminated_length": 388.3984375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.3106583505868912, + "epoch": 1.4342105263157894, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.008243853226304054, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 157760919.0, + "reward": 0.735595703125, + "reward_std": 0.1595648229122162, + "rewards/progression_diversity/mean": -4.20508786191931e-06, + "rewards/progression_diversity/std": 9.51502806856297e-05, + "rewards/symbolic_reward_accuracy/mean": 0.76171875, + "rewards/symbolic_reward_accuracy/std": 0.42644867300987244, + "rewards/symbolic_reward_partial_score/mean": 0.9285481572151184, + "rewards/symbolic_reward_partial_score/std": 0.14488224685192108, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.078289270401001, + "sampling/importance_sampling_ratio/min": 0.00010619553358992562, + "sampling/sampling_logp_difference/max": 9.150228500366211, + "sampling/sampling_logp_difference/mean": 0.14683890342712402, + "step": 545 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.31235410273075104, + "epoch": 1.436842105263158, + "grad_norm": 0.005676098167896271, + "learning_rate": 1e-06, + "loss": -0.0025, + "step": 546 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.30875393748283386, + "epoch": 1.4394736842105262, + "grad_norm": 0.00652680266648531, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 547 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.3143164962530136, + "epoch": 1.4421052631578948, + "grad_norm": 0.006441683974117041, + "learning_rate": 1e-06, + "loss": 0.0036, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 829.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 388.33203125, + "completions/mean_terminated_length": 388.33203125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.31009891629219055, + "epoch": 1.444736842105263, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.009880034253001213, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 158389761.0, + 
"reward": 0.5567857027053833, + "reward_std": 0.15986129641532898, + "rewards/progression_diversity/mean": -0.00014837765775155276, + "rewards/progression_diversity/std": 0.0023622072767466307, + "rewards/symbolic_reward_accuracy/mean": 0.51171875, + "rewards/symbolic_reward_accuracy/std": 0.5003514885902405, + "rewards/symbolic_reward_partial_score/mean": 0.83251953125, + "rewards/symbolic_reward_partial_score/std": 0.1981428563594818, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.077824354171753, + "sampling/importance_sampling_ratio/min": 0.0002361713268328458, + "sampling/sampling_logp_difference/max": 8.350953102111816, + "sampling/sampling_logp_difference/mean": 0.14743688702583313, + "step": 549 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.31600329279899597, + "epoch": 1.4473684210526316, + "grad_norm": 0.007339461240917444, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 550 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3122548907995224, + "epoch": 1.45, + "grad_norm": 0.006188119761645794, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 551 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.3066437244415283, + "epoch": 1.4526315789473685, + "grad_norm": 0.004552071448415518, + "learning_rate": 1e-06, + "loss": 0.0039, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 384.88671875, + "completions/mean_terminated_length": 384.88671875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.3084342032670975, + "epoch": 1.4552631578947368, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.005656357388943434, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 158984455.0, + "reward": 0.7021973133087158, + "reward_std": 0.09900879859924316, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.71484375, + "rewards/symbolic_reward_accuracy/std": 0.45193037390708923, + "rewards/symbolic_reward_partial_score/mean": 0.9109700918197632, + "rewards/symbolic_reward_partial_score/std": 0.15867188572883606, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0765399932861328, + "sampling/importance_sampling_ratio/min": 6.768944876966998e-05, + "sampling/sampling_logp_difference/max": 9.600580215454102, + "sampling/sampling_logp_difference/mean": 0.14523030817508698, + "step": 553 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.31066471338272095, + "epoch": 1.4578947368421051, + "grad_norm": 0.0047269840724766254, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 554 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3113812804222107, + "epoch": 1.4605263157894737, + "grad_norm": 0.00564876152202487, + "learning_rate": 1e-06, + "loss": 0.002, + "step": 555 + 
}, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.3069649040699005, + "epoch": 1.4631578947368422, + "grad_norm": 0.008440033532679081, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 407.296875, + "completions/mean_terminated_length": 376.03131103515625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.3084985613822937, + "epoch": 1.4657894736842105, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.010170203633606434, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 159586111.0, + "reward": 0.6351065635681152, + "reward_std": 0.11396625638008118, + "rewards/progression_diversity/mean": -9.009381756186485e-05, + "rewards/progression_diversity/std": 0.0020385903771966696, + "rewards/symbolic_reward_accuracy/mean": 0.62109375, + "rewards/symbolic_reward_accuracy/std": 0.4855891764163971, + "rewards/symbolic_reward_partial_score/mean": 0.87548828125, + "rewards/symbolic_reward_partial_score/std": 0.1809736043214798, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0758986473083496, + "sampling/importance_sampling_ratio/min": 2.0304414647398517e-05, + "sampling/sampling_logp_difference/max": 10.804672241210938, + "sampling/sampling_logp_difference/mean": 0.14537295699119568, + "step": 557 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.140625, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.3136870563030243, + "epoch": 1.4684210526315788, + "grad_norm": 0.00794845912605524, + "learning_rate": 1e-06, + "loss": 0.0017, + "step": 558 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3167266547679901, + "epoch": 1.4710526315789474, + "grad_norm": 0.009949425235390663, + "learning_rate": 1e-06, + "loss": -0.0024, + "step": 559 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3107505738735199, + "epoch": 1.4736842105263157, + "grad_norm": 0.01014183834195137, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 382.29296875, + "completions/mean_terminated_length": 382.29296875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.30869798362255096, + "epoch": 1.4763157894736842, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.008826439268887043, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 160179317.0, + "reward": 0.6139646768569946, + "reward_std": 0.11059055477380753, + "rewards/progression_diversity/mean": -1.9328768757986836e-05, + "rewards/progression_diversity/std": 0.00043736008228734136, + "rewards/symbolic_reward_accuracy/mean": 0.591796875, + "rewards/symbolic_reward_accuracy/std": 0.49198177456855774, + "rewards/symbolic_reward_partial_score/mean": 0.8629557490348816, + "rewards/symbolic_reward_partial_score/std": 
0.19420644640922546, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0768595933914185, + "sampling/importance_sampling_ratio/min": 0.00032772633130662143, + "sampling/sampling_logp_difference/max": 8.023331642150879, + "sampling/sampling_logp_difference/mean": 0.14667022228240967, + "step": 561 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3062967211008072, + "epoch": 1.4789473684210526, + "grad_norm": 0.005198640748858452, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 562 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.31053030490875244, + "epoch": 1.481578947368421, + "grad_norm": 0.006931893527507782, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 563 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3084117919206619, + "epoch": 1.4842105263157894, + "grad_norm": 0.007732919882982969, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 367.578125, + "completions/mean_terminated_length": 367.578125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.30998261272907257, + "epoch": 1.486842105263158, + "frac_reward_zero_std": 0.46875, + 
"grad_norm": 0.00754775432869792, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 160771645.0, + "reward": 0.6725579500198364, + "reward_std": 0.1186295673251152, + "rewards/progression_diversity/mean": -7.19197269063443e-05, + "rewards/progression_diversity/std": 0.0011566577013581991, + "rewards/symbolic_reward_accuracy/mean": 0.669921875, + "rewards/symbolic_reward_accuracy/std": 0.47070086002349854, + "rewards/symbolic_reward_partial_score/mean": 0.9026693105697632, + "rewards/symbolic_reward_partial_score/std": 0.1593107134103775, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0781853199005127, + "sampling/importance_sampling_ratio/min": 5.44397971680155e-06, + "sampling/sampling_logp_difference/max": 12.121000289916992, + "sampling/sampling_logp_difference/mean": 0.14748519659042358, + "step": 565 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.31304365396499634, + "epoch": 1.4894736842105263, + "grad_norm": 0.006320515181869268, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 566 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.3075980395078659, + "epoch": 1.4921052631578946, + "grad_norm": 0.005773784592747688, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 567 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.31288425624370575, + "epoch": 1.4947368421052631, + "grad_norm": 0.004835109226405621, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 568 + 
}, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 365.66796875, + "completions/mean_terminated_length": 365.66796875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.30872373282909393, + "epoch": 1.4973684210526317, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.007975532673299313, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 161374867.0, + "reward": 0.5785626173019409, + "reward_std": 0.15455234050750732, + "rewards/progression_diversity/mean": -0.00018735449702944607, + "rewards/progression_diversity/std": 0.004239348694682121, + "rewards/symbolic_reward_accuracy/mean": 0.537109375, + "rewards/symbolic_reward_accuracy/std": 0.4991086423397064, + "rewards/symbolic_reward_partial_score/mean": 0.8543294668197632, + "rewards/symbolic_reward_partial_score/std": 0.18278612196445465, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0777958631515503, + "sampling/importance_sampling_ratio/min": 0.00035884103272110224, + "sampling/sampling_logp_difference/max": 7.932631015777588, + "sampling/sampling_logp_difference/mean": 0.1475108563899994, + "step": 569 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.30878394842147827, + "epoch": 1.5, + "grad_norm": 0.006081805098801851, + "learning_rate": 1e-06, + "loss": 0.0022, + "step": 570 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.25, + "entropy": 0.313975527882576, + "epoch": 1.5026315789473683, + "grad_norm": 0.004849690943956375, + "learning_rate": 1e-06, + "loss": -0.0015, + "step": 571 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3043152540922165, + "epoch": 1.5052631578947369, + "grad_norm": 0.006966730579733849, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 365.453125, + "completions/mean_terminated_length": 365.453125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.3078303337097168, + "epoch": 1.5078947368421054, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.007786921691149473, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 161959771.0, + "reward": 0.6698729991912842, + "reward_std": 0.09264719486236572, + "rewards/progression_diversity/mean": -5.197777682042215e-06, + "rewards/progression_diversity/std": 0.00011761228233808652, + "rewards/symbolic_reward_accuracy/mean": 0.669921875, + "rewards/symbolic_reward_accuracy/std": 0.47070086002349854, + "rewards/symbolic_reward_partial_score/mean": 0.89306640625, + "rewards/symbolic_reward_partial_score/std": 0.17148670554161072, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0773301124572754, + "sampling/importance_sampling_ratio/min": 6.37840139461332e-06, + "sampling/sampling_logp_difference/max": 11.962593078613281, + 
"sampling/sampling_logp_difference/mean": 0.14748983085155487, + "step": 573 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.30795395374298096, + "epoch": 1.5105263157894737, + "grad_norm": 0.00861379038542509, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 574 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.3071673512458801, + "epoch": 1.513157894736842, + "grad_norm": 0.00907305721193552, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 575 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3147677630186081, + "epoch": 1.5157894736842106, + "grad_norm": 0.0050981612876057625, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 872.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 372.0, + "completions/mean_terminated_length": 372.0, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.30765050649642944, + "epoch": 1.518421052631579, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.008448776789009571, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 162577083.0, + "reward": 0.5401855707168579, + "reward_std": 0.13993936777114868, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.48046875, + "rewards/symbolic_reward_accuracy/std": 
0.5001069903373718, + "rewards/symbolic_reward_partial_score/mean": 0.8396810293197632, + "rewards/symbolic_reward_partial_score/std": 0.17196524143218994, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0772461891174316, + "sampling/importance_sampling_ratio/min": 0.0007980514201335609, + "sampling/sampling_logp_difference/max": 7.133337497711182, + "sampling/sampling_logp_difference/mean": 0.1471158266067505, + "step": 577 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.31448253989219666, + "epoch": 1.5210526315789474, + "grad_norm": 0.008604890666902065, + "learning_rate": 1e-06, + "loss": -0.0024, + "step": 578 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.30293673276901245, + "epoch": 1.5236842105263158, + "grad_norm": 0.005941214971244335, + "learning_rate": 1e-06, + "loss": 0.0031, + "step": 579 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.310068815946579, + "epoch": 1.526315789473684, + "grad_norm": 0.0063788071274757385, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 387.646484375, + "completions/mean_terminated_length": 356.34246826171875, + "completions/min_length": 132.0, 
+ "completions/min_terminated_length": 132.0, + "entropy": 0.30966073274612427, + "epoch": 1.5289473684210526, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0067711323499679565, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 163170342.0, + "reward": 0.6299285888671875, + "reward_std": 0.1152401715517044, + "rewards/progression_diversity/mean": -0.00030736689222976565, + "rewards/progression_diversity/std": 0.006954919081181288, + "rewards/symbolic_reward_accuracy/mean": 0.619140625, + "rewards/symbolic_reward_accuracy/std": 0.48607301712036133, + "rewards/symbolic_reward_partial_score/mean": 0.8621419668197632, + "rewards/symbolic_reward_partial_score/std": 0.20800890028476715, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0749588012695312, + "sampling/importance_sampling_ratio/min": 6.100111136220221e-07, + "sampling/sampling_logp_difference/max": 14.309788703918457, + "sampling/sampling_logp_difference/mean": 0.14406411349773407, + "step": 581 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.30469025671482086, + "epoch": 1.5315789473684212, + "grad_norm": 0.007111302111297846, + "learning_rate": 1e-06, + "loss": 0.0291, + "step": 582 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.30858737230300903, + "epoch": 1.5342105263157895, + "grad_norm": 0.005157460458576679, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 583 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 
0.30936598777770996, + "epoch": 1.5368421052631578, + "grad_norm": 0.00526466453447938, + "learning_rate": 1e-06, + "loss": 0.0044, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 362.904296875, + "completions/mean_terminated_length": 362.904296875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.3120175153017044, + "epoch": 1.5394736842105263, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.008821161463856697, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 163783477.0, + "reward": 0.6006832122802734, + "reward_std": 0.13123157620429993, + "rewards/progression_diversity/mean": -4.234274820191786e-05, + "rewards/progression_diversity/std": 0.0009581070044077933, + "rewards/symbolic_reward_accuracy/mean": 0.5703125, + "rewards/symbolic_reward_accuracy/std": 0.4955156147480011, + "rewards/symbolic_reward_partial_score/mean": 0.8616536855697632, + "rewards/symbolic_reward_partial_score/std": 0.17949894070625305, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0769450664520264, + "sampling/importance_sampling_ratio/min": 0.00024724824470467865, + "sampling/sampling_logp_difference/max": 8.3051176071167, + "sampling/sampling_logp_difference/mean": 0.14837928116321564, + "step": 585 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.31175045669078827, + "epoch": 1.5421052631578949, + "grad_norm": 0.007316979113966227, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 
586 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.31634584069252014, + "epoch": 1.5447368421052632, + "grad_norm": 0.006950597278773785, + "learning_rate": 1e-06, + "loss": 0.0023, + "step": 587 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.31104812026023865, + "epoch": 1.5473684210526315, + "grad_norm": 0.00765888299793005, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 355.30078125, + "completions/mean_terminated_length": 355.30078125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.30843158066272736, + "epoch": 1.55, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.009021712467074394, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 164348847.0, + "reward": 0.6466308832168579, + "reward_std": 0.09964179992675781, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.640625, + "rewards/symbolic_reward_accuracy/std": 0.48028653860092163, + "rewards/symbolic_reward_partial_score/mean": 0.8741861581802368, + "rewards/symbolic_reward_partial_score/std": 0.19597046077251434, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0768463611602783, + "sampling/importance_sampling_ratio/min": 0.0002325253444723785, 
+ "sampling/sampling_logp_difference/max": 8.366511344909668, + "sampling/sampling_logp_difference/mean": 0.14811888337135315, + "step": 589 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.31392180919647217, + "epoch": 1.5526315789473686, + "grad_norm": 0.005683423485606909, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 590 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.31084367632865906, + "epoch": 1.555263157894737, + "grad_norm": 0.011430696584284306, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 591 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.3137829303741455, + "epoch": 1.5578947368421052, + "grad_norm": 0.0035277451388537884, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 351.935546875, + "completions/mean_terminated_length": 351.935546875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.31604452431201935, + "epoch": 1.5605263157894735, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.01125564705580473, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 164937198.0, + "reward": 0.6098132133483887, + "reward_std": 0.16237008571624756, + "rewards/progression_diversity/mean": -0.00012535469431895763, + "rewards/progression_diversity/std": 
0.0018754107877612114, + "rewards/symbolic_reward_accuracy/mean": 0.583984375, + "rewards/symbolic_reward_accuracy/std": 0.493378221988678, + "rewards/symbolic_reward_partial_score/mean": 0.86474609375, + "rewards/symbolic_reward_partial_score/std": 0.1801557093858719, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.078462839126587, + "sampling/importance_sampling_ratio/min": 4.561973582895007e-06, + "sampling/sampling_logp_difference/max": 12.297755241394043, + "sampling/sampling_logp_difference/mean": 0.14773216843605042, + "step": 593 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.30913083255290985, + "epoch": 1.563157894736842, + "grad_norm": 0.007575163152068853, + "learning_rate": 1e-06, + "loss": -0.0019, + "step": 594 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.310193806886673, + "epoch": 1.5657894736842106, + "grad_norm": 0.007082466036081314, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 595 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3109205365180969, + "epoch": 1.568421052631579, + "grad_norm": 0.008095265366137028, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 355.970703125, + 
"completions/mean_terminated_length": 355.970703125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.3134462833404541, + "epoch": 1.5710526315789473, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.012334701605141163, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 165526111.0, + "reward": 0.64453125, + "reward_std": 0.167199969291687, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.63671875, + "rewards/symbolic_reward_accuracy/std": 0.4814152419567108, + "rewards/symbolic_reward_partial_score/mean": 0.875, + "rewards/symbolic_reward_partial_score/std": 0.18969999253749847, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0783336162567139, + "sampling/importance_sampling_ratio/min": 2.1293792542564915e-06, + "sampling/sampling_logp_difference/max": 13.059679985046387, + "sampling/sampling_logp_difference/mean": 0.14881224930286407, + "step": 597 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.316374272108078, + "epoch": 1.5736842105263158, + "grad_norm": 0.008729521185159683, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 598 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.30681292712688446, + "epoch": 1.5763157894736843, + "grad_norm": 0.0042996518313884735, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 599 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 
0.31428535282611847, + "epoch": 1.5789473684210527, + "grad_norm": 0.008006506599485874, + "learning_rate": 1e-06, + "loss": 0.0029, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 352.412109375, + "completions/mean_terminated_length": 352.412109375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.30341216921806335, + "epoch": 1.581578947368421, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.012584522366523743, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 166133714.0, + "reward": 0.6537594795227051, + "reward_std": 0.15127253532409668, + "rewards/progression_diversity/mean": -3.323890268802643e-05, + "rewards/progression_diversity/std": 0.0007521104998886585, + "rewards/symbolic_reward_accuracy/mean": 0.646484375, + "rewards/symbolic_reward_accuracy/std": 0.47852855920791626, + "rewards/symbolic_reward_partial_score/mean": 0.8868814706802368, + "rewards/symbolic_reward_partial_score/std": 0.17624127864837646, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0774664878845215, + "sampling/importance_sampling_ratio/min": 0.0012332568876445293, + "sampling/sampling_logp_difference/max": 6.698096752166748, + "sampling/sampling_logp_difference/mean": 0.14874780178070068, + "step": 601 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.31365664303302765, + "epoch": 1.5842105263157895, + "grad_norm": 0.008380788378417492, + "learning_rate": 1e-06, + 
"loss": -0.0003, + "step": 602 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3135155141353607, + "epoch": 1.586842105263158, + "grad_norm": 0.006163998506963253, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 603 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.316244512796402, + "epoch": 1.5894736842105264, + "grad_norm": 0.009533174335956573, + "learning_rate": 1e-06, + "loss": -0.0029, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 352.10546875, + "completions/mean_terminated_length": 352.10546875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.30988338589668274, + "epoch": 1.5921052631578947, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.009614565409719944, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 166705288.0, + "reward": 0.6556152105331421, + "reward_std": 0.14926879107952118, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.642578125, + "rewards/symbolic_reward_accuracy/std": 0.4797092080116272, + "rewards/symbolic_reward_partial_score/mean": 0.9002279043197632, + "rewards/symbolic_reward_partial_score/std": 0.15159675478935242, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0770351886749268, + 
"sampling/importance_sampling_ratio/min": 7.399953756248578e-05, + "sampling/sampling_logp_difference/max": 9.511451721191406, + "sampling/sampling_logp_difference/mean": 0.14816106855869293, + "step": 605 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.3066626638174057, + "epoch": 1.594736842105263, + "grad_norm": 0.006337527651339769, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 606 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3137834966182709, + "epoch": 1.5973684210526315, + "grad_norm": 0.0055765085853636265, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 607 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.31103527545928955, + "epoch": 1.6, + "grad_norm": 0.0059480974450707436, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 350.796875, + "completions/mean_terminated_length": 350.796875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3112498074769974, + "epoch": 1.6026315789473684, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00678864074870944, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 167282688.0, + "reward": 0.6035632491111755, + "reward_std": 0.06579291820526123, + "rewards/progression_diversity/mean": -0.00012304651318117976, + 
"rewards/progression_diversity/std": 0.0026907166466116905, + "rewards/symbolic_reward_accuracy/mean": 0.57421875, + "rewards/symbolic_reward_accuracy/std": 0.4949444830417633, + "rewards/symbolic_reward_partial_score/mean": 0.8634439706802368, + "rewards/symbolic_reward_partial_score/std": 0.17649410665035248, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.077202320098877, + "sampling/importance_sampling_ratio/min": 0.0004482800140976906, + "sampling/sampling_logp_difference/max": 7.710092544555664, + "sampling/sampling_logp_difference/mean": 0.14757178723812103, + "step": 609 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3036888688802719, + "epoch": 1.6052631578947367, + "grad_norm": 0.006577013060450554, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 610 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.31006863713264465, + "epoch": 1.6078947368421053, + "grad_norm": 0.005068251863121986, + "learning_rate": 1e-06, + "loss": -0.0019, + "step": 611 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.30943046510219574, + "epoch": 1.6105263157894738, + "grad_norm": 0.006940970662981272, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + 
"completions/max_terminated_length": 725.0, + "completions/mean_length": 348.203125, + "completions/mean_terminated_length": 348.203125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.30800144374370575, + "epoch": 1.6131578947368421, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.010091869160532951, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 167850440.0, + "reward": 0.6912109851837158, + "reward_std": 0.1221666932106018, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.705078125, + "rewards/symbolic_reward_accuracy/std": 0.4564536213874817, + "rewards/symbolic_reward_partial_score/mean": 0.8938802480697632, + "rewards/symbolic_reward_partial_score/std": 0.1832769364118576, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0772395133972168, + "sampling/importance_sampling_ratio/min": 0.00012781730038113892, + "sampling/sampling_logp_difference/max": 8.964908599853516, + "sampling/sampling_logp_difference/mean": 0.14840662479400635, + "step": 613 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.30726876854896545, + "epoch": 1.6157894736842104, + "grad_norm": 0.006161559373140335, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 614 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3138604611158371, + "epoch": 1.618421052631579, + "grad_norm": 0.00567243155092001, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 615 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + 
"clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.31249263882637024, + "epoch": 1.6210526315789475, + "grad_norm": 0.0038140625692903996, + "learning_rate": 1e-06, + "loss": 0.0024, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 340.693359375, + "completions/mean_terminated_length": 340.693359375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.30807264149188995, + "epoch": 1.6236842105263158, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.009525323286652565, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 168432171.0, + "reward": 0.64228355884552, + "reward_std": 0.11010335385799408, + "rewards/progression_diversity/mean": -0.00016110966680571437, + "rewards/progression_diversity/std": 0.002038724021986127, + "rewards/symbolic_reward_accuracy/mean": 0.626953125, + "rewards/symbolic_reward_accuracy/std": 0.48408737778663635, + "rewards/symbolic_reward_partial_score/mean": 0.8870443105697632, + "rewards/symbolic_reward_partial_score/std": 0.16728946566581726, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0766582489013672, + "sampling/importance_sampling_ratio/min": 3.1295207008952275e-05, + "sampling/sampling_logp_difference/max": 10.372045516967773, + "sampling/sampling_logp_difference/mean": 0.14576473832130432, + "step": 617 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3076433092355728, + 
"epoch": 1.6263157894736842, + "grad_norm": 0.005641128867864609, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 618 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.30672571063041687, + "epoch": 1.6289473684210525, + "grad_norm": 0.00586570193991065, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 619 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3051045835018158, + "epoch": 1.631578947368421, + "grad_norm": 0.006130191031843424, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 344.671875, + "completions/mean_terminated_length": 344.671875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.3110572397708893, + "epoch": 1.6342105263157896, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006634837947785854, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 169024803.0, + "reward": 0.6852539777755737, + "reward_std": 0.10974645614624023, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.69140625, + "rewards/symbolic_reward_accuracy/std": 0.4623647928237915, + "rewards/symbolic_reward_partial_score/mean": 0.9013671875, + "rewards/symbolic_reward_partial_score/std": 0.1646461933851242, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0777833461761475, + "sampling/importance_sampling_ratio/min": 0.0019899923354387283, + "sampling/sampling_logp_difference/max": 6.2196245193481445, + "sampling/sampling_logp_difference/mean": 0.14576740562915802, + "step": 621 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.3100120574235916, + "epoch": 1.6368421052631579, + "grad_norm": 0.004018013831228018, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 622 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.31171390414237976, + "epoch": 1.6394736842105262, + "grad_norm": 0.008900276385247707, + "learning_rate": 1e-06, + "loss": 0.0015, + "step": 623 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.30071602761745453, + "epoch": 1.6421052631578947, + "grad_norm": 0.007619917392730713, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 352.658203125, + "completions/mean_terminated_length": 352.658203125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.3088865578174591, + "epoch": 1.6447368421052633, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.006969306152313948, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 169577812.0, + "reward": 0.7381347417831421, + "reward_std": 
0.09588056802749634, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.763671875, + "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, + "rewards/symbolic_reward_partial_score/mean": 0.93310546875, + "rewards/symbolic_reward_partial_score/std": 0.12945035099983215, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0774608850479126, + "sampling/importance_sampling_ratio/min": 0.0002049918402917683, + "sampling/sampling_logp_difference/max": 8.49254035949707, + "sampling/sampling_logp_difference/mean": 0.14746268093585968, + "step": 625 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.31198354065418243, + "epoch": 1.6473684210526316, + "grad_norm": 0.005133276339620352, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 626 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.3059317022562027, + "epoch": 1.65, + "grad_norm": 0.005952898412942886, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 627 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3020048588514328, + "epoch": 1.6526315789473685, + "grad_norm": 0.0032144656870514154, + "learning_rate": 1e-06, + "loss": -0.0015, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + 
"completions/max_terminated_length": 662.0, + "completions/mean_length": 343.083984375, + "completions/mean_terminated_length": 343.083984375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.3132614195346832, + "epoch": 1.655263157894737, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.004867217969149351, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 170130591.0, + "reward": 0.693359375, + "reward_std": 0.11107412725687027, + "rewards/progression_diversity/mean": -1.2624410601347336e-06, + "rewards/progression_diversity/std": 2.8565777029143646e-05, + "rewards/symbolic_reward_accuracy/mean": 0.712890625, + "rewards/symbolic_reward_accuracy/std": 0.45285552740097046, + "rewards/symbolic_reward_partial_score/mean": 0.8854166865348816, + "rewards/symbolic_reward_partial_score/std": 0.20853707194328308, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.077362298965454, + "sampling/importance_sampling_ratio/min": 3.827792897936888e-05, + "sampling/sampling_logp_difference/max": 10.170637130737305, + "sampling/sampling_logp_difference/mean": 0.14775975048542023, + "step": 629 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3046818673610687, + "epoch": 1.6578947368421053, + "grad_norm": 0.003144451417028904, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 630 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.31304094195365906, + "epoch": 1.6605263157894736, + "grad_norm": 0.006349779199808836, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 631 + }, + { + "clip_ratio/high_max": 0.75, + 
"clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.31196820735931396, + "epoch": 1.663157894736842, + "grad_norm": 0.008312534540891647, + "learning_rate": 1e-06, + "loss": 0.0017, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 355.884765625, + "completions/mean_terminated_length": 355.884765625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.30678650736808777, + "epoch": 1.6657894736842105, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.010083966888487339, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 170696260.0, + "reward": 0.7260736227035522, + "reward_std": 0.12834513187408447, + "rewards/progression_diversity/mean": -6.044433393981308e-05, + "rewards/progression_diversity/std": 0.001367699122056365, + "rewards/symbolic_reward_accuracy/mean": 0.751953125, + "rewards/symbolic_reward_accuracy/std": 0.4323015511035919, + "rewards/symbolic_reward_partial_score/mean": 0.9163411855697632, + "rewards/symbolic_reward_partial_score/std": 0.16328899562358856, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0770810842514038, + "sampling/importance_sampling_ratio/min": 0.003595878602936864, + "sampling/sampling_logp_difference/max": 5.62796688079834, + "sampling/sampling_logp_difference/mean": 0.14828580617904663, + "step": 633 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 
0.314095139503479, + "epoch": 1.668421052631579, + "grad_norm": 0.0036648185923695564, + "learning_rate": 1e-06, + "loss": -0.0013, + "step": 634 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.31373080611228943, + "epoch": 1.6710526315789473, + "grad_norm": 0.003688375698402524, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 635 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.30743128061294556, + "epoch": 1.6736842105263157, + "grad_norm": 0.0059085749089717865, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 346.625, + "completions/mean_terminated_length": 346.625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.3069046139717102, + "epoch": 1.6763157894736842, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.006045298185199499, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 171266692.0, + "reward": 0.7287590503692627, + "reward_std": 0.10971052944660187, + "rewards/progression_diversity/mean": -7.262287545017898e-05, + "rewards/progression_diversity/std": 0.0010342065943405032, + "rewards/symbolic_reward_accuracy/mean": 0.7578125, + "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, + "rewards/symbolic_reward_partial_score/mean": 0.91357421875, + "rewards/symbolic_reward_partial_score/std": 0.17370551824569702, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 
0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0787405967712402, + "sampling/importance_sampling_ratio/min": 6.20439004705986e-06, + "sampling/sampling_logp_difference/max": 11.990253448486328, + "sampling/sampling_logp_difference/mean": 0.14685004949569702, + "step": 637 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3091466426849365, + "epoch": 1.6789473684210527, + "grad_norm": 0.007839719764888287, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 638 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.30954572558403015, + "epoch": 1.681578947368421, + "grad_norm": 0.00465408293530345, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 639 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3108525574207306, + "epoch": 1.6842105263157894, + "grad_norm": 0.005443122237920761, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 346.2265625, + "completions/mean_terminated_length": 346.2265625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3054659068584442, + "epoch": 1.686842105263158, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.012117248959839344, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 171841752.0, + 
"reward": 0.6958000659942627, + "reward_std": 0.11660270392894745, + "rewards/progression_diversity/mean": -7.265746535267681e-05, + "rewards/progression_diversity/std": 0.0009757342631928623, + "rewards/symbolic_reward_accuracy/mean": 0.705078125, + "rewards/symbolic_reward_accuracy/std": 0.4564536213874817, + "rewards/symbolic_reward_partial_score/mean": 0.9091796875, + "rewards/symbolic_reward_partial_score/std": 0.1629505455493927, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0758869647979736, + "sampling/importance_sampling_ratio/min": 6.6611269176064525e-06, + "sampling/sampling_logp_difference/max": 11.919221878051758, + "sampling/sampling_logp_difference/mean": 0.1453857272863388, + "step": 641 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3011976033449173, + "epoch": 1.6894736842105265, + "grad_norm": 0.007454789709299803, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 642 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.31143735349178314, + "epoch": 1.6921052631578948, + "grad_norm": 0.005452565383166075, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 643 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.30266809463500977, + "epoch": 1.694736842105263, + "grad_norm": 0.00574772572144866, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 355.8515625, + "completions/mean_terminated_length": 355.8515625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.3147277981042862, + "epoch": 1.6973684210526314, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.007081964984536171, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 172435436.0, + "reward": 0.5885741710662842, + "reward_std": 0.10568203777074814, + "rewards/progression_diversity/mean": -9.049794243765064e-06, + "rewards/progression_diversity/std": 0.0002047734596999362, + "rewards/symbolic_reward_accuracy/mean": 0.552734375, + "rewards/symbolic_reward_accuracy/std": 0.4976975917816162, + "rewards/symbolic_reward_partial_score/mean": 0.8564453125, + "rewards/symbolic_reward_partial_score/std": 0.18515437841415405, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0765762329101562, + "sampling/importance_sampling_ratio/min": 5.255365977063775e-05, + "sampling/sampling_logp_difference/max": 9.853675842285156, + "sampling/sampling_logp_difference/mean": 0.146433487534523, + "step": 645 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.31117910146713257, + "epoch": 1.7, + "grad_norm": 0.004298560321331024, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 646 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.3119800388813019, + "epoch": 1.7026315789473685, + "grad_norm": 0.006341192871332169, + "learning_rate": 1e-06, 
+ "loss": -0.0015, + "step": 647 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.3077608495950699, + "epoch": 1.7052631578947368, + "grad_norm": 0.004039146471768618, + "learning_rate": 1e-06, + "loss": 0.0024, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 361.634765625, + "completions/mean_terminated_length": 361.634765625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.30650724470615387, + "epoch": 1.7078947368421051, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.007845344953238964, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 173023057.0, + "reward": 0.6627441644668579, + "reward_std": 0.07411643862724304, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.65625, + "rewards/symbolic_reward_accuracy/std": 0.4754233956336975, + "rewards/symbolic_reward_partial_score/mean": 0.8966470956802368, + "rewards/symbolic_reward_partial_score/std": 0.16268277168273926, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0776004791259766, + "sampling/importance_sampling_ratio/min": 0.00023482624965254217, + "sampling/sampling_logp_difference/max": 8.356664657592773, + "sampling/sampling_logp_difference/mean": 0.14773426949977875, + "step": 649 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.15625, + "entropy": 0.31050027906894684, + "epoch": 1.7105263157894737, + "grad_norm": 0.0048981113359332085, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 650 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.31170788407325745, + "epoch": 1.7131578947368422, + "grad_norm": 0.004619154147803783, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 651 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3152112662792206, + "epoch": 1.7157894736842105, + "grad_norm": 0.0038981193210929632, + "learning_rate": 1e-06, + "loss": -0.0013, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 349.685546875, + "completions/mean_terminated_length": 349.685546875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.31783896684646606, + "epoch": 1.7184210526315788, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0073747457936406136, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 173585712.0, + "reward": 0.7288573384284973, + "reward_std": 0.06641851365566254, + "rewards/progression_diversity/mean": -1.288700968871126e-05, + "rewards/progression_diversity/std": 0.00029159971745684743, + "rewards/symbolic_reward_accuracy/mean": 0.748046875, + "rewards/symbolic_reward_accuracy/std": 0.43455907702445984, + "rewards/symbolic_reward_partial_score/mean": 0.9334309697151184, + "rewards/symbolic_reward_partial_score/std": 0.13425354659557343, + 
"rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0794637203216553, + "sampling/importance_sampling_ratio/min": 5.184490703413758e-08, + "sampling/sampling_logp_difference/max": 16.775009155273438, + "sampling/sampling_logp_difference/mean": 0.15089115500450134, + "step": 653 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.31695644557476044, + "epoch": 1.7210526315789474, + "grad_norm": 0.0054539949633181095, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 654 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3197050541639328, + "epoch": 1.723684210526316, + "grad_norm": 0.005290233064442873, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 655 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3151402175426483, + "epoch": 1.7263157894736842, + "grad_norm": 0.009018939919769764, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 348.197265625, + "completions/mean_terminated_length": 348.197265625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.3115183562040329, + "epoch": 1.7289473684210526, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.008836961351335049, 
+ "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 174152597.0, + "reward": 0.703369140625, + "reward_std": 0.11740782856941223, + "rewards/progression_diversity/mean": -5.42844645678997e-06, + "rewards/progression_diversity/std": 0.00012283171236049384, + "rewards/symbolic_reward_accuracy/mean": 0.72265625, + "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, + "rewards/symbolic_reward_partial_score/mean": 0.8992513418197632, + "rewards/symbolic_reward_partial_score/std": 0.18293997645378113, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0789730548858643, + "sampling/importance_sampling_ratio/min": 4.3294774513924494e-05, + "sampling/sampling_logp_difference/max": 10.047478675842285, + "sampling/sampling_logp_difference/mean": 0.14934486150741577, + "step": 657 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.31983131170272827, + "epoch": 1.731578947368421, + "grad_norm": 0.006589457858353853, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 658 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.3190360963344574, + "epoch": 1.7342105263157894, + "grad_norm": 0.004673468880355358, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 659 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.31859441101551056, + "epoch": 1.736842105263158, + "grad_norm": 0.008711190894246101, + "learning_rate": 1e-06, + "loss": 0.0022, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 364.14453125, + "completions/mean_terminated_length": 364.14453125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.30695727467536926, + "epoch": 1.7394736842105263, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.00819223653525114, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 174727327.0, + "reward": 0.6963849067687988, + "reward_std": 0.10175984352827072, + "rewards/progression_diversity/mean": -0.00018321775132790208, + "rewards/progression_diversity/std": 0.002992808585986495, + "rewards/symbolic_reward_accuracy/mean": 0.7109375, + "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, + "rewards/symbolic_reward_partial_score/mean": 0.8994140625, + "rewards/symbolic_reward_partial_score/std": 0.16723868250846863, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.077476143836975, + "sampling/importance_sampling_ratio/min": 0.0003383801376912743, + "sampling/sampling_logp_difference/max": 7.991340637207031, + "sampling/sampling_logp_difference/mean": 0.14830249547958374, + "step": 661 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.3172942250967026, + "epoch": 1.7421052631578946, + "grad_norm": 0.0050326017662882805, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 662 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 
0.31450478732585907, + "epoch": 1.7447368421052631, + "grad_norm": 0.004655744414776564, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 663 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.31187137961387634, + "epoch": 1.7473684210526317, + "grad_norm": 0.007312784902751446, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 837.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 361.953125, + "completions/mean_terminated_length": 361.953125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.312186524271965, + "epoch": 1.75, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.006459662225097418, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 175310279.0, + "reward": 0.6182616949081421, + "reward_std": 0.08476275205612183, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.595703125, + "rewards/symbolic_reward_accuracy/std": 0.4912354052066803, + "rewards/symbolic_reward_partial_score/mean": 0.8694661855697632, + "rewards/symbolic_reward_partial_score/std": 0.1792290210723877, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.077375054359436, + "sampling/importance_sampling_ratio/min": 0.00022012810222804546, + "sampling/sampling_logp_difference/max": 8.421300888061523, + "sampling/sampling_logp_difference/mean": 0.14795972406864166, + "step": 665 + }, + { + "clip_ratio/high_max": 0.75, + 
"clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3078509271144867, + "epoch": 1.7526315789473683, + "grad_norm": 0.007247635163366795, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 666 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.3109118938446045, + "epoch": 1.7552631578947369, + "grad_norm": 0.005482961889356375, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 667 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.3142966479063034, + "epoch": 1.7578947368421054, + "grad_norm": 0.005572004709392786, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 355.5546875, + "completions/mean_terminated_length": 355.5546875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3197452425956726, + "epoch": 1.7605263157894737, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.005338034126907587, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 175890979.0, + "reward": 0.6089342832565308, + "reward_std": 0.10050010681152344, + "rewards/progression_diversity/mean": -0.00012931314995512366, + "rewards/progression_diversity/std": 0.00251020141877234, + "rewards/symbolic_reward_accuracy/mean": 0.580078125, + "rewards/symbolic_reward_accuracy/std": 0.4940285086631775, + "rewards/symbolic_reward_partial_score/mean": 0.86962890625, + 
"rewards/symbolic_reward_partial_score/std": 0.17219598591327667, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0790578126907349, + "sampling/importance_sampling_ratio/min": 0.0004860071639996022, + "sampling/sampling_logp_difference/max": 7.629287242889404, + "sampling/sampling_logp_difference/mean": 0.15068799257278442, + "step": 669 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.3192948251962662, + "epoch": 1.763157894736842, + "grad_norm": 0.009029190056025982, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 670 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.32492591440677643, + "epoch": 1.7657894736842106, + "grad_norm": 0.004878129344433546, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 671 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.31461015343666077, + "epoch": 1.768421052631579, + "grad_norm": 0.00928495917469263, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.0, + "completions/max_terminated_length": 667.0, + "completions/mean_length": 353.5234375, + "completions/mean_terminated_length": 353.5234375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.3127034604549408, + "epoch": 1.7710526315789474, + 
"frac_reward_zero_std": 0.5, + "grad_norm": 0.012209014035761356, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 176469775.0, + "reward": 0.7016580104827881, + "reward_std": 0.08149384707212448, + "rewards/progression_diversity/mean": -0.0002171548258047551, + "rewards/progression_diversity/std": 0.004398517310619354, + "rewards/symbolic_reward_accuracy/mean": 0.7109375, + "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, + "rewards/symbolic_reward_partial_score/mean": 0.9169921875, + "rewards/symbolic_reward_partial_score/std": 0.15359680354595184, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0774240493774414, + "sampling/importance_sampling_ratio/min": 4.304129674892465e-07, + "sampling/sampling_logp_difference/max": 14.658520698547363, + "sampling/sampling_logp_difference/mean": 0.14875832200050354, + "step": 673 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.31441599130630493, + "epoch": 1.7736842105263158, + "grad_norm": 0.006872444413602352, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 674 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.3115265518426895, + "epoch": 1.776315789473684, + "grad_norm": 0.0034778716508299112, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 675 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.31803882122039795, + "epoch": 1.7789473684210526, + "grad_norm": 0.007153674028813839, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 676 + 
}, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1000.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 365.19921875, + "completions/mean_terminated_length": 365.19921875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.3086838871240616, + "epoch": 1.7815789473684212, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.005677321460098028, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 177066901.0, + "reward": 0.6431640386581421, + "reward_std": 0.08386066555976868, + "rewards/progression_diversity/mean": -2.6685718239605194e-06, + "rewards/progression_diversity/std": 6.03828884777613e-05, + "rewards/symbolic_reward_accuracy/mean": 0.63671875, + "rewards/symbolic_reward_accuracy/std": 0.4814152419567108, + "rewards/symbolic_reward_partial_score/mean": 0.8704427480697632, + "rewards/symbolic_reward_partial_score/std": 0.19743919372558594, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0781702995300293, + "sampling/importance_sampling_ratio/min": 0.00019859473104588687, + "sampling/sampling_logp_difference/max": 8.52424430847168, + "sampling/sampling_logp_difference/mean": 0.14643803238868713, + "step": 677 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.3097618967294693, + "epoch": 1.7842105263157895, + "grad_norm": 0.008383281528949738, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 678 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.2109375, + "entropy": 0.3085528761148453, + "epoch": 1.7868421052631578, + "grad_norm": 0.004206513985991478, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 679 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3110394924879074, + "epoch": 1.7894736842105263, + "grad_norm": 0.0055688670836389065, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 364.806640625, + "completions/mean_terminated_length": 364.806640625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.31680648028850555, + "epoch": 1.7921052631578949, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.010036691091954708, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 177653842.0, + "reward": 0.6611816883087158, + "reward_std": 0.1240668073296547, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.666015625, + "rewards/symbolic_reward_accuracy/std": 0.47209542989730835, + "rewards/symbolic_reward_partial_score/mean": 0.87255859375, + "rewards/symbolic_reward_partial_score/std": 0.20477424561977386, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0788350105285645, + "sampling/importance_sampling_ratio/min": 0.0001117745487135835, + "sampling/sampling_logp_difference/max": 9.099026679992676, + "sampling/sampling_logp_difference/mean": 
0.14859601855278015, + "step": 681 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.31431975960731506, + "epoch": 1.7947368421052632, + "grad_norm": 0.006399835925549269, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 682 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.3147125542163849, + "epoch": 1.7973684210526315, + "grad_norm": 0.007597615476697683, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 683 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.3126670718193054, + "epoch": 1.8, + "grad_norm": 0.004449205473065376, + "learning_rate": 1e-06, + "loss": -0.0023, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 825.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 375.79296875, + "completions/mean_terminated_length": 375.79296875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.30589479207992554, + "epoch": 1.8026315789473686, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.009832125157117844, + "learning_rate": 1e-06, + "loss": -0.0027, + "num_tokens": 178257416.0, + "reward": 0.690575897693634, + "reward_std": 0.09389779716730118, + "rewards/progression_diversity/mean": -3.32641793647781e-05, + "rewards/progression_diversity/std": 0.0005181218730285764, + "rewards/symbolic_reward_accuracy/mean": 0.703125, + "rewards/symbolic_reward_accuracy/std": 
0.45732781291007996, + "rewards/symbolic_reward_partial_score/mean": 0.8963216543197632, + "rewards/symbolic_reward_partial_score/std": 0.17704488337039948, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0771936178207397, + "sampling/importance_sampling_ratio/min": 0.0028477907180786133, + "sampling/sampling_logp_difference/max": 5.861211776733398, + "sampling/sampling_logp_difference/mean": 0.14920267462730408, + "step": 685 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.31586427986621857, + "epoch": 1.805263157894737, + "grad_norm": 0.0033005087170749903, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 686 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.3129272162914276, + "epoch": 1.8078947368421052, + "grad_norm": 0.0028064732905477285, + "learning_rate": 1e-06, + "loss": 0.0027, + "step": 687 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.3123166412115097, + "epoch": 1.8105263157894735, + "grad_norm": 0.003090722020715475, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 374.056640625, + "completions/mean_terminated_length": 374.056640625, + "completions/min_length": 
140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.3101574629545212, + "epoch": 1.813157894736842, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.007430217228829861, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 178852581.0, + "reward": 0.6585937142372131, + "reward_std": 0.06606994569301605, + "rewards/progression_diversity/mean": -8.257005902123637e-06, + "rewards/progression_diversity/std": 0.00018683471716940403, + "rewards/symbolic_reward_accuracy/mean": 0.65625, + "rewards/symbolic_reward_accuracy/std": 0.4754233956336975, + "rewards/symbolic_reward_partial_score/mean": 0.8828125, + "rewards/symbolic_reward_partial_score/std": 0.174610897898674, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0775363445281982, + "sampling/importance_sampling_ratio/min": 0.0027425093576312065, + "sampling/sampling_logp_difference/max": 5.898881912231445, + "sampling/sampling_logp_difference/mean": 0.14803138375282288, + "step": 689 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.31864188611507416, + "epoch": 1.8157894736842106, + "grad_norm": 0.0023194809909909964, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 690 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3088736832141876, + "epoch": 1.818421052631579, + "grad_norm": 0.004817016888409853, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 691 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.31209132075309753, + "epoch": 
1.8210526315789473, + "grad_norm": 0.005796011071652174, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1013.0, + "completions/max_terminated_length": 1013.0, + "completions/mean_length": 386.078125, + "completions/mean_terminated_length": 386.078125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.30596648156642914, + "epoch": 1.8236842105263158, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.009517088532447815, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 179452397.0, + "reward": 0.6856445670127869, + "reward_std": 0.09474059194326401, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.693359375, + "rewards/symbolic_reward_accuracy/std": 0.4615498185157776, + "rewards/symbolic_reward_partial_score/mean": 0.8987630605697632, + "rewards/symbolic_reward_partial_score/std": 0.16487817466259003, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0773887634277344, + "sampling/importance_sampling_ratio/min": 0.006162859965115786, + "sampling/sampling_logp_difference/max": 5.089214324951172, + "sampling/sampling_logp_difference/mean": 0.14641115069389343, + "step": 693 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.3115035742521286, + "epoch": 1.8263157894736843, + "grad_norm": 0.006516161374747753, + "learning_rate": 1e-06, + "loss": -0.0015, + "step": 694 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 
0.125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.30916786193847656, + "epoch": 1.8289473684210527, + "grad_norm": 0.005851257126778364, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 695 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3106692433357239, + "epoch": 1.831578947368421, + "grad_norm": 0.005098440684378147, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 380.732421875, + "completions/mean_terminated_length": 380.732421875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.31172554194927216, + "epoch": 1.8342105263157895, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.008900835178792477, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 180043140.0, + "reward": 0.690136194229126, + "reward_std": 0.1291923224925995, + "rewards/progression_diversity/mean": -5.790265277028084e-05, + "rewards/progression_diversity/std": 0.0011140767019242048, + "rewards/symbolic_reward_accuracy/mean": 0.701171875, + "rewards/symbolic_reward_accuracy/std": 0.45819199085235596, + "rewards/symbolic_reward_partial_score/mean": 0.8987630605697632, + "rewards/symbolic_reward_partial_score/std": 0.1765023171901703, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0769742727279663, + "sampling/importance_sampling_ratio/min": 5.942506959399907e-06, 
+ "sampling/sampling_logp_difference/max": 12.033379554748535, + "sampling/sampling_logp_difference/mean": 0.1485772430896759, + "step": 697 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.31369997560977936, + "epoch": 1.836842105263158, + "grad_norm": 0.007809172384440899, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 698 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.31465624272823334, + "epoch": 1.8394736842105264, + "grad_norm": 0.009294657967984676, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 699 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.3038342297077179, + "epoch": 1.8421052631578947, + "grad_norm": 0.011830865405499935, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 385.259765625, + "completions/mean_terminated_length": 385.259765625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.30587026476860046, + "epoch": 1.844736842105263, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.0067597865127027035, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 180647529.0, + "reward": 0.722265362739563, + "reward_std": 0.12456189095973969, + "rewards/progression_diversity/mean": -2.3203070668387227e-05, + "rewards/progression_diversity/std": 
0.0005250255926512182, + "rewards/symbolic_reward_accuracy/mean": 0.73828125, + "rewards/symbolic_reward_accuracy/std": 0.44000017642974854, + "rewards/symbolic_reward_partial_score/mean": 0.9309896230697632, + "rewards/symbolic_reward_partial_score/std": 0.12603533267974854, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0772705078125, + "sampling/importance_sampling_ratio/min": 2.513166327844374e-05, + "sampling/sampling_logp_difference/max": 10.591382026672363, + "sampling/sampling_logp_difference/mean": 0.14627277851104736, + "step": 701 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3159767836332321, + "epoch": 1.8473684210526315, + "grad_norm": 0.008303055539727211, + "learning_rate": 1e-06, + "loss": 0.0015, + "step": 702 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3041051924228668, + "epoch": 1.85, + "grad_norm": 0.0064351242035627365, + "learning_rate": 1e-06, + "loss": -0.0025, + "step": 703 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.3142499476671219, + "epoch": 1.8526315789473684, + "grad_norm": 0.005516021512448788, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 769.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 375.818359375, + 
"completions/mean_terminated_length": 375.818359375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.31115204095840454, + "epoch": 1.8552631578947367, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.009341200813651085, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 181254444.0, + "reward": 0.6875925064086914, + "reward_std": 0.09061338007450104, + "rewards/progression_diversity/mean": -0.0005182477761991322, + "rewards/progression_diversity/std": 0.008718207478523254, + "rewards/symbolic_reward_accuracy/mean": 0.6875, + "rewards/symbolic_reward_accuracy/std": 0.4639657139778137, + "rewards/symbolic_reward_partial_score/mean": 0.9169921875, + "rewards/symbolic_reward_partial_score/std": 0.14258471131324768, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0775392055511475, + "sampling/importance_sampling_ratio/min": 0.001974229235202074, + "sampling/sampling_logp_difference/max": 6.227577209472656, + "sampling/sampling_logp_difference/mean": 0.14746874570846558, + "step": 705 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.31405113637447357, + "epoch": 1.8578947368421053, + "grad_norm": 0.007201730273663998, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 706 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.31317275762557983, + "epoch": 1.8605263157894738, + "grad_norm": 0.0077608609572052956, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 707 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.2265625, + "entropy": 0.30931928753852844, + "epoch": 1.8631578947368421, + "grad_norm": 0.007706182077527046, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 372.041015625, + "completions/mean_terminated_length": 372.041015625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.31280550360679626, + "epoch": 1.8657894736842104, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.007203435059636831, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 181856577.0, + "reward": 0.6337395906448364, + "reward_std": 0.11881368607282639, + "rewards/progression_diversity/mean": -7.104627729859203e-05, + "rewards/progression_diversity/std": 0.0014897359069436789, + "rewards/symbolic_reward_accuracy/mean": 0.615234375, + "rewards/symbolic_reward_accuracy/std": 0.4870156943798065, + "rewards/symbolic_reward_partial_score/mean": 0.8819986581802368, + "rewards/symbolic_reward_partial_score/std": 0.17834146320819855, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.078622579574585, + "sampling/importance_sampling_ratio/min": 0.000638676225207746, + "sampling/sampling_logp_difference/max": 7.356112957000732, + "sampling/sampling_logp_difference/mean": 0.14934757351875305, + "step": 709 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.31266067922115326, + "epoch": 1.868421052631579, + "grad_norm": 0.005193103104829788, + 
"learning_rate": 1e-06, + "loss": -0.0029, + "step": 710 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.3179129362106323, + "epoch": 1.8710526315789475, + "grad_norm": 0.004717197269201279, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 711 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.32021914422512054, + "epoch": 1.8736842105263158, + "grad_norm": 0.006537822540849447, + "learning_rate": 1e-06, + "loss": 0.0017, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1365.0, + "completions/mean_length": 409.322265625, + "completions/mean_terminated_length": 378.0606689453125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.3134091943502426, + "epoch": 1.8763157894736842, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.007834019139409065, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 182449766.0, + "reward": 0.7249389290809631, + "reward_std": 0.09784473478794098, + "rewards/progression_diversity/mean": -0.0012266990961506963, + "rewards/progression_diversity/std": 0.02737962268292904, + "rewards/symbolic_reward_accuracy/mean": 0.75390625, + "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, + "rewards/symbolic_reward_partial_score/mean": 0.90869140625, + "rewards/symbolic_reward_partial_score/std": 0.17205534875392914, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0749471187591553, + "sampling/importance_sampling_ratio/min": 0.004973389208316803, + "sampling/sampling_logp_difference/max": 5.303653717041016, + "sampling/sampling_logp_difference/mean": 0.14320877194404602, + "step": 713 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3065633177757263, + "epoch": 1.8789473684210525, + "grad_norm": 0.00554261077195406, + "learning_rate": 1e-06, + "loss": 0.0016, + "step": 714 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.312502920627594, + "epoch": 1.881578947368421, + "grad_norm": 0.006236112676560879, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 715 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.31002379953861237, + "epoch": 1.8842105263157896, + "grad_norm": 0.0022626114077866077, + "learning_rate": 1e-06, + "loss": -0.0024, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 370.48046875, + "completions/mean_terminated_length": 370.48046875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.31257370114326477, + "epoch": 1.8868421052631579, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.007542754523456097, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 183043100.0, + "reward": 0.6575683951377869, + "reward_std": 
0.09304552525281906, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.671875, + "rewards/symbolic_reward_accuracy/std": 0.4699897766113281, + "rewards/symbolic_reward_partial_score/mean": 0.84814453125, + "rewards/symbolic_reward_partial_score/std": 0.23802095651626587, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0778599977493286, + "sampling/importance_sampling_ratio/min": 0.00111155875492841, + "sampling/sampling_logp_difference/max": 6.801991939544678, + "sampling/sampling_logp_difference/mean": 0.14849314093589783, + "step": 717 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.31163640320301056, + "epoch": 1.8894736842105262, + "grad_norm": 0.007604603189975023, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 718 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3117101490497589, + "epoch": 1.8921052631578947, + "grad_norm": 0.007121788803488016, + "learning_rate": 1e-06, + "loss": -0.0024, + "step": 719 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3161403089761734, + "epoch": 1.8947368421052633, + "grad_norm": 0.004191112704575062, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 
764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 365.009765625, + "completions/mean_terminated_length": 365.009765625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.3126900643110275, + "epoch": 1.8973684210526316, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.00634095398709178, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 183639809.0, + "reward": 0.6971662640571594, + "reward_std": 0.07519043982028961, + "rewards/progression_diversity/mean": -0.00017441027739550918, + "rewards/progression_diversity/std": 0.0029865563847124577, + "rewards/symbolic_reward_accuracy/mean": 0.708984375, + "rewards/symbolic_reward_accuracy/std": 0.45467492938041687, + "rewards/symbolic_reward_partial_score/mean": 0.9059244394302368, + "rewards/symbolic_reward_partial_score/std": 0.16108950972557068, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.077085018157959, + "sampling/importance_sampling_ratio/min": 0.00030413156491704285, + "sampling/sampling_logp_difference/max": 8.098050117492676, + "sampling/sampling_logp_difference/mean": 0.1494484543800354, + "step": 721 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.31474678218364716, + "epoch": 1.9, + "grad_norm": 0.007915548980236053, + "learning_rate": 1e-06, + "loss": 0.0025, + "step": 722 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.31036464869976044, + "epoch": 1.9026315789473685, + "grad_norm": 0.004422432277351618, + "learning_rate": 1e-06, + "loss": -0.0015, + "step": 723 + }, + { + "clip_ratio/high_max": 1.0, + 
"clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.3158973455429077, + "epoch": 1.905263157894737, + "grad_norm": 0.005758347921073437, + "learning_rate": 1e-06, + "loss": -0.0024, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 398.349609375, + "completions/mean_terminated_length": 367.0665283203125, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.3149125427007675, + "epoch": 1.9078947368421053, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.008096176199615002, + "learning_rate": 1e-06, + "loss": 0.0283, + "num_tokens": 184239732.0, + "reward": 0.6666892766952515, + "reward_std": 0.06731939315795898, + "rewards/progression_diversity/mean": -0.0009964853525161743, + "rewards/progression_diversity/std": 0.0225478895008564, + "rewards/symbolic_reward_accuracy/mean": 0.671875, + "rewards/symbolic_reward_accuracy/std": 0.4699897766113281, + "rewards/symbolic_reward_partial_score/mean": 0.8792318105697632, + "rewards/symbolic_reward_partial_score/std": 0.1995687186717987, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0768595933914185, + "sampling/importance_sampling_ratio/min": 0.0012633983278647065, + "sampling/sampling_logp_difference/max": 6.6739501953125, + "sampling/sampling_logp_difference/mean": 0.14742116630077362, + "step": 725 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.1640625, + "entropy": 0.31597861647605896, + "epoch": 1.9105263157894736, + "grad_norm": 0.0031310454942286015, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 726 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.31505411863327026, + "epoch": 1.913157894736842, + "grad_norm": 0.005425718612968922, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 727 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.3154520094394684, + "epoch": 1.9157894736842105, + "grad_norm": 0.003204651176929474, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 371.125, + "completions/mean_terminated_length": 371.125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.31483209133148193, + "epoch": 1.918421052631579, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.009964163415133953, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 184853588.0, + "reward": 0.6192383170127869, + "reward_std": 0.08837610483169556, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.603515625, + "rewards/symbolic_reward_accuracy/std": 0.4896455705165863, + "rewards/symbolic_reward_partial_score/mean": 0.8570963144302368, + "rewards/symbolic_reward_partial_score/std": 0.21023648977279663, + "rewards/tag_count_reward/mean": 0.0, + 
"rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0781691074371338, + "sampling/importance_sampling_ratio/min": 3.291745451861061e-05, + "sampling/sampling_logp_difference/max": 10.321507453918457, + "sampling/sampling_logp_difference/mean": 0.14980000257492065, + "step": 729 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.31423668563365936, + "epoch": 1.9210526315789473, + "grad_norm": 0.005938328802585602, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 730 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.31109851598739624, + "epoch": 1.9236842105263157, + "grad_norm": 0.006710145156830549, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 731 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.31440161168575287, + "epoch": 1.9263157894736842, + "grad_norm": 0.007554585114121437, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 741.0, + "completions/max_terminated_length": 741.0, + "completions/mean_length": 358.267578125, + "completions/mean_terminated_length": 358.267578125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.3080116808414459, + "epoch": 1.9289473684210527, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.004908265545964241, + "learning_rate": 1e-06, + 
"loss": -0.0023, + "num_tokens": 185426653.0, + "reward": 0.6898437738418579, + "reward_std": 0.11878422647714615, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.693359375, + "rewards/symbolic_reward_accuracy/std": 0.4615498185157776, + "rewards/symbolic_reward_partial_score/mean": 0.9134114980697632, + "rewards/symbolic_reward_partial_score/std": 0.14149527251720428, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0759687423706055, + "sampling/importance_sampling_ratio/min": 3.671062586363405e-05, + "sampling/sampling_logp_difference/max": 10.212444305419922, + "sampling/sampling_logp_difference/mean": 0.14870089292526245, + "step": 733 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3127932846546173, + "epoch": 1.931578947368421, + "grad_norm": 0.0042357915081083775, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 734 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.3097704350948334, + "epoch": 1.9342105263157894, + "grad_norm": 0.003953521605581045, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 735 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3147825747728348, + "epoch": 1.936842105263158, + "grad_norm": 0.00491707818582654, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 907.0, + "completions/max_terminated_length": 907.0, + "completions/mean_length": 364.462890625, + "completions/mean_terminated_length": 364.462890625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.31078068912029266, + "epoch": 1.9394736842105265, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.008055577054619789, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 186010890.0, + "reward": 0.7134765982627869, + "reward_std": 0.11845691502094269, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.7265625, + "rewards/symbolic_reward_accuracy/std": 0.4461594223976135, + "rewards/symbolic_reward_partial_score/mean": 0.9251301884651184, + "rewards/symbolic_reward_partial_score/std": 0.137575164437294, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0782480239868164, + "sampling/importance_sampling_ratio/min": 0.0007956507615745068, + "sampling/sampling_logp_difference/max": 7.136350154876709, + "sampling/sampling_logp_difference/mean": 0.147745743393898, + "step": 737 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3155370503664017, + "epoch": 1.9421052631578948, + "grad_norm": 0.003693908918648958, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 738 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.30935104191303253, + "epoch": 1.944736842105263, + "grad_norm": 0.004344849847257137, + 
"learning_rate": 1e-06, + "loss": 0.0003, + "step": 739 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.31252725422382355, + "epoch": 1.9473684210526314, + "grad_norm": 0.007028626743704081, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 917.0, + "completions/max_terminated_length": 917.0, + "completions/mean_length": 366.111328125, + "completions/mean_terminated_length": 366.111328125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.3120097368955612, + "epoch": 1.95, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.012667186558246613, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 186614499.0, + "reward": 0.6177229285240173, + "reward_std": 0.08512495458126068, + "rewards/progression_diversity/mean": -0.00017063260020222515, + "rewards/progression_diversity/std": 0.0029495148919522762, + "rewards/symbolic_reward_accuracy/mean": 0.59375, + "rewards/symbolic_reward_accuracy/std": 0.49161264300346375, + "rewards/symbolic_reward_partial_score/mean": 0.87158203125, + "rewards/symbolic_reward_partial_score/std": 0.17591506242752075, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0777539014816284, + "sampling/importance_sampling_ratio/min": 0.0005583280581049621, + "sampling/sampling_logp_difference/max": 7.490563869476318, + "sampling/sampling_logp_difference/mean": 0.14936181902885437, + "step": 741 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 
0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.3076600730419159, + "epoch": 1.9526315789473685, + "grad_norm": 0.008351475931704044, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 742 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.3145374208688736, + "epoch": 1.9552631578947368, + "grad_norm": 0.0027690737042576075, + "learning_rate": 1e-06, + "loss": -0.0015, + "step": 743 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3131014108657837, + "epoch": 1.9578947368421051, + "grad_norm": 0.005030886270105839, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 369.201171875, + "completions/mean_terminated_length": 369.201171875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.31135545670986176, + "epoch": 1.9605263157894737, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.008429143577814102, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 187226026.0, + "reward": 0.7362792491912842, + "reward_std": 0.08533239364624023, + "rewards/progression_diversity/mean": -1.097430595109472e-05, + "rewards/progression_diversity/std": 0.00024832019698806107, + "rewards/symbolic_reward_accuracy/mean": 0.76171875, + "rewards/symbolic_reward_accuracy/std": 0.42644867300987244, + "rewards/symbolic_reward_partial_score/mean": 0.9308267831802368, + 
"rewards/symbolic_reward_partial_score/std": 0.13665828108787537, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0770303010940552, + "sampling/importance_sampling_ratio/min": 0.0008326321840286255, + "sampling/sampling_logp_difference/max": 7.09091854095459, + "sampling/sampling_logp_difference/mean": 0.14952237904071808, + "step": 745 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.31318385899066925, + "epoch": 1.9631578947368422, + "grad_norm": 0.0033032975625246763, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 746 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.31359700858592987, + "epoch": 1.9657894736842105, + "grad_norm": 0.007935271598398685, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 747 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3112269937992096, + "epoch": 1.9684210526315788, + "grad_norm": 0.00817700196057558, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 370.04296875, + "completions/mean_terminated_length": 370.04296875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.3109482377767563, + "epoch": 
1.9710526315789474, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.006165255792438984, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 187772416.0, + "reward": 0.7406250238418579, + "reward_std": 0.09467050433158875, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.779296875, + "rewards/symbolic_reward_accuracy/std": 0.4151262938976288, + "rewards/symbolic_reward_partial_score/mean": 0.91015625, + "rewards/symbolic_reward_partial_score/std": 0.1857500523328781, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.076927661895752, + "sampling/importance_sampling_ratio/min": 7.953165550134145e-06, + "sampling/sampling_logp_difference/max": 11.74194049835205, + "sampling/sampling_logp_difference/mean": 0.1476995050907135, + "step": 749 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.308622270822525, + "epoch": 1.973684210526316, + "grad_norm": 0.0047165341675281525, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 750 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3079015612602234, + "epoch": 1.9763157894736842, + "grad_norm": 0.0040458752773702145, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 751 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.30972377955913544, + "epoch": 1.9789473684210526, + "grad_norm": 0.004278114996850491, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 752 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 358.255859375, + "completions/mean_terminated_length": 358.255859375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.3107995539903641, + "epoch": 1.981578947368421, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.007798196282237768, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 188375491.0, + "reward": 0.6957031488418579, + "reward_std": 0.07784873992204666, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.70703125, + "rewards/symbolic_reward_accuracy/std": 0.455569326877594, + "rewards/symbolic_reward_partial_score/mean": 0.9049478769302368, + "rewards/symbolic_reward_partial_score/std": 0.16625335812568665, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0773422718048096, + "sampling/importance_sampling_ratio/min": 0.00029975359211675823, + "sampling/sampling_logp_difference/max": 8.112549781799316, + "sampling/sampling_logp_difference/mean": 0.14972257614135742, + "step": 753 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3143154978752136, + "epoch": 1.9842105263157894, + "grad_norm": 0.004069626331329346, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 754 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 
0.31426501274108887, + "epoch": 1.986842105263158, + "grad_norm": 0.0061919731087982655, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 755 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.31135572493076324, + "epoch": 1.9894736842105263, + "grad_norm": 0.004079555626958609, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 357.47265625, + "completions/mean_terminated_length": 357.47265625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.309786781668663, + "epoch": 1.9921052631578946, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.00555258197709918, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 188963829.0, + "reward": 0.6704590320587158, + "reward_std": 0.10458528995513916, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.669921875, + "rewards/symbolic_reward_accuracy/std": 0.47070086002349854, + "rewards/symbolic_reward_partial_score/mean": 0.89501953125, + "rewards/symbolic_reward_partial_score/std": 0.16062310338020325, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0771996974945068, + "sampling/importance_sampling_ratio/min": 0.0002447458100505173, + "sampling/sampling_logp_difference/max": 8.315290451049805, + "sampling/sampling_logp_difference/mean": 0.14952662587165833, + "step": 757 + }, + { + "clip_ratio/high_max": 1.0, + 
"clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.31545260548591614, + "epoch": 1.9947368421052631, + "grad_norm": 0.006828597281128168, + "learning_rate": 1e-06, + "loss": 0.0022, + "step": 758 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3086663782596588, + "epoch": 1.9973684210526317, + "grad_norm": 0.0067805699072778225, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 759 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.31685784459114075, + "epoch": 2.0, + "grad_norm": 0.007817032746970654, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 760 + }, + { + "epoch": 2.0, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.000244140625, + "eval_completions/max_length": 2330.09375, + "eval_completions/max_terminated_length": 1848.78125, + "eval_completions/mean_length": 383.0869140625, + "eval_completions/mean_terminated_length": 379.1825752258301, + "eval_completions/min_length": 154.75, + "eval_completions/min_terminated_length": 154.75, + "eval_entropy": 0.3063309593126178, + "eval_frac_reward_zero_std": 0.41015625, + "eval_loss": 0.0009358798852190375, + "eval_num_tokens": 188963829.0, + "eval_reward": 0.7193302186205983, + "eval_reward_std": 0.14116343623027205, + "eval_rewards/progression_diversity/mean": -0.0005755307929575793, + "eval_rewards/progression_diversity/std": 0.006511387479804398, + "eval_rewards/symbolic_reward_accuracy/mean": 0.751220703125, + "eval_rewards/symbolic_reward_accuracy/std": 
0.40731942653656006, + "eval_rewards/symbolic_reward_partial_score/mean": 0.8976237010210752, + "eval_rewards/symbolic_reward_partial_score/std": 0.18608224554918706, + "eval_rewards/tag_count_reward/mean": -0.0068359375, + "eval_rewards/tag_count_reward/std": 0.03354312968440354, + "eval_runtime": 289.381, + "eval_samples_per_second": 0.864, + "eval_sampling/importance_sampling_ratio/max": 2.0, + "eval_sampling/importance_sampling_ratio/mean": 1.0764338932931423, + "eval_sampling/importance_sampling_ratio/min": 0.0022319126777483106, + "eval_sampling/sampling_logp_difference/max": 14.824358269572258, + "eval_sampling/sampling_logp_difference/mean": 0.1495322003029287, + "eval_steps_per_second": 0.007, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 359.412109375, + "completions/mean_terminated_length": 359.412109375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.31213897466659546, + "epoch": 2.0026315789473683, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.009487095288932323, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 189540808.0, + "reward": 0.71630859375, + "reward_std": 0.050411537289619446, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.748046875, + "rewards/symbolic_reward_accuracy/std": 0.43455907702445984, + "rewards/symbolic_reward_partial_score/mean": 0.8916015625, + "rewards/symbolic_reward_partial_score/std": 0.19575290381908417, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0758132934570312, + 
"sampling/importance_sampling_ratio/min": 0.0004769986553583294, + "sampling/sampling_logp_difference/max": 7.64799690246582, + "sampling/sampling_logp_difference/mean": 0.14864085614681244, + "step": 761 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3116689920425415, + "epoch": 2.0052631578947366, + "grad_norm": 0.0009561034385114908, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 762 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.3024158477783203, + "epoch": 2.0078947368421054, + "grad_norm": 0.0017013449687510729, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 763 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.3098195642232895, + "epoch": 2.0105263157894737, + "grad_norm": 0.0036314663011580706, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.0, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 355.013671875, + "completions/mean_terminated_length": 355.013671875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.3051309287548065, + "epoch": 2.013157894736842, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.005320638883858919, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 190129391.0, + "reward": 0.657177746295929, + "reward_std": 0.08853315562009811, + "rewards/progression_diversity/mean": 0.0, + 
"rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.654296875, + "rewards/symbolic_reward_accuracy/std": 0.4760620892047882, + "rewards/symbolic_reward_partial_score/mean": 0.8819987177848816, + "rewards/symbolic_reward_partial_score/std": 0.18076345324516296, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.077763557434082, + "sampling/importance_sampling_ratio/min": 2.4497896447428502e-05, + "sampling/sampling_logp_difference/max": 10.616923332214355, + "sampling/sampling_logp_difference/mean": 0.15018996596336365, + "step": 765 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.31671372056007385, + "epoch": 2.0157894736842104, + "grad_norm": 0.00803243275731802, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 766 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3143162131309509, + "epoch": 2.018421052631579, + "grad_norm": 0.003088021883741021, + "learning_rate": 1e-06, + "loss": -0.0026, + "step": 767 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.3092113137245178, + "epoch": 2.0210526315789474, + "grad_norm": 0.0034267231822013855, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + 
"completions/mean_length": 353.95703125, + "completions/mean_terminated_length": 353.95703125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.3089481443166733, + "epoch": 2.0236842105263158, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.008214015513658524, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 190701913.0, + "reward": 0.6354489326477051, + "reward_std": 0.10014986246824265, + "rewards/progression_diversity/mean": -3.783680949709378e-05, + "rewards/progression_diversity/std": 0.0008561493013985455, + "rewards/symbolic_reward_accuracy/mean": 0.630859375, + "rewards/symbolic_reward_accuracy/std": 0.4830440282821655, + "rewards/symbolic_reward_partial_score/mean": 0.8564453125, + "rewards/symbolic_reward_partial_score/std": 0.20011062920093536, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0768274068832397, + "sampling/importance_sampling_ratio/min": 0.000324864435242489, + "sampling/sampling_logp_difference/max": 8.032102584838867, + "sampling/sampling_logp_difference/mean": 0.14762979745864868, + "step": 769 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.31103044748306274, + "epoch": 2.026315789473684, + "grad_norm": 0.005561685189604759, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 770 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.30797044932842255, + "epoch": 2.028947368421053, + "grad_norm": 0.008373531512916088, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 771 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + 
"clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3142692297697067, + "epoch": 2.031578947368421, + "grad_norm": 0.009180448018014431, + "learning_rate": 1e-06, + "loss": 0.0029, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 364.009765625, + "completions/mean_terminated_length": 364.009765625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.3065429776906967, + "epoch": 2.0342105263157895, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.006964081432670355, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 191298590.0, + "reward": 0.6746094226837158, + "reward_std": 0.09266329556703568, + "rewards/progression_diversity/mean": -1.0304419220119598e-06, + "rewards/progression_diversity/std": 2.331623727513943e-05, + "rewards/symbolic_reward_accuracy/mean": 0.681640625, + "rewards/symbolic_reward_accuracy/std": 0.46629536151885986, + "rewards/symbolic_reward_partial_score/mean": 0.8854166865348816, + "rewards/symbolic_reward_partial_score/std": 0.1944405734539032, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0783308744430542, + "sampling/importance_sampling_ratio/min": 0.0001869058469310403, + "sampling/sampling_logp_difference/max": 8.584905624389648, + "sampling/sampling_logp_difference/mean": 0.14911575615406036, + "step": 773 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.30941206216812134, + "epoch": 
2.036842105263158, + "grad_norm": 0.008871463127434254, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 774 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.3097645044326782, + "epoch": 2.039473684210526, + "grad_norm": 0.006418595090508461, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 775 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.31205175817012787, + "epoch": 2.042105263157895, + "grad_norm": 0.005364975426346064, + "learning_rate": 1e-06, + "loss": -0.0018, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 367.24609375, + "completions/mean_terminated_length": 367.24609375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.31110282242298126, + "epoch": 2.044736842105263, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.009222902357578278, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 191865724.0, + "reward": 0.6914538741111755, + "reward_std": 0.0986681878566742, + "rewards/progression_diversity/mean": -0.00012244281242601573, + "rewards/progression_diversity/std": 0.002770564751699567, + "rewards/symbolic_reward_accuracy/mean": 0.69921875, + "rewards/symbolic_reward_accuracy/std": 0.45904624462127686, + "rewards/symbolic_reward_partial_score/mean": 0.9064127206802368, + "rewards/symbolic_reward_partial_score/std": 0.16166870296001434, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.077000379562378, + "sampling/importance_sampling_ratio/min": 0.00012576297740451992, + "sampling/sampling_logp_difference/max": 8.981111526489258, + "sampling/sampling_logp_difference/mean": 0.14813096821308136, + "step": 777 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.3072364181280136, + "epoch": 2.0473684210526315, + "grad_norm": 0.004725575912743807, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 778 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3081643432378769, + "epoch": 2.05, + "grad_norm": 0.005970868747681379, + "learning_rate": 1e-06, + "loss": 0.002, + "step": 779 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3126187026500702, + "epoch": 2.0526315789473686, + "grad_norm": 0.0036161758471280336, + "learning_rate": 1e-06, + "loss": -0.0015, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 366.17578125, + "completions/mean_terminated_length": 366.17578125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.31414252519607544, + "epoch": 2.055263157894737, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.007239960134029388, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 192478710.0, + 
"reward": 0.7059570550918579, + "reward_std": 0.1260911226272583, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.71875, + "rewards/symbolic_reward_accuracy/std": 0.45004892349243164, + "rewards/symbolic_reward_partial_score/mean": 0.9163411259651184, + "rewards/symbolic_reward_partial_score/std": 0.14764182269573212, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0782256126403809, + "sampling/importance_sampling_ratio/min": 0.0007079911883920431, + "sampling/sampling_logp_difference/max": 7.253078937530518, + "sampling/sampling_logp_difference/mean": 0.14842364192008972, + "step": 781 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.3148372173309326, + "epoch": 2.057894736842105, + "grad_norm": 0.005920483730733395, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 782 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3127191811800003, + "epoch": 2.0605263157894735, + "grad_norm": 0.009441790170967579, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 783 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.313884437084198, + "epoch": 2.0631578947368423, + "grad_norm": 0.004328244365751743, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 395.302734375, + "completions/mean_terminated_length": 364.0137023925781, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.30200617015361786, + "epoch": 2.0657894736842106, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.008038647472858429, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 193108273.0, + "reward": 0.7076036334037781, + "reward_std": 0.08745211362838745, + "rewards/progression_diversity/mean": -0.0013568074209615588, + "rewards/progression_diversity/std": 0.030187834054231644, + "rewards/symbolic_reward_accuracy/mean": 0.734375, + "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, + "rewards/symbolic_reward_partial_score/mean": 0.890625, + "rewards/symbolic_reward_partial_score/std": 0.1928967982530594, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0760550498962402, + "sampling/importance_sampling_ratio/min": 0.0006800789851695299, + "sampling/sampling_logp_difference/max": 7.293301582336426, + "sampling/sampling_logp_difference/mean": 0.14417186379432678, + "step": 785 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.30903005599975586, + "epoch": 2.068421052631579, + "grad_norm": 0.00587112782523036, + "learning_rate": 1e-06, + "loss": 0.0031, + "step": 786 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3140885829925537, + "epoch": 2.0710526315789473, + "grad_norm": 0.005677395034581423, + 
"learning_rate": 1e-06, + "loss": -0.0009, + "step": 787 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.31708140671253204, + "epoch": 2.0736842105263156, + "grad_norm": 0.0033428401220589876, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 360.97265625, + "completions/mean_terminated_length": 360.97265625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.31590431928634644, + "epoch": 2.0763157894736843, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.007495602127164602, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 193679875.0, + "reward": 0.6812500357627869, + "reward_std": 0.1275385469198227, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.693359375, + "rewards/symbolic_reward_accuracy/std": 0.4615498185157776, + "rewards/symbolic_reward_partial_score/mean": 0.8841146230697632, + "rewards/symbolic_reward_partial_score/std": 0.1895521730184555, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0787707567214966, + "sampling/importance_sampling_ratio/min": 1.7228163642357686e-06, + "sampling/sampling_logp_difference/max": 13.271550178527832, + "sampling/sampling_logp_difference/mean": 0.14790306985378265, + "step": 789 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.125, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3198508620262146, + "epoch": 2.0789473684210527, + "grad_norm": 0.007427311968058348, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 790 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.3098384737968445, + "epoch": 2.081578947368421, + "grad_norm": 0.004864763468503952, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 791 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.30552271008491516, + "epoch": 2.0842105263157893, + "grad_norm": 0.004870227538049221, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.0, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 361.841796875, + "completions/mean_terminated_length": 361.841796875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.31432493031024933, + "epoch": 2.086842105263158, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.008887571282684803, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 194278290.0, + "reward": 0.6641601324081421, + "reward_std": 0.06735213100910187, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.669921875, + "rewards/symbolic_reward_accuracy/std": 0.47070086002349854, + "rewards/symbolic_reward_partial_score/mean": 0.8740234375, + "rewards/symbolic_reward_partial_score/std": 0.20001934468746185, + 
"rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0777101516723633, + "sampling/importance_sampling_ratio/min": 0.006325147580355406, + "sampling/sampling_logp_difference/max": 5.0632219314575195, + "sampling/sampling_logp_difference/mean": 0.14742319285869598, + "step": 793 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.30889126658439636, + "epoch": 2.0894736842105264, + "grad_norm": 0.0032396079041063786, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 794 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3082197904586792, + "epoch": 2.0921052631578947, + "grad_norm": 0.0014065575087442994, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 795 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.3115806132555008, + "epoch": 2.094736842105263, + "grad_norm": 0.007706951815634966, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 359.45703125, + "completions/mean_terminated_length": 359.45703125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.3137265592813492, + "epoch": 2.0973684210526318, + "frac_reward_zero_std": 0.59375, + "grad_norm": 
0.004259579814970493, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 194847612.0, + "reward": 0.7145506143569946, + "reward_std": 0.08891631662845612, + "rewards/progression_diversity/mean": -1.562127363285981e-05, + "rewards/progression_diversity/std": 0.00035346910590305924, + "rewards/symbolic_reward_accuracy/mean": 0.734375, + "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, + "rewards/symbolic_reward_partial_score/mean": 0.9130859375, + "rewards/symbolic_reward_partial_score/std": 0.1638316661119461, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0777618885040283, + "sampling/importance_sampling_ratio/min": 0.000564085494261235, + "sampling/sampling_logp_difference/max": 7.480304718017578, + "sampling/sampling_logp_difference/mean": 0.1483234167098999, + "step": 797 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.3083493560552597, + "epoch": 2.1, + "grad_norm": 0.00659141456708312, + "learning_rate": 1e-06, + "loss": -0.0018, + "step": 798 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.314989909529686, + "epoch": 2.1026315789473684, + "grad_norm": 0.0037467950023710728, + "learning_rate": 1e-06, + "loss": 0.0022, + "step": 799 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3185144364833832, + "epoch": 2.1052631578947367, + "grad_norm": 0.005139374174177647, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 
0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 365.982421875, + "completions/mean_terminated_length": 365.982421875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.31436602771282196, + "epoch": 2.1078947368421055, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.008306819945573807, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 195432787.0, + "reward": 0.7186523675918579, + "reward_std": 0.08997929096221924, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.740234375, + "rewards/symbolic_reward_accuracy/std": 0.4389347732067108, + "rewards/symbolic_reward_partial_score/mean": 0.9150390625, + "rewards/symbolic_reward_partial_score/std": 0.1594066470861435, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0780589580535889, + "sampling/importance_sampling_ratio/min": 0.0019471251871436834, + "sampling/sampling_logp_difference/max": 6.241401195526123, + "sampling/sampling_logp_difference/mean": 0.15034829080104828, + "step": 801 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.3124569356441498, + "epoch": 2.110526315789474, + "grad_norm": 0.007252343464642763, + "learning_rate": 1e-06, + "loss": 0.0016, + "step": 802 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.31769220530986786, + "epoch": 2.113157894736842, + "grad_norm": 
0.002544126473367214, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 803 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.3188740909099579, + "epoch": 2.1157894736842104, + "grad_norm": 0.0020408392883837223, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 355.298828125, + "completions/mean_terminated_length": 355.298828125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.31769895553588867, + "epoch": 2.1184210526315788, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.0065668681636452675, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 196035532.0, + "reward": 0.694287121295929, + "reward_std": 0.09877711534500122, + "rewards/progression_diversity/mean": -1.5697775097578415e-06, + "rewards/progression_diversity/std": 3.552000998752192e-05, + "rewards/symbolic_reward_accuracy/mean": 0.70703125, + "rewards/symbolic_reward_accuracy/std": 0.455569326877594, + "rewards/symbolic_reward_partial_score/mean": 0.9002279043197632, + "rewards/symbolic_reward_partial_score/std": 0.16993996500968933, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0784680843353271, + "sampling/importance_sampling_ratio/min": 1.892145405690826e-06, + "sampling/sampling_logp_difference/max": 13.177799224853516, + "sampling/sampling_logp_difference/mean": 0.15134167671203613, + "step": 805 + }, + { + "clip_ratio/high_max": 1.0, + 
"clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.31717509031295776, + "epoch": 2.1210526315789475, + "grad_norm": 0.008248434402048588, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 806 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.3211695998907089, + "epoch": 2.123684210526316, + "grad_norm": 0.003757715690881014, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 807 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.32018236815929413, + "epoch": 2.126315789473684, + "grad_norm": 0.0037706305738538504, + "learning_rate": 1e-06, + "loss": -0.0013, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 356.197265625, + "completions/mean_terminated_length": 356.197265625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.31526923179626465, + "epoch": 2.1289473684210525, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.007752216421067715, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 196643409.0, + "reward": 0.6648437976837158, + "reward_std": 0.09060361981391907, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.66796875, + "rewards/symbolic_reward_accuracy/std": 0.47140273451805115, + "rewards/symbolic_reward_partial_score/mean": 0.8802083134651184, + 
"rewards/symbolic_reward_partial_score/std": 0.20505410432815552, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.078608512878418, + "sampling/importance_sampling_ratio/min": 0.0015027325134724379, + "sampling/sampling_logp_difference/max": 6.500470161437988, + "sampling/sampling_logp_difference/mean": 0.15123139321804047, + "step": 809 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.3221448212862015, + "epoch": 2.1315789473684212, + "grad_norm": 0.006944730877876282, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 810 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3182772845029831, + "epoch": 2.1342105263157896, + "grad_norm": 0.009241136722266674, + "learning_rate": 1e-06, + "loss": 0.002, + "step": 811 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3173188716173172, + "epoch": 2.136842105263158, + "grad_norm": 0.005788304843008518, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.0, + "completions/max_terminated_length": 693.0, + "completions/mean_length": 353.5078125, + "completions/mean_terminated_length": 353.5078125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.3186788260936737, + "epoch": 2.139473684210526, + 
"frac_reward_zero_std": 0.625, + "grad_norm": 0.005239107180386782, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 197251413.0, + "reward": 0.6026840209960938, + "reward_std": 0.0501655638217926, + "rewards/progression_diversity/mean": -0.00015046147746033967, + "rewards/progression_diversity/std": 0.002405304927378893, + "rewards/symbolic_reward_accuracy/mean": 0.578125, + "rewards/symbolic_reward_accuracy/std": 0.49434176087379456, + "rewards/symbolic_reward_partial_score/mean": 0.8527017831802368, + "rewards/symbolic_reward_partial_score/std": 0.20470060408115387, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0792396068572998, + "sampling/importance_sampling_ratio/min": 1.5721030877102748e-06, + "sampling/sampling_logp_difference/max": 13.363096237182617, + "sampling/sampling_logp_difference/mean": 0.1521165668964386, + "step": 813 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.3178333342075348, + "epoch": 2.1421052631578945, + "grad_norm": 0.01202054787427187, + "learning_rate": 1e-06, + "loss": 0.0033, + "step": 814 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3216595947742462, + "epoch": 2.1447368421052633, + "grad_norm": 0.003936625551432371, + "learning_rate": 1e-06, + "loss": -0.0019, + "step": 815 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.32149362564086914, + "epoch": 2.1473684210526316, + "grad_norm": 0.002031634794548154, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 816 + 
}, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 351.484375, + "completions/mean_terminated_length": 351.484375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.31737402081489563, + "epoch": 2.15, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.004952269606292248, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 197850701.0, + "reward": 0.6548340320587158, + "reward_std": 0.07701730728149414, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.6484375, + "rewards/symbolic_reward_accuracy/std": 0.4779251217842102, + "rewards/symbolic_reward_partial_score/mean": 0.8859049081802368, + "rewards/symbolic_reward_partial_score/std": 0.17860201001167297, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0793912410736084, + "sampling/importance_sampling_ratio/min": 0.0023158956319093704, + "sampling/sampling_logp_difference/max": 6.067958831787109, + "sampling/sampling_logp_difference/mean": 0.15115895867347717, + "step": 817 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.31825047731399536, + "epoch": 2.1526315789473682, + "grad_norm": 0.007298370823264122, + "learning_rate": 1e-06, + "loss": 0.0026, + "step": 818 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 
0.3138178437948227, + "epoch": 2.155263157894737, + "grad_norm": 0.002253433922305703, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 819 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.32211922109127045, + "epoch": 2.1578947368421053, + "grad_norm": 0.010363497771322727, + "learning_rate": 1e-06, + "loss": -0.0023, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 383.619140625, + "completions/mean_terminated_length": 352.3072509765625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.3100568354129791, + "epoch": 2.1605263157894736, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.006256972439587116, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 198457098.0, + "reward": 0.6989597082138062, + "reward_std": 0.09930028021335602, + "rewards/progression_diversity/mean": -0.0014896361390128732, + "rewards/progression_diversity/std": 0.0337008498609066, + "rewards/symbolic_reward_accuracy/mean": 0.7109375, + "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, + "rewards/symbolic_reward_partial_score/mean": 0.9080404043197632, + "rewards/symbolic_reward_partial_score/std": 0.16075260937213898, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0769511461257935, + "sampling/importance_sampling_ratio/min": 0.00047645941958762705, + "sampling/sampling_logp_difference/max": 7.649127960205078, + "sampling/sampling_logp_difference/mean": 0.14793072640895844, + 
"step": 821 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.32065755128860474, + "epoch": 2.163157894736842, + "grad_norm": 0.005824042018502951, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 822 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3172464370727539, + "epoch": 2.1657894736842107, + "grad_norm": 0.00658250181004405, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 823 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.31654980778694153, + "epoch": 2.168421052631579, + "grad_norm": 0.008792352862656116, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 363.064453125, + "completions/mean_terminated_length": 363.064453125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.3157438635826111, + "epoch": 2.1710526315789473, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.008407440036535263, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 199052811.0, + "reward": 0.6742187738418579, + "reward_std": 0.13188248872756958, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.681640625, + "rewards/symbolic_reward_accuracy/std": 0.46629536151885986, + 
"rewards/symbolic_reward_partial_score/mean": 0.8841145634651184, + "rewards/symbolic_reward_partial_score/std": 0.19204509258270264, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0799140930175781, + "sampling/importance_sampling_ratio/min": 8.337887993548065e-05, + "sampling/sampling_logp_difference/max": 9.392115592956543, + "sampling/sampling_logp_difference/mean": 0.15036407113075256, + "step": 825 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.31925569474697113, + "epoch": 2.1736842105263157, + "grad_norm": 0.009222878143191338, + "learning_rate": 1e-06, + "loss": 0.0019, + "step": 826 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.31960971653461456, + "epoch": 2.1763157894736844, + "grad_norm": 0.00924774631857872, + "learning_rate": 1e-06, + "loss": 0.0019, + "step": 827 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.3173024207353592, + "epoch": 2.1789473684210527, + "grad_norm": 0.005093837156891823, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 358.44921875, + "completions/mean_terminated_length": 358.44921875, + "completions/min_length": 150.0, + 
"completions/min_terminated_length": 150.0, + "entropy": 0.30984506011009216, + "epoch": 2.181578947368421, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.005993766710162163, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 199616785.0, + "reward": 0.6996093988418579, + "reward_std": 0.09051661938428879, + "rewards/progression_diversity/mean": -9.057112038135529e-07, + "rewards/progression_diversity/std": 2.0493906049523503e-05, + "rewards/symbolic_reward_accuracy/mean": 0.716796875, + "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, + "rewards/symbolic_reward_partial_score/mean": 0.8984375, + "rewards/symbolic_reward_partial_score/std": 0.17569713294506073, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0780447721481323, + "sampling/importance_sampling_ratio/min": 1.4085967450228054e-06, + "sampling/sampling_logp_difference/max": 13.472916603088379, + "sampling/sampling_logp_difference/mean": 0.15053117275238037, + "step": 829 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.3208106756210327, + "epoch": 2.1842105263157894, + "grad_norm": 0.004268263466656208, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 830 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3216129243373871, + "epoch": 2.1868421052631577, + "grad_norm": 0.005367336794734001, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 831 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3205704092979431, + "epoch": 
2.1894736842105265, + "grad_norm": 0.006977500859647989, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 358.544921875, + "completions/mean_terminated_length": 358.544921875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.3062279671430588, + "epoch": 2.192105263157895, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.00700410595163703, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 200194984.0, + "reward": 0.71435546875, + "reward_std": 0.13384464383125305, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.74609375, + "rewards/symbolic_reward_accuracy/std": 0.43567025661468506, + "rewards/symbolic_reward_partial_score/mean": 0.8889974355697632, + "rewards/symbolic_reward_partial_score/std": 0.21046864986419678, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0783202648162842, + "sampling/importance_sampling_ratio/min": 6.207470869412646e-05, + "sampling/sampling_logp_difference/max": 9.687171936035156, + "sampling/sampling_logp_difference/mean": 0.14857394993305206, + "step": 833 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.31446513533592224, + "epoch": 2.194736842105263, + "grad_norm": 0.0064468346536159515, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 834 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 
0.21875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3156389594078064, + "epoch": 2.1973684210526314, + "grad_norm": 0.0036858366802334785, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 835 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3129158616065979, + "epoch": 2.2, + "grad_norm": 0.008718104101717472, + "learning_rate": 1e-06, + "loss": 0.0028, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.0, + "completions/max_terminated_length": 669.0, + "completions/mean_length": 348.08203125, + "completions/mean_terminated_length": 348.08203125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.31272754073143005, + "epoch": 2.2026315789473685, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.003958904184401035, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 200761490.0, + "reward": 0.73486328125, + "reward_std": 0.0685553327202797, + "rewards/progression_diversity/mean": -4.245833224558737e-06, + "rewards/progression_diversity/std": 9.607223910279572e-05, + "rewards/symbolic_reward_accuracy/mean": 0.76953125, + "rewards/symbolic_reward_accuracy/std": 0.42154473066329956, + "rewards/symbolic_reward_partial_score/mean": 0.9104818105697632, + "rewards/symbolic_reward_partial_score/std": 0.17350530624389648, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0795412063598633, + "sampling/importance_sampling_ratio/min": 0.0001628218888072297, + 
"sampling/sampling_logp_difference/max": 8.722853660583496, + "sampling/sampling_logp_difference/mean": 0.1502777636051178, + "step": 837 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3207908719778061, + "epoch": 2.205263157894737, + "grad_norm": 0.0027855695225298405, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 838 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.32380297780036926, + "epoch": 2.207894736842105, + "grad_norm": 0.001798739074729383, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 839 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.3216434121131897, + "epoch": 2.2105263157894735, + "grad_norm": 0.0038593984209001064, + "learning_rate": 1e-06, + "loss": 0.0015, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 352.498046875, + "completions/mean_terminated_length": 352.498046875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.321834996342659, + "epoch": 2.213157894736842, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.011081775650382042, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 201336753.0, + "reward": 0.712841808795929, + "reward_std": 0.07380041480064392, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + 
"rewards/symbolic_reward_accuracy/mean": 0.736328125, + "rewards/symbolic_reward_accuracy/std": 0.4410543739795685, + "rewards/symbolic_reward_partial_score/mean": 0.9034830331802368, + "rewards/symbolic_reward_partial_score/std": 0.194377139210701, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0791889429092407, + "sampling/importance_sampling_ratio/min": 0.00019401576719246805, + "sampling/sampling_logp_difference/max": 8.547571182250977, + "sampling/sampling_logp_difference/mean": 0.15164169669151306, + "step": 841 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.3151762932538986, + "epoch": 2.2157894736842105, + "grad_norm": 0.0049751391634345055, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 842 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.32177719473838806, + "epoch": 2.218421052631579, + "grad_norm": 0.0013347979402169585, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 843 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3246469497680664, + "epoch": 2.221052631578947, + "grad_norm": 0.0021147457882761955, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 353.75, + 
"completions/mean_terminated_length": 353.75, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.3158084601163864, + "epoch": 2.223684210526316, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.00637836754322052, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 201900305.0, + "reward": 0.7346680164337158, + "reward_std": 0.060360029339790344, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.765625, + "rewards/symbolic_reward_accuracy/std": 0.42402184009552, + "rewards/symbolic_reward_partial_score/mean": 0.9176432490348816, + "rewards/symbolic_reward_partial_score/std": 0.16336961090564728, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.078678011894226, + "sampling/importance_sampling_ratio/min": 2.9706257009820547e-06, + "sampling/sampling_logp_difference/max": 12.726737976074219, + "sampling/sampling_logp_difference/mean": 0.1496877670288086, + "step": 845 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.3176430016756058, + "epoch": 2.2263157894736842, + "grad_norm": 0.004808385390788317, + "learning_rate": 1e-06, + "loss": 0.0016, + "step": 846 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.31751298904418945, + "epoch": 2.2289473684210526, + "grad_norm": 0.0020348995458334684, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 847 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + 
"entropy": 0.31586088240146637, + "epoch": 2.231578947368421, + "grad_norm": 0.004024218767881393, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 352.365234375, + "completions/mean_terminated_length": 352.365234375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.3219304084777832, + "epoch": 2.2342105263157896, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00704569835215807, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 202484204.0, + "reward": 0.6749019026756287, + "reward_std": 0.10070345550775528, + "rewards/progression_diversity/mean": -4.508948768489063e-05, + "rewards/progression_diversity/std": 0.0006228564889170229, + "rewards/symbolic_reward_accuracy/mean": 0.685546875, + "rewards/symbolic_reward_accuracy/std": 0.4647517800331116, + "rewards/symbolic_reward_partial_score/mean": 0.8785807490348816, + "rewards/symbolic_reward_partial_score/std": 0.20275656878948212, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0787829160690308, + "sampling/importance_sampling_ratio/min": 1.0576643944659736e-05, + "sampling/sampling_logp_difference/max": 11.456862449645996, + "sampling/sampling_logp_difference/mean": 0.15037262439727783, + "step": 849 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.31789323687553406, + "epoch": 2.236842105263158, + "grad_norm": 0.008588818833231926, + "learning_rate": 1e-06, + "loss": -0.0007, 
+ "step": 850 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.31510041654109955, + "epoch": 2.2394736842105263, + "grad_norm": 0.00736332219094038, + "learning_rate": 1e-06, + "loss": -0.002, + "step": 851 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3124866932630539, + "epoch": 2.2421052631578946, + "grad_norm": 0.005612007807940245, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.0, + "completions/max_terminated_length": 734.0, + "completions/mean_length": 361.51953125, + "completions/mean_terminated_length": 361.51953125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.32218506932258606, + "epoch": 2.2447368421052634, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.013904748484492302, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 203052758.0, + "reward": 0.6032225489616394, + "reward_std": 0.08952237665653229, + "rewards/progression_diversity/mean": -1.4995790479588322e-05, + "rewards/progression_diversity/std": 0.000339316000463441, + "rewards/symbolic_reward_accuracy/mean": 0.587890625, + "rewards/symbolic_reward_accuracy/std": 0.49269601702690125, + "rewards/symbolic_reward_partial_score/mean": 0.8349609375, + "rewards/symbolic_reward_partial_score/std": 0.21583224833011627, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0778234004974365, + 
"sampling/importance_sampling_ratio/min": 5.763072863373964e-07, + "sampling/sampling_logp_difference/max": 14.36662483215332, + "sampling/sampling_logp_difference/mean": 0.15012940764427185, + "step": 853 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.3153466731309891, + "epoch": 2.2473684210526317, + "grad_norm": 0.0023524423595517874, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 854 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.31378166377544403, + "epoch": 2.25, + "grad_norm": 0.0045723277144134045, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 855 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.31204673647880554, + "epoch": 2.2526315789473683, + "grad_norm": 0.006419648882001638, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 364.69921875, + "completions/mean_terminated_length": 364.69921875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.3136000484228134, + "epoch": 2.2552631578947366, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.00942289736121893, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 203652796.0, + "reward": 0.6517090201377869, + "reward_std": 0.07550329715013504, + "rewards/progression_diversity/mean": 0.0, + 
"rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.64453125, + "rewards/symbolic_reward_accuracy/std": 0.47912323474884033, + "rewards/symbolic_reward_partial_score/mean": 0.88330078125, + "rewards/symbolic_reward_partial_score/std": 0.18055777251720428, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0787506103515625, + "sampling/importance_sampling_ratio/min": 5.0096670747734606e-05, + "sampling/sampling_logp_difference/max": 9.901556015014648, + "sampling/sampling_logp_difference/mean": 0.1502964198589325, + "step": 857 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.3159327059984207, + "epoch": 2.2578947368421054, + "grad_norm": 0.0035984772257506847, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 858 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3201962113380432, + "epoch": 2.2605263157894737, + "grad_norm": 0.002598309423774481, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 859 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3207338899374008, + "epoch": 2.263157894736842, + "grad_norm": 0.0067030079662799835, + "learning_rate": 1e-06, + "loss": 0.0022, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + 
"completions/mean_length": 362.994140625, + "completions/mean_terminated_length": 362.994140625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.31222720444202423, + "epoch": 2.2657894736842104, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.007842420600354671, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 204248633.0, + "reward": 0.675830066204071, + "reward_std": 0.05877920240163803, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.6796875, + "rewards/symbolic_reward_accuracy/std": 0.4670529365539551, + "rewards/symbolic_reward_partial_score/mean": 0.8933919668197632, + "rewards/symbolic_reward_partial_score/std": 0.17396998405456543, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0796515941619873, + "sampling/importance_sampling_ratio/min": 0.00020882970420643687, + "sampling/sampling_logp_difference/max": 8.473991394042969, + "sampling/sampling_logp_difference/mean": 0.14983773231506348, + "step": 861 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.31324464082717896, + "epoch": 2.268421052631579, + "grad_norm": 0.008384945802390575, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 862 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.31786786019802094, + "epoch": 2.2710526315789474, + "grad_norm": 0.002380169229581952, + "learning_rate": 1e-06, + "loss": 0.0025, + "step": 863 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0703125, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.31832440197467804, + "epoch": 2.2736842105263158, + "grad_norm": 0.004632322117686272, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 369.892578125, + "completions/mean_terminated_length": 369.892578125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.3140920400619507, + "epoch": 2.276315789473684, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005593942478299141, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 204799298.0, + "reward": 0.6139160394668579, + "reward_std": 0.10376207530498505, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.58984375, + "rewards/symbolic_reward_accuracy/std": 0.49234291911125183, + "rewards/symbolic_reward_partial_score/mean": 0.86669921875, + "rewards/symbolic_reward_partial_score/std": 0.19208206236362457, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0795059204101562, + "sampling/importance_sampling_ratio/min": 0.0054131243377923965, + "sampling/sampling_logp_difference/max": 5.218928813934326, + "sampling/sampling_logp_difference/mean": 0.14964452385902405, + "step": 865 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.31835804879665375, + "epoch": 2.2789473684210524, + "grad_norm": 0.0054872226901352406, + "learning_rate": 
1e-06, + "loss": 0.0006, + "step": 866 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.31307150423526764, + "epoch": 2.281578947368421, + "grad_norm": 0.008840472437441349, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 867 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.31860506534576416, + "epoch": 2.2842105263157895, + "grad_norm": 0.003689047647640109, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 400.73046875, + "completions/mean_terminated_length": 369.4520568847656, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.3164720982313156, + "epoch": 2.286842105263158, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.00662199268117547, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 205400760.0, + "reward": 0.7893416881561279, + "reward_std": 0.0898476168513298, + "rewards/progression_diversity/mean": -0.0013872169656679034, + "rewards/progression_diversity/std": 0.031389135867357254, + "rewards/symbolic_reward_accuracy/mean": 0.84375, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.9436848759651184, + "rewards/symbolic_reward_partial_score/std": 0.14467516541481018, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0766570568084717, + "sampling/importance_sampling_ratio/min": 0.00467439740896225, + "sampling/sampling_logp_difference/max": 5.365654945373535, + "sampling/sampling_logp_difference/mean": 0.14632970094680786, + "step": 869 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.3118371516466141, + "epoch": 2.2894736842105265, + "grad_norm": 0.003031267784535885, + "learning_rate": 1e-06, + "loss": -0.0035, + "step": 870 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.30880677700042725, + "epoch": 2.292105263157895, + "grad_norm": 0.005558013450354338, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 871 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.3143739849328995, + "epoch": 2.294736842105263, + "grad_norm": 0.008641637861728668, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 399.84765625, + "completions/mean_terminated_length": 368.5675048828125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.3094482719898224, + "epoch": 2.2973684210526315, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.008527515456080437, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 206010794.0, + "reward": 0.6527195572853088, + 
"reward_std": 0.08901824057102203, + "rewards/progression_diversity/mean": -0.001486002467572689, + "rewards/progression_diversity/std": 0.03277655318379402, + "rewards/symbolic_reward_accuracy/mean": 0.64453125, + "rewards/symbolic_reward_accuracy/std": 0.47912323474884033, + "rewards/symbolic_reward_partial_score/mean": 0.88671875, + "rewards/symbolic_reward_partial_score/std": 0.17725172638893127, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.076696753501892, + "sampling/importance_sampling_ratio/min": 3.0586368211515946e-06, + "sampling/sampling_logp_difference/max": 12.697541236877441, + "sampling/sampling_logp_difference/mean": 0.1453593373298645, + "step": 873 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.311574786901474, + "epoch": 2.3, + "grad_norm": 0.004662012215703726, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 874 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3119223415851593, + "epoch": 2.3026315789473686, + "grad_norm": 0.005506505724042654, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 875 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.308918759226799, + "epoch": 2.305263157894737, + "grad_norm": 0.007730038370937109, + "learning_rate": 1e-06, + "loss": 0.0186, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 769.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 371.2109375, + "completions/mean_terminated_length": 371.2109375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.3163398802280426, + "epoch": 2.307894736842105, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0073974900878965855, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 206607350.0, + "reward": 0.6987792253494263, + "reward_std": 0.1293921172618866, + "rewards/progression_diversity/mean": -1.3751643564319238e-05, + "rewards/progression_diversity/std": 0.0003111641854047775, + "rewards/symbolic_reward_accuracy/mean": 0.71875, + "rewards/symbolic_reward_accuracy/std": 0.45004892349243164, + "rewards/symbolic_reward_partial_score/mean": 0.8917642831802368, + "rewards/symbolic_reward_partial_score/std": 0.1961551159620285, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0803439617156982, + "sampling/importance_sampling_ratio/min": 0.004385761916637421, + "sampling/sampling_logp_difference/max": 5.429391860961914, + "sampling/sampling_logp_difference/mean": 0.1513446569442749, + "step": 877 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3198176920413971, + "epoch": 2.3105263157894735, + "grad_norm": 0.004171326756477356, + "learning_rate": 1e-06, + "loss": -0.0021, + "step": 878 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.31697770953178406, + "epoch": 2.3131578947368423, + "grad_norm": 0.01108588743954897, + "learning_rate": 1e-06, + "loss": 0.0061, + "step": 879 + }, + { + 
"clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3188299834728241, + "epoch": 2.3157894736842106, + "grad_norm": 0.007227355148643255, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 361.14453125, + "completions/mean_terminated_length": 361.14453125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.3121223598718643, + "epoch": 2.318421052631579, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.011712749488651752, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 207172704.0, + "reward": 0.6546875238418579, + "reward_std": 0.07038979977369308, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.6484375, + "rewards/symbolic_reward_accuracy/std": 0.4779251217842102, + "rewards/symbolic_reward_partial_score/mean": 0.8854166269302368, + "rewards/symbolic_reward_partial_score/std": 0.17847900092601776, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0788229703903198, + "sampling/importance_sampling_ratio/min": 6.657434278167784e-05, + "sampling/sampling_logp_difference/max": 9.617191314697266, + "sampling/sampling_logp_difference/mean": 0.15001147985458374, + "step": 881 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 
0.31704777479171753, + "epoch": 2.3210526315789473, + "grad_norm": 0.0035065063275396824, + "learning_rate": 1e-06, + "loss": -0.0018, + "step": 882 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.3097846359014511, + "epoch": 2.3236842105263156, + "grad_norm": 0.005223968997597694, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 883 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3145690858364105, + "epoch": 2.3263157894736843, + "grad_norm": 0.007196424994617701, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 366.859375, + "completions/mean_terminated_length": 366.859375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.31969161331653595, + "epoch": 2.3289473684210527, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.005184574518352747, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 207759832.0, + "reward": 0.7319824695587158, + "reward_std": 0.10044944286346436, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.755859375, + "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, + "rewards/symbolic_reward_partial_score/mean": 0.92822265625, + "rewards/symbolic_reward_partial_score/std": 0.14612270891666412, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0796695947647095, + "sampling/importance_sampling_ratio/min": 2.608057229736005e-06, + "sampling/sampling_logp_difference/max": 12.856904983520508, + "sampling/sampling_logp_difference/mean": 0.15157514810562134, + "step": 885 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.31721703708171844, + "epoch": 2.331578947368421, + "grad_norm": 0.005828100256621838, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 886 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.31899966299533844, + "epoch": 2.3342105263157893, + "grad_norm": 0.0027556861750781536, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 887 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.31219226121902466, + "epoch": 2.336842105263158, + "grad_norm": 0.004852119833230972, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 368.1484375, + "completions/mean_terminated_length": 368.1484375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.31981049478054047, + "epoch": 2.3394736842105264, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.007404921110719442, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 208337956.0, + 
"reward": 0.6898437738418579, + "reward_std": 0.07387437671422958, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.703125, + "rewards/symbolic_reward_accuracy/std": 0.45732781291007996, + "rewards/symbolic_reward_partial_score/mean": 0.8932291269302368, + "rewards/symbolic_reward_partial_score/std": 0.190824955701828, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0800843238830566, + "sampling/importance_sampling_ratio/min": 5.298529504216276e-05, + "sampling/sampling_logp_difference/max": 9.84549617767334, + "sampling/sampling_logp_difference/mean": 0.1499912440776825, + "step": 889 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.3165969252586365, + "epoch": 2.3421052631578947, + "grad_norm": 0.005117423832416534, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 890 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.31549713015556335, + "epoch": 2.344736842105263, + "grad_norm": 0.004304162692278624, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 891 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.31113120913505554, + "epoch": 2.3473684210526318, + "grad_norm": 0.009535240940749645, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 364.076171875, + "completions/mean_terminated_length": 364.076171875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.31199419498443604, + "epoch": 2.35, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.006672441493719816, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 208898955.0, + "reward": 0.7187011241912842, + "reward_std": 0.07201861590147018, + "rewards/progression_diversity/mean": -5.167676135897636e-06, + "rewards/progression_diversity/std": 0.00011693117266986519, + "rewards/symbolic_reward_accuracy/mean": 0.744140625, + "rewards/symbolic_reward_accuracy/std": 0.43676990270614624, + "rewards/symbolic_reward_partial_score/mean": 0.9073893427848816, + "rewards/symbolic_reward_partial_score/std": 0.16758772730827332, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0769355297088623, + "sampling/importance_sampling_ratio/min": 8.111090892271022e-07, + "sampling/sampling_logp_difference/max": 14.024863243103027, + "sampling/sampling_logp_difference/mean": 0.14778931438922882, + "step": 893 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.3058849573135376, + "epoch": 2.3526315789473684, + "grad_norm": 0.004412407986819744, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 894 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.30890876054763794, + "epoch": 2.3552631578947367, + "grad_norm": 0.0028338998090475798, + "learning_rate": 1e-06, + "loss": 
-0.0014, + "step": 895 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.3096920996904373, + "epoch": 2.3578947368421055, + "grad_norm": 0.008265499956905842, + "learning_rate": 1e-06, + "loss": 0.0015, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 364.822265625, + "completions/mean_terminated_length": 364.822265625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.3139221966266632, + "epoch": 2.360526315789474, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.005665302742272615, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 209503248.0, + "reward": 0.5989741683006287, + "reward_std": 0.10015460103750229, + "rewards/progression_diversity/mean": -4.6549830585718155e-05, + "rewards/progression_diversity/std": 0.001053302432410419, + "rewards/symbolic_reward_accuracy/mean": 0.572265625, + "rewards/symbolic_reward_accuracy/std": 0.4952339828014374, + "rewards/symbolic_reward_partial_score/mean": 0.85205078125, + "rewards/symbolic_reward_partial_score/std": 0.20019729435443878, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.077416181564331, + "sampling/importance_sampling_ratio/min": 0.00034949558903463185, + "sampling/sampling_logp_difference/max": 7.959019660949707, + "sampling/sampling_logp_difference/mean": 0.1495225429534912, + "step": 897 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0625, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.31885823607444763, + "epoch": 2.363157894736842, + "grad_norm": 0.006921331398189068, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 898 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.30479341745376587, + "epoch": 2.3657894736842104, + "grad_norm": 0.007010175846517086, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 899 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3115847110748291, + "epoch": 2.3684210526315788, + "grad_norm": 0.004461990669369698, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 358.376953125, + "completions/mean_terminated_length": 358.376953125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.3171516954898834, + "epoch": 2.3710526315789475, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.007584826089441776, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 210096721.0, + "reward": 0.6747558116912842, + "reward_std": 0.06001054495573044, + "rewards/progression_diversity/mean": -1.2740825695800595e-05, + "rewards/progression_diversity/std": 0.0002882919798139483, + "rewards/symbolic_reward_accuracy/mean": 0.685546875, + "rewards/symbolic_reward_accuracy/std": 0.4647517800331116, + "rewards/symbolic_reward_partial_score/mean": 0.8780924081802368, + "rewards/symbolic_reward_partial_score/std": 
0.2119402289390564, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0778324604034424, + "sampling/importance_sampling_ratio/min": 1.234491810464533e-05, + "sampling/sampling_logp_difference/max": 11.302266120910645, + "sampling/sampling_logp_difference/mean": 0.14901325106620789, + "step": 901 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.3117760568857193, + "epoch": 2.373684210526316, + "grad_norm": 0.004368501249700785, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 902 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.30777251720428467, + "epoch": 2.376315789473684, + "grad_norm": 0.0013312747469171882, + "learning_rate": 1e-06, + "loss": -0.0025, + "step": 903 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.31223200261592865, + "epoch": 2.3789473684210525, + "grad_norm": 0.005191898439079523, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 387.095703125, + "completions/mean_terminated_length": 355.7906188964844, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.30744390189647675, + "epoch": 2.3815789473684212, + 
"frac_reward_zero_std": 0.71875, + "grad_norm": 0.008135558106005192, + "learning_rate": 1e-06, + "loss": 0.029, + "num_tokens": 210698722.0, + "reward": 0.6670783758163452, + "reward_std": 0.05189436301589012, + "rewards/progression_diversity/mean": -0.0011522338027134538, + "rewards/progression_diversity/std": 0.026072077453136444, + "rewards/symbolic_reward_accuracy/mean": 0.6640625, + "rewards/symbolic_reward_accuracy/std": 0.4727790653705597, + "rewards/symbolic_reward_partial_score/mean": 0.8955078125, + "rewards/symbolic_reward_partial_score/std": 0.1667071133852005, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0773719549179077, + "sampling/importance_sampling_ratio/min": 4.233967047184706e-05, + "sampling/sampling_logp_difference/max": 10.069786071777344, + "sampling/sampling_logp_difference/mean": 0.1469884216785431, + "step": 905 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.3112915903329849, + "epoch": 2.3842105263157896, + "grad_norm": 0.0049289437010884285, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 906 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.3160187900066376, + "epoch": 2.386842105263158, + "grad_norm": 0.006295245606452227, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 907 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.31758415699005127, + "epoch": 2.389473684210526, + "grad_norm": 0.0024000697303563356, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 908 + }, 
+ { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14656.0, + "completions/max_terminated_length": 14656.0, + "completions/mean_length": 387.697265625, + "completions/mean_terminated_length": 387.697265625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.307947114109993, + "epoch": 2.3921052631578945, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0064180451445281506, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 211330567.0, + "reward": 0.6816225051879883, + "reward_std": 0.10136310756206512, + "rewards/progression_diversity/mean": -0.0018144649220630527, + "rewards/progression_diversity/std": 0.04105665162205696, + "rewards/symbolic_reward_accuracy/mean": 0.685546875, + "rewards/symbolic_reward_accuracy/std": 0.4647517800331116, + "rewards/symbolic_reward_partial_score/mean": 0.9016927480697632, + "rewards/symbolic_reward_partial_score/std": 0.17035500705242157, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0767887830734253, + "sampling/importance_sampling_ratio/min": 3.258802098571323e-05, + "sampling/sampling_logp_difference/max": 10.331565856933594, + "sampling/sampling_logp_difference/mean": 0.14458686113357544, + "step": 909 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.31323176622390747, + "epoch": 2.3947368421052633, + "grad_norm": 0.0073518105782568455, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 910 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.03125, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.30697983503341675, + "epoch": 2.3973684210526316, + "grad_norm": 0.006437649950385094, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 911 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3090358376502991, + "epoch": 2.4, + "grad_norm": 0.00426133070141077, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 364.94921875, + "completions/mean_terminated_length": 364.94921875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.30903904139995575, + "epoch": 2.4026315789473682, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.007032964378595352, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 211941421.0, + "reward": 0.6682119965553284, + "reward_std": 0.07401406019926071, + "rewards/progression_diversity/mean": -9.134229185292497e-05, + "rewards/progression_diversity/std": 0.002066840184852481, + "rewards/symbolic_reward_accuracy/mean": 0.681640625, + "rewards/symbolic_reward_accuracy/std": 0.46629536151885986, + "rewards/symbolic_reward_partial_score/mean": 0.8640950918197632, + "rewards/symbolic_reward_partial_score/std": 0.22005455195903778, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.07720947265625, + "sampling/importance_sampling_ratio/min": 0.0013609235174953938, + "sampling/sampling_logp_difference/max": 6.5995917320251465, + 
"sampling/sampling_logp_difference/mean": 0.14810317754745483, + "step": 913 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3084554970264435, + "epoch": 2.405263157894737, + "grad_norm": 0.0028849325608462095, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 914 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.3103426396846771, + "epoch": 2.4078947368421053, + "grad_norm": 0.005803754087537527, + "learning_rate": 1e-06, + "loss": 0.0017, + "step": 915 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.3055001199245453, + "epoch": 2.4105263157894736, + "grad_norm": 0.0037809647619724274, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 358.556640625, + "completions/mean_terminated_length": 358.556640625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.3065197467803955, + "epoch": 2.413157894736842, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.009809279814362526, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 212527306.0, + "reward": 0.706250011920929, + "reward_std": 0.08705538511276245, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.720703125, + 
"rewards/symbolic_reward_accuracy/std": 0.44909247756004333, + "rewards/symbolic_reward_partial_score/mean": 0.9134114980697632, + "rewards/symbolic_reward_partial_score/std": 0.15249691903591156, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0773168802261353, + "sampling/importance_sampling_ratio/min": 0.0006119803292676806, + "sampling/sampling_logp_difference/max": 7.398810386657715, + "sampling/sampling_logp_difference/mean": 0.14674416184425354, + "step": 917 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.3042609840631485, + "epoch": 2.4157894736842107, + "grad_norm": 0.008614557795226574, + "learning_rate": 1e-06, + "loss": 0.0015, + "step": 918 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.3085095137357712, + "epoch": 2.418421052631579, + "grad_norm": 0.004739607684314251, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 919 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.30383746325969696, + "epoch": 2.4210526315789473, + "grad_norm": 0.0061707510612905025, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 355.4609375, + "completions/mean_terminated_length": 
355.4609375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.3117644786834717, + "epoch": 2.4236842105263157, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.008792375214397907, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 213128470.0, + "reward": 0.6263182163238525, + "reward_std": 0.07272778451442719, + "rewards/progression_diversity/mean": -1.6492393115186132e-05, + "rewards/progression_diversity/std": 0.0003731802280526608, + "rewards/symbolic_reward_accuracy/mean": 0.611328125, + "rewards/symbolic_reward_accuracy/std": 0.4879252314567566, + "rewards/symbolic_reward_partial_score/mean": 0.8650715947151184, + "rewards/symbolic_reward_partial_score/std": 0.18900856375694275, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0778659582138062, + "sampling/importance_sampling_ratio/min": 0.00012650190910790116, + "sampling/sampling_logp_difference/max": 8.975253105163574, + "sampling/sampling_logp_difference/mean": 0.14732477068901062, + "step": 921 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3068103492259979, + "epoch": 2.4263157894736844, + "grad_norm": 0.005262767896056175, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 922 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3068346828222275, + "epoch": 2.4289473684210527, + "grad_norm": 0.006233863532543182, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 923 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
0.1796875, + "entropy": 0.30970314145088196, + "epoch": 2.431578947368421, + "grad_norm": 0.004028057213872671, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 617.0, + "completions/max_terminated_length": 617.0, + "completions/mean_length": 364.669921875, + "completions/mean_terminated_length": 364.669921875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.3095943033695221, + "epoch": 2.4342105263157894, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.009729484096169472, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 213728813.0, + "reward": 0.7836426496505737, + "reward_std": 0.08076313883066177, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.837890625, + "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, + "rewards/symbolic_reward_partial_score/mean": 0.9363607168197632, + "rewards/symbolic_reward_partial_score/std": 0.15720035135746002, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0774470567703247, + "sampling/importance_sampling_ratio/min": 7.029231028354843e-07, + "sampling/sampling_logp_difference/max": 14.168018341064453, + "sampling/sampling_logp_difference/mean": 0.1473565697669983, + "step": 925 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.3090842068195343, + "epoch": 2.4368421052631577, + "grad_norm": 0.007715750485658646, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 926 + }, + { + 
"clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.3084794133901596, + "epoch": 2.4394736842105265, + "grad_norm": 0.005060678347945213, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 927 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.3028125911951065, + "epoch": 2.442105263157895, + "grad_norm": 0.007612347137182951, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 355.70703125, + "completions/mean_terminated_length": 355.70703125, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.31409741938114166, + "epoch": 2.444736842105263, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0024258929770439863, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 214313399.0, + "reward": 0.6506836414337158, + "reward_std": 0.04020765423774719, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.634765625, + "rewards/symbolic_reward_accuracy/std": 0.4819667339324951, + "rewards/symbolic_reward_partial_score/mean": 0.8994140625, + "rewards/symbolic_reward_partial_score/std": 0.15110193192958832, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0788359642028809, + "sampling/importance_sampling_ratio/min": 0.001144380308687687, + 
"sampling/sampling_logp_difference/max": 6.772891998291016, + "sampling/sampling_logp_difference/mean": 0.14749541878700256, + "step": 929 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.31147317588329315, + "epoch": 2.4473684210526314, + "grad_norm": 0.0049695298075675964, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 930 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.30958548188209534, + "epoch": 2.45, + "grad_norm": 0.002006982918828726, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 931 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.30286461114883423, + "epoch": 2.4526315789473685, + "grad_norm": 0.00774997565895319, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.0, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 356.365234375, + "completions/mean_terminated_length": 356.365234375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.31697480380535126, + "epoch": 2.455263157894737, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.008618047460913658, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 214924690.0, + "reward": 0.6026855707168579, + "reward_std": 0.07293908298015594, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + 
"rewards/symbolic_reward_accuracy/mean": 0.58984375, + "rewards/symbolic_reward_accuracy/std": 0.49234291911125183, + "rewards/symbolic_reward_partial_score/mean": 0.8292643427848816, + "rewards/symbolic_reward_partial_score/std": 0.22252783179283142, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0781829357147217, + "sampling/importance_sampling_ratio/min": 0.0011154379462823272, + "sampling/sampling_logp_difference/max": 6.798508167266846, + "sampling/sampling_logp_difference/mean": 0.14927701652050018, + "step": 933 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.30709315836429596, + "epoch": 2.457894736842105, + "grad_norm": 0.0025286353193223476, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 934 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.3124760091304779, + "epoch": 2.4605263157894735, + "grad_norm": 0.006756810937076807, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 935 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.31513579189777374, + "epoch": 2.463157894736842, + "grad_norm": 0.0030686540994793177, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 354.447265625, + 
"completions/mean_terminated_length": 354.447265625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.3113676607608795, + "epoch": 2.4657894736842105, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.007624417077749968, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 215512823.0, + "reward": 0.6218748092651367, + "reward_std": 0.11250467598438263, + "rewards/progression_diversity/mean": -2.46087020059349e-05, + "rewards/progression_diversity/std": 0.0005568313645198941, + "rewards/symbolic_reward_accuracy/mean": 0.615234375, + "rewards/symbolic_reward_accuracy/std": 0.4870156943798065, + "rewards/symbolic_reward_partial_score/mean": 0.8424479365348816, + "rewards/symbolic_reward_partial_score/std": 0.2174653708934784, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0776278972625732, + "sampling/importance_sampling_ratio/min": 2.9739478577539558e-06, + "sampling/sampling_logp_difference/max": 12.72562026977539, + "sampling/sampling_logp_difference/mean": 0.14833664894104004, + "step": 937 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.31023257970809937, + "epoch": 2.468421052631579, + "grad_norm": 0.007213308941572905, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 938 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3112625777721405, + "epoch": 2.4710526315789476, + "grad_norm": 0.006306259427219629, + "learning_rate": 1e-06, + "loss": 0.0015, + "step": 939 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0703125, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.3071421980857849, + "epoch": 2.473684210526316, + "grad_norm": 0.007035477552562952, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 350.392578125, + "completions/mean_terminated_length": 350.392578125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.3151005804538727, + "epoch": 2.4763157894736842, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.010886289179325104, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 216085184.0, + "reward": 0.7312005758285522, + "reward_std": 0.06138945370912552, + "rewards/progression_diversity/mean": -6.468767242040485e-05, + "rewards/progression_diversity/std": 0.0014637151034548879, + "rewards/symbolic_reward_accuracy/mean": 0.759765625, + "rewards/symbolic_reward_accuracy/std": 0.4276435375213623, + "rewards/symbolic_reward_partial_score/mean": 0.9178059697151184, + "rewards/symbolic_reward_partial_score/std": 0.16282695531845093, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0770689249038696, + "sampling/importance_sampling_ratio/min": 2.975442839669995e-06, + "sampling/sampling_logp_difference/max": 12.725117683410645, + "sampling/sampling_logp_difference/mean": 0.14930102229118347, + "step": 941 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.3122284412384033, + "epoch": 2.4789473684210526, + 
"grad_norm": 0.00767452223226428, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 942 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.31038355827331543, + "epoch": 2.481578947368421, + "grad_norm": 0.0039905463345348835, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 943 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.3104354441165924, + "epoch": 2.4842105263157896, + "grad_norm": 0.006206648889929056, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 617.0, + "completions/max_terminated_length": 617.0, + "completions/mean_length": 349.552734375, + "completions/mean_terminated_length": 349.552734375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.307844340801239, + "epoch": 2.486842105263158, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.005118571221828461, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 216655451.0, + "reward": 0.6357910633087158, + "reward_std": 0.08715735375881195, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.626953125, + "rewards/symbolic_reward_accuracy/std": 0.48408737778663635, + "rewards/symbolic_reward_partial_score/mean": 0.8653971552848816, + "rewards/symbolic_reward_partial_score/std": 0.18656498193740845, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.076431155204773, + "sampling/importance_sampling_ratio/min": 2.6057361424136616e-07, + "sampling/sampling_logp_difference/max": 15.160380363464355, + "sampling/sampling_logp_difference/mean": 0.1468237042427063, + "step": 945 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.30096369981765747, + "epoch": 2.4894736842105263, + "grad_norm": 0.008310060016810894, + "learning_rate": 1e-06, + "loss": 0.0045, + "step": 946 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.3050964027643204, + "epoch": 2.4921052631578946, + "grad_norm": 0.006288197357207537, + "learning_rate": 1e-06, + "loss": -0.0023, + "step": 947 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.3067568242549896, + "epoch": 2.4947368421052634, + "grad_norm": 0.008080433122813702, + "learning_rate": 1e-06, + "loss": -0.0019, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 352.97265625, + "completions/mean_terminated_length": 352.97265625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.30862340331077576, + "epoch": 2.4973684210526317, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.002505539683625102, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 217229293.0, + "reward": 0.7032715082168579, + "reward_std": 
0.05813654884696007, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.716796875, + "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, + "rewards/symbolic_reward_partial_score/mean": 0.91064453125, + "rewards/symbolic_reward_partial_score/std": 0.16002444922924042, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0768301486968994, + "sampling/importance_sampling_ratio/min": 0.000333755393512547, + "sampling/sampling_logp_difference/max": 8.005102157592773, + "sampling/sampling_logp_difference/mean": 0.14894793927669525, + "step": 949 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.310753732919693, + "epoch": 2.5, + "grad_norm": 0.002883767941966653, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 950 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.3066103011369705, + "epoch": 2.5026315789473683, + "grad_norm": 0.0024179958272725344, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 951 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3090369999408722, + "epoch": 2.5052631578947366, + "grad_norm": 0.0026873471215367317, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 696.0, + "completions/mean_length": 380.767578125, + "completions/mean_terminated_length": 349.4501037597656, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.3078317791223526, + "epoch": 2.5078947368421054, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.005132244899868965, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 217815542.0, + "reward": 0.7311402559280396, + "reward_std": 0.08192440122365952, + "rewards/progression_diversity/mean": -0.0012174799339845777, + "rewards/progression_diversity/std": 0.026926402002573013, + "rewards/symbolic_reward_accuracy/mean": 0.7578125, + "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, + "rewards/symbolic_reward_partial_score/mean": 0.9215494394302368, + "rewards/symbolic_reward_partial_score/std": 0.15562941133975983, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0754692554473877, + "sampling/importance_sampling_ratio/min": 2.2065607936383458e-06, + "sampling/sampling_logp_difference/max": 13.024075508117676, + "sampling/sampling_logp_difference/mean": 0.1453796923160553, + "step": 953 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.31123657524585724, + "epoch": 2.5105263157894737, + "grad_norm": 0.003947115037590265, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 954 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.3104862570762634, + "epoch": 2.513157894736842, + "grad_norm": 0.002622234169393778, + "learning_rate": 1e-06, + "loss": 0.028, + "step": 955 + }, + { + "clip_ratio/high_max": 
0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.30933481454849243, + "epoch": 2.515789473684211, + "grad_norm": 0.003103440860286355, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 344.740234375, + "completions/mean_terminated_length": 344.740234375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.3079196810722351, + "epoch": 2.518421052631579, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.004070614464581013, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 218374161.0, + "reward": 0.7833008170127869, + "reward_std": 0.07904037088155746, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.83203125, + "rewards/symbolic_reward_accuracy/std": 0.374204158782959, + "rewards/symbolic_reward_partial_score/mean": 0.9469400644302368, + "rewards/symbolic_reward_partial_score/std": 0.13283653557300568, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0777184963226318, + "sampling/importance_sampling_ratio/min": 7.50339386286214e-05, + "sampling/sampling_logp_difference/max": 9.497570037841797, + "sampling/sampling_logp_difference/mean": 0.14970475435256958, + "step": 957 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.3124520182609558, + 
"epoch": 2.5210526315789474, + "grad_norm": 0.003689341014251113, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 958 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.3111405521631241, + "epoch": 2.5236842105263158, + "grad_norm": 0.0028974253218621016, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 959 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3061821907758713, + "epoch": 2.526315789473684, + "grad_norm": 0.005521416198462248, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 346.318359375, + "completions/mean_terminated_length": 346.318359375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.3132731467485428, + "epoch": 2.5289473684210524, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.004160603508353233, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 218967796.0, + "reward": 0.6078125238418579, + "reward_std": 0.07761308550834656, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.58984375, + "rewards/symbolic_reward_accuracy/std": 0.49234291911125183, + "rewards/symbolic_reward_partial_score/mean": 0.8463541269302368, + "rewards/symbolic_reward_partial_score/std": 0.21938246488571167, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0782113075256348, + "sampling/importance_sampling_ratio/min": 0.0008083023130893707, + "sampling/sampling_logp_difference/max": 7.120574474334717, + "sampling/sampling_logp_difference/mean": 0.14856785535812378, + "step": 961 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.30516865849494934, + "epoch": 2.531578947368421, + "grad_norm": 0.006756477057933807, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 962 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3062412440776825, + "epoch": 2.5342105263157895, + "grad_norm": 0.0025619908701628447, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 963 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3087068498134613, + "epoch": 2.536842105263158, + "grad_norm": 0.0025542299263179302, + "learning_rate": 1e-06, + "loss": 0.0016, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 348.484375, + "completions/mean_terminated_length": 348.484375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.30975644290447235, + "epoch": 2.5394736842105265, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.00557674840092659, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 219543692.0, + 
"reward": 0.653564453125, + "reward_std": 0.044423237442970276, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.646484375, + "rewards/symbolic_reward_accuracy/std": 0.47852855920791626, + "rewards/symbolic_reward_partial_score/mean": 0.8855794668197632, + "rewards/symbolic_reward_partial_score/std": 0.1733708679676056, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0785977840423584, + "sampling/importance_sampling_ratio/min": 0.0008315914310514927, + "sampling/sampling_logp_difference/max": 7.092169284820557, + "sampling/sampling_logp_difference/mean": 0.14907319843769073, + "step": 965 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.3097044676542282, + "epoch": 2.542105263157895, + "grad_norm": 0.002045287750661373, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 966 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.31358230113983154, + "epoch": 2.544736842105263, + "grad_norm": 0.007611148990690708, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 967 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.31234195828437805, + "epoch": 2.5473684210526315, + "grad_norm": 0.006483261939138174, + "learning_rate": 1e-06, + "loss": 0.0017, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 348.541015625, + "completions/mean_terminated_length": 348.541015625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.310483381152153, + "epoch": 2.55, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.003465034533292055, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 220121281.0, + "reward": 0.6914063096046448, + "reward_std": 0.060588110238313675, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.70703125, + "rewards/symbolic_reward_accuracy/std": 0.455569326877594, + "rewards/symbolic_reward_partial_score/mean": 0.890625, + "rewards/symbolic_reward_partial_score/std": 0.18579120934009552, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0780143737792969, + "sampling/importance_sampling_ratio/min": 0.00011447574070189148, + "sampling/sampling_logp_difference/max": 9.07514762878418, + "sampling/sampling_logp_difference/mean": 0.1497250497341156, + "step": 969 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.30818693339824677, + "epoch": 2.5526315789473686, + "grad_norm": 0.0023296221625059843, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 970 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.3092123121023178, + "epoch": 2.555263157894737, + "grad_norm": 0.007309067994356155, + "learning_rate": 1e-06, + "loss": 0.0022, + "step": 971 + }, + { + "clip_ratio/high_max": 1.0, 
+ "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.3103838115930557, + "epoch": 2.557894736842105, + "grad_norm": 0.002875859383493662, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 341.330078125, + "completions/mean_terminated_length": 341.330078125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.30619047582149506, + "epoch": 2.5605263157894735, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.009937925264239311, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 220689162.0, + "reward": 0.7670894861221313, + "reward_std": 0.06431086361408234, + "rewards/progression_diversity/mean": -3.514744821586646e-05, + "rewards/progression_diversity/std": 0.0007952958694659173, + "rewards/symbolic_reward_accuracy/mean": 0.806640625, + "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, + "rewards/symbolic_reward_partial_score/mean": 0.9436848759651184, + "rewards/symbolic_reward_partial_score/std": 0.12426012754440308, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.077713966369629, + "sampling/importance_sampling_ratio/min": 0.002786138793453574, + "sampling/sampling_logp_difference/max": 5.883098602294922, + "sampling/sampling_logp_difference/mean": 0.14811402559280396, + "step": 973 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + 
"entropy": 0.31514872610569, + "epoch": 2.5631578947368423, + "grad_norm": 0.002241847338154912, + "learning_rate": 1e-06, + "loss": -0.0015, + "step": 974 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.3097695857286453, + "epoch": 2.5657894736842106, + "grad_norm": 0.006119919009506702, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 975 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.306579053401947, + "epoch": 2.568421052631579, + "grad_norm": 0.008558162488043308, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 340.4921875, + "completions/mean_terminated_length": 340.4921875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.3134208023548126, + "epoch": 2.5710526315789473, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.007445094641298056, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 221250118.0, + "reward": 0.6376464366912842, + "reward_std": 0.05477411672472954, + "rewards/progression_diversity/mean": -7.944680874061305e-06, + "rewards/progression_diversity/std": 0.00017976762319449335, + "rewards/symbolic_reward_accuracy/mean": 0.62109375, + "rewards/symbolic_reward_accuracy/std": 0.4855891764163971, + "rewards/symbolic_reward_partial_score/mean": 0.88330078125, + "rewards/symbolic_reward_partial_score/std": 0.16613253951072693, + "rewards/tag_count_reward/mean": 0.0, + 
"rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0782685279846191, + "sampling/importance_sampling_ratio/min": 0.0026720340829342604, + "sampling/sampling_logp_difference/max": 5.924915313720703, + "sampling/sampling_logp_difference/mean": 0.15010234713554382, + "step": 977 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.31288468837738037, + "epoch": 2.5736842105263156, + "grad_norm": 0.003037536283954978, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 978 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.3096996992826462, + "epoch": 2.5763157894736843, + "grad_norm": 0.0053139738738536835, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 979 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.3134246915578842, + "epoch": 2.5789473684210527, + "grad_norm": 0.004795863758772612, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 336.384765625, + "completions/mean_terminated_length": 336.384765625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.3105523884296417, + "epoch": 2.581578947368421, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.003965785726904869, + "learning_rate": 1e-06, + "loss": 
0.0003, + "num_tokens": 221824331.0, + "reward": 0.74609375, + "reward_std": 0.05215379595756531, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.787109375, + "rewards/symbolic_reward_accuracy/std": 0.409751296043396, + "rewards/symbolic_reward_partial_score/mean": 0.9127604365348816, + "rewards/symbolic_reward_partial_score/std": 0.17551816999912262, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0778260231018066, + "sampling/importance_sampling_ratio/min": 0.0006920514279045165, + "sampling/sampling_logp_difference/max": 7.275850296020508, + "sampling/sampling_logp_difference/mean": 0.1488214135169983, + "step": 981 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.30993202328681946, + "epoch": 2.5842105263157897, + "grad_norm": 0.0053926375694572926, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 982 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.30970150232315063, + "epoch": 2.586842105263158, + "grad_norm": 0.0028977948240935802, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 983 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.30929918587207794, + "epoch": 2.5894736842105264, + "grad_norm": 0.005668803583830595, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 862.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 344.669921875, + "completions/mean_terminated_length": 344.669921875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.3082158714532852, + "epoch": 2.5921052631578947, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.004866783507168293, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 222418626.0, + "reward": 0.6892090439796448, + "reward_std": 0.04069463908672333, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.705078125, + "rewards/symbolic_reward_accuracy/std": 0.4564536213874817, + "rewards/symbolic_reward_partial_score/mean": 0.88720703125, + "rewards/symbolic_reward_partial_score/std": 0.20415787398815155, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0783774852752686, + "sampling/importance_sampling_ratio/min": 0.004361480474472046, + "sampling/sampling_logp_difference/max": 5.434943675994873, + "sampling/sampling_logp_difference/mean": 0.1484195590019226, + "step": 985 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.30943770706653595, + "epoch": 2.594736842105263, + "grad_norm": 0.00852410402148962, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 986 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.30831897258758545, + "epoch": 2.5973684210526313, + "grad_norm": 0.003436718601733446, + "learning_rate": 1e-06, + "loss": 
-0.0003, + "step": 987 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.3049241006374359, + "epoch": 2.6, + "grad_norm": 0.007047669496387243, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 341.205078125, + "completions/mean_terminated_length": 341.205078125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.31349438428878784, + "epoch": 2.6026315789473684, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.004920005798339844, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 222963083.0, + "reward": 0.7357420921325684, + "reward_std": 0.031880468130111694, + "rewards/progression_diversity/mean": -1.5017576515674591e-05, + "rewards/progression_diversity/std": 0.00033980896114371717, + "rewards/symbolic_reward_accuracy/mean": 0.771484375, + "rewards/symbolic_reward_accuracy/std": 0.4202871024608612, + "rewards/symbolic_reward_partial_score/mean": 0.9095052480697632, + "rewards/symbolic_reward_partial_score/std": 0.17970143258571625, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0788280963897705, + "sampling/importance_sampling_ratio/min": 0.0024846631567925215, + "sampling/sampling_logp_difference/max": 5.997618198394775, + "sampling/sampling_logp_difference/mean": 0.15029466152191162, + "step": 989 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0234375, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.3132941573858261, + "epoch": 2.6052631578947367, + "grad_norm": 0.005506443325430155, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 990 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.31322436034679413, + "epoch": 2.6078947368421055, + "grad_norm": 0.0013167249271646142, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 991 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.31497417390346527, + "epoch": 2.610526315789474, + "grad_norm": 0.003209513844922185, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 339.025390625, + "completions/mean_terminated_length": 339.025390625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.3111710846424103, + "epoch": 2.613157894736842, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0031757066026329994, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 223535640.0, + "reward": 0.7018066644668579, + "reward_std": 0.05514729022979736, + "rewards/progression_diversity/mean": -2.5508261387585662e-06, + "rewards/progression_diversity/std": 5.771860742243007e-05, + "rewards/symbolic_reward_accuracy/mean": 0.71875, + "rewards/symbolic_reward_accuracy/std": 0.45004892349243164, + "rewards/symbolic_reward_partial_score/mean": 0.90185546875, + "rewards/symbolic_reward_partial_score/std": 
0.1845003068447113, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0782794952392578, + "sampling/importance_sampling_ratio/min": 0.0020101333502680063, + "sampling/sampling_logp_difference/max": 6.209554195404053, + "sampling/sampling_logp_difference/mean": 0.1487007737159729, + "step": 993 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.3111952245235443, + "epoch": 2.6157894736842104, + "grad_norm": 0.00311102787964046, + "learning_rate": 1e-06, + "loss": -0.0022, + "step": 994 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.3100218176841736, + "epoch": 2.6184210526315788, + "grad_norm": 0.008853343315422535, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 995 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.30624426901340485, + "epoch": 2.6210526315789475, + "grad_norm": 0.00726295355707407, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 338.2734375, + "completions/mean_terminated_length": 338.2734375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.3080877661705017, + "epoch": 2.623684210526316, + "frac_reward_zero_std": 0.6875, + "grad_norm": 
0.005093069281429052, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 224128164.0, + "reward": 0.6446288824081421, + "reward_std": 0.07044163346290588, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.634765625, + "rewards/symbolic_reward_accuracy/std": 0.4819667339324951, + "rewards/symbolic_reward_partial_score/mean": 0.8792317509651184, + "rewards/symbolic_reward_partial_score/std": 0.18442124128341675, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0780870914459229, + "sampling/importance_sampling_ratio/min": 0.001496361568570137, + "sampling/sampling_logp_difference/max": 6.504718780517578, + "sampling/sampling_logp_difference/mean": 0.1492321938276291, + "step": 997 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.3098251223564148, + "epoch": 2.626315789473684, + "grad_norm": 0.003775182878598571, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 998 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.30809755623340607, + "epoch": 2.6289473684210525, + "grad_norm": 0.005265187006443739, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 999 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.3101976066827774, + "epoch": 2.6315789473684212, + "grad_norm": 0.00613139383494854, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 343.287109375, + "completions/mean_terminated_length": 343.287109375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.31136079132556915, + "epoch": 2.6342105263157896, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.0061224219389259815, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 224689047.0, + "reward": 0.6758301258087158, + "reward_std": 0.10899586975574493, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.677734375, + "rewards/symbolic_reward_accuracy/std": 0.46780112385749817, + "rewards/symbolic_reward_partial_score/mean": 0.8972982168197632, + "rewards/symbolic_reward_partial_score/std": 0.16858573257923126, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.07841956615448, + "sampling/importance_sampling_ratio/min": 0.0017329779220744967, + "sampling/sampling_logp_difference/max": 6.357913970947266, + "sampling/sampling_logp_difference/mean": 0.1503494828939438, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.3088783025741577, + "epoch": 2.636842105263158, + "grad_norm": 0.0070188590325415134, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.30825355648994446, + "epoch": 2.639473684210526, + 
"grad_norm": 0.003072237130254507, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.31032636761665344, + "epoch": 2.6421052631578945, + "grad_norm": 0.0050218356773257256, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 344.0, + "completions/mean_terminated_length": 344.0, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.30877159535884857, + "epoch": 2.6447368421052633, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0041364701464772224, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 225278327.0, + "reward": 0.7000486850738525, + "reward_std": 0.07234279066324234, + "rewards/progression_diversity/mean": -1.8634102161740884e-05, + "rewards/progression_diversity/std": 0.0004216416273266077, + "rewards/symbolic_reward_accuracy/mean": 0.708984375, + "rewards/symbolic_reward_accuracy/std": 0.45467492938041687, + "rewards/symbolic_reward_partial_score/mean": 0.91552734375, + "rewards/symbolic_reward_partial_score/std": 0.15014779567718506, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0777002573013306, + "sampling/importance_sampling_ratio/min": 0.001144947949796915, + "sampling/sampling_logp_difference/max": 6.772396087646484, + "sampling/sampling_logp_difference/mean": 0.14913979172706604, + "step": 1005 + }, + { + "clip_ratio/high_max": 1.0, + 
"clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.3107386380434036, + "epoch": 2.6473684210526316, + "grad_norm": 0.005770625080913305, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.3138565123081207, + "epoch": 2.65, + "grad_norm": 0.0046647596172988415, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.3070129156112671, + "epoch": 2.6526315789473687, + "grad_norm": 0.01046669203788042, + "learning_rate": 1e-06, + "loss": 0.0017, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 341.783203125, + "completions/mean_terminated_length": 341.783203125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.3066733479499817, + "epoch": 2.655263157894737, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.006327769719064236, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 225847624.0, + "reward": 0.6906728148460388, + "reward_std": 0.05062294378876686, + "rewards/progression_diversity/mean": -0.0001056864857673645, + "rewards/progression_diversity/std": 0.00239141215570271, + "rewards/symbolic_reward_accuracy/mean": 0.70703125, + "rewards/symbolic_reward_accuracy/std": 0.455569326877594, + "rewards/symbolic_reward_partial_score/mean": 
0.88818359375, + "rewards/symbolic_reward_partial_score/std": 0.20053695142269135, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0784711837768555, + "sampling/importance_sampling_ratio/min": 0.0028838764410465956, + "sampling/sampling_logp_difference/max": 5.8486199378967285, + "sampling/sampling_logp_difference/mean": 0.14774033427238464, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.3070971816778183, + "epoch": 2.6578947368421053, + "grad_norm": 0.007127484772354364, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.3089192360639572, + "epoch": 2.6605263157894736, + "grad_norm": 0.0016588810831308365, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.3057337701320648, + "epoch": 2.663157894736842, + "grad_norm": 0.006851482670754194, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14683.0, + "completions/mean_length": 406.533203125, + "completions/mean_terminated_length": 375.2661437988281, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 
0.3020845800638199, + "epoch": 2.6657894736842103, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.008201662451028824, + "learning_rate": 1e-06, + "loss": 0.0252, + "num_tokens": 226443897.0, + "reward": 0.721895158290863, + "reward_std": 0.05035192146897316, + "rewards/progression_diversity/mean": -0.00287051172927022, + "rewards/progression_diversity/std": 0.04405975714325905, + "rewards/symbolic_reward_accuracy/mean": 0.748046875, + "rewards/symbolic_reward_accuracy/std": 0.43455907702445984, + "rewards/symbolic_reward_partial_score/mean": 0.9109700322151184, + "rewards/symbolic_reward_partial_score/std": 0.17348289489746094, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.073678970336914, + "sampling/importance_sampling_ratio/min": 0.0020475396886467934, + "sampling/sampling_logp_difference/max": 6.1911163330078125, + "sampling/sampling_logp_difference/mean": 0.14129182696342468, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.30159105360507965, + "epoch": 2.668421052631579, + "grad_norm": 0.004705225117504597, + "learning_rate": 1e-06, + "loss": 0.0296, + "step": 1014 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.3036012649536133, + "epoch": 2.6710526315789473, + "grad_norm": 0.002584053436294198, + "learning_rate": 1e-06, + "loss": -0.0018, + "step": 1015 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.3087976425886154, + "epoch": 2.6736842105263157, + "grad_norm": 
0.0061477916315197945, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 344.1484375, + "completions/mean_terminated_length": 344.1484375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.3104751855134964, + "epoch": 2.6763157894736844, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.00948393065482378, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 226990885.0, + "reward": 0.7349108457565308, + "reward_std": 0.06460164487361908, + "rewards/progression_diversity/mean": -0.0001266201288672164, + "rewards/progression_diversity/std": 0.0028650863096117973, + "rewards/symbolic_reward_accuracy/mean": 0.76171875, + "rewards/symbolic_reward_accuracy/std": 0.42644867300987244, + "rewards/symbolic_reward_partial_score/mean": 0.92626953125, + "rewards/symbolic_reward_partial_score/std": 0.14307039976119995, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0778229236602783, + "sampling/importance_sampling_ratio/min": 0.000825743016321212, + "sampling/sampling_logp_difference/max": 7.099226951599121, + "sampling/sampling_logp_difference/mean": 0.15059536695480347, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.309643492102623, + "epoch": 2.6789473684210527, + "grad_norm": 0.0018985444912686944, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1018 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 
0.171875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3100026994943619, + "epoch": 2.681578947368421, + "grad_norm": 0.0010688561014831066, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.30169081687927246, + "epoch": 2.6842105263157894, + "grad_norm": 0.006834503263235092, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 351.275390625, + "completions/mean_terminated_length": 351.275390625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.30858996510505676, + "epoch": 2.6868421052631577, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.007892133668065071, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 227591730.0, + "reward": 0.7020506858825684, + "reward_std": 0.07354225218296051, + "rewards/progression_diversity/mean": -1.1801354048657231e-05, + "rewards/progression_diversity/std": 0.0002670341345947236, + "rewards/symbolic_reward_accuracy/mean": 0.7109375, + "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, + "rewards/symbolic_reward_partial_score/mean": 0.9182942509651184, + "rewards/symbolic_reward_partial_score/std": 0.14587374031543732, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0771903991699219, + "sampling/importance_sampling_ratio/min": 0.0010955760953947902, + 
"sampling/sampling_logp_difference/max": 6.816474914550781, + "sampling/sampling_logp_difference/mean": 0.14943185448646545, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.30647024512290955, + "epoch": 2.6894736842105265, + "grad_norm": 0.0044861165806651115, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.30685843527317047, + "epoch": 2.692105263157895, + "grad_norm": 0.0015153492568060756, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1023 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.31040677428245544, + "epoch": 2.694736842105263, + "grad_norm": 0.0017616256373003125, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 345.9140625, + "completions/mean_terminated_length": 345.9140625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.29469770193099976, + "epoch": 2.6973684210526314, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00477286521345377, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 228152454.0, + "reward": 0.7686523795127869, + "reward_std": 0.113402359187603, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + 
"rewards/symbolic_reward_accuracy/mean": 0.822265625, + "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, + "rewards/symbolic_reward_partial_score/mean": 0.9176432490348816, + "rewards/symbolic_reward_partial_score/std": 0.18666434288024902, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0759270191192627, + "sampling/importance_sampling_ratio/min": 0.0006319702370092273, + "sampling/sampling_logp_difference/max": 7.366668224334717, + "sampling/sampling_logp_difference/mean": 0.14713361859321594, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.30262984335422516, + "epoch": 2.7, + "grad_norm": 0.007038114592432976, + "learning_rate": 1e-06, + "loss": 0.003, + "step": 1026 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.30532851815223694, + "epoch": 2.7026315789473685, + "grad_norm": 0.006812415551394224, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1027 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.30250802636146545, + "epoch": 2.705263157894737, + "grad_norm": 0.002279542852193117, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 342.79296875, + 
"completions/mean_terminated_length": 342.79296875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.30581048130989075, + "epoch": 2.707894736842105, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.011115007102489471, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 228731932.0, + "reward": 0.6307129263877869, + "reward_std": 0.0981566533446312, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.630859375, + "rewards/symbolic_reward_accuracy/std": 0.4830440282821655, + "rewards/symbolic_reward_partial_score/mean": 0.8406575918197632, + "rewards/symbolic_reward_partial_score/std": 0.23888207972049713, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0764038562774658, + "sampling/importance_sampling_ratio/min": 0.0026295094285160303, + "sampling/sampling_logp_difference/max": 5.940958023071289, + "sampling/sampling_logp_difference/mean": 0.14800997078418732, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.30258332192897797, + "epoch": 2.7105263157894735, + "grad_norm": 0.004817832726985216, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.30187736451625824, + "epoch": 2.713157894736842, + "grad_norm": 0.00844305194914341, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.15625, + "entropy": 0.3083288073539734, + "epoch": 2.7157894736842105, + "grad_norm": 0.004226917400956154, + "learning_rate": 1e-06, + "loss": -0.0015, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 941.0, + "completions/max_terminated_length": 941.0, + "completions/mean_length": 348.5625, + "completions/mean_terminated_length": 348.5625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.30741892755031586, + "epoch": 2.718421052631579, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.005195413250476122, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 229322044.0, + "reward": 0.5599607229232788, + "reward_std": 0.08515633642673492, + "rewards/progression_diversity/mean": -2.2997846826910973e-05, + "rewards/progression_diversity/std": 0.0005203818436712027, + "rewards/symbolic_reward_accuracy/mean": 0.5078125, + "rewards/symbolic_reward_accuracy/std": 0.5004279017448425, + "rewards/symbolic_reward_partial_score/mean": 0.8509114384651184, + "rewards/symbolic_reward_partial_score/std": 0.1921633780002594, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0783910751342773, + "sampling/importance_sampling_ratio/min": 0.00017082311387639493, + "sampling/sampling_logp_difference/max": 8.674881935119629, + "sampling/sampling_logp_difference/mean": 0.1485639214515686, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.30229929089546204, + "epoch": 2.7210526315789476, + "grad_norm": 0.005828053690493107, + 
"learning_rate": 1e-06, + "loss": 0.0002, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3022471219301224, + "epoch": 2.723684210526316, + "grad_norm": 0.002931118942797184, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1035 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3109671622514725, + "epoch": 2.7263157894736842, + "grad_norm": 0.0031470481771975756, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 342.962890625, + "completions/mean_terminated_length": 342.962890625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.3021458685398102, + "epoch": 2.7289473684210526, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.00692404480651021, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 229923145.0, + "reward": 0.698925793170929, + "reward_std": 0.05597582459449768, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.716796875, + "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, + "rewards/symbolic_reward_partial_score/mean": 0.8961588144302368, + "rewards/symbolic_reward_partial_score/std": 0.17904885113239288, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
1.076554298400879, + "sampling/importance_sampling_ratio/min": 0.0001375319843646139, + "sampling/sampling_logp_difference/max": 8.891654014587402, + "sampling/sampling_logp_difference/mean": 0.1469387412071228, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.30062244832515717, + "epoch": 2.731578947368421, + "grad_norm": 0.008866215124726295, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.30110302567481995, + "epoch": 2.734210526315789, + "grad_norm": 0.002656978787854314, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 1039 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.30177319049835205, + "epoch": 2.736842105263158, + "grad_norm": 0.0029455830808728933, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 347.380859375, + "completions/mean_terminated_length": 347.380859375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.307174950838089, + "epoch": 2.7394736842105263, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.006679740268737078, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 230498796.0, + "reward": 0.7644531726837158, + "reward_std": 0.03681304678320885, + 
"rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.80859375, + "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, + "rewards/symbolic_reward_partial_score/mean": 0.931640625, + "rewards/symbolic_reward_partial_score/std": 0.15179350972175598, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0770549774169922, + "sampling/importance_sampling_ratio/min": 0.00020917513757012784, + "sampling/sampling_logp_difference/max": 8.472338676452637, + "sampling/sampling_logp_difference/mean": 0.14685845375061035, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.2970338463783264, + "epoch": 2.7421052631578946, + "grad_norm": 0.0014519651886075735, + "learning_rate": 1e-06, + "loss": -0.0013, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.30320054292678833, + "epoch": 2.7447368421052634, + "grad_norm": 0.008143628016114235, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.30637988448143005, + "epoch": 2.7473684210526317, + "grad_norm": 0.0031961810309439898, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 370.67578125, + "completions/mean_terminated_length": 339.33856201171875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.2997948229312897, + "epoch": 2.75, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.0036635242868214846, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 231087718.0, + "reward": 0.6814371943473816, + "reward_std": 0.06737735867500305, + "rewards/progression_diversity/mean": -0.000813078077044338, + "rewards/progression_diversity/std": 0.018397856503725052, + "rewards/symbolic_reward_accuracy/mean": 0.701171875, + "rewards/symbolic_reward_accuracy/std": 0.45819199085235596, + "rewards/symbolic_reward_partial_score/mean": 0.8697916865348816, + "rewards/symbolic_reward_partial_score/std": 0.22192302346229553, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.074350118637085, + "sampling/importance_sampling_ratio/min": 9.22444343132156e-08, + "sampling/sampling_logp_difference/max": 16.198823928833008, + "sampling/sampling_logp_difference/mean": 0.14467072486877441, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.3004506677389145, + "epoch": 2.7526315789473683, + "grad_norm": 0.004989446606487036, + "learning_rate": 1e-06, + "loss": 0.0273, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.3001198023557663, + "epoch": 2.7552631578947366, + "grad_norm": 0.0067935604602098465, + "learning_rate": 1e-06, + "loss": 0.0022, 
+ "step": 1047 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.2999381124973297, + "epoch": 2.7578947368421054, + "grad_norm": 0.0061677745543420315, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 344.810546875, + "completions/mean_terminated_length": 344.810546875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.3047361671924591, + "epoch": 2.7605263157894737, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.005580128636211157, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 231680421.0, + "reward": 0.6251953840255737, + "reward_std": 0.09940549731254578, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.61328125, + "rewards/symbolic_reward_accuracy/std": 0.48747459053993225, + "rewards/symbolic_reward_partial_score/mean": 0.857421875, + "rewards/symbolic_reward_partial_score/std": 0.20798955857753754, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0773075819015503, + "sampling/importance_sampling_ratio/min": 0.0017322030616924167, + "sampling/sampling_logp_difference/max": 6.35836124420166, + "sampling/sampling_logp_difference/mean": 0.1478845775127411, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
0.125, + "entropy": 0.301644891500473, + "epoch": 2.763157894736842, + "grad_norm": 0.005946438293904066, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.30532458424568176, + "epoch": 2.765789473684211, + "grad_norm": 0.008235945366322994, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 1051 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.30299271643161774, + "epoch": 2.768421052631579, + "grad_norm": 0.0029075501952320337, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 343.134765625, + "completions/mean_terminated_length": 343.134765625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.3021633177995682, + "epoch": 2.7710526315789474, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.005222593899816275, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 232260074.0, + "reward": 0.7296382188796997, + "reward_std": 0.08588685095310211, + "rewards/progression_diversity/mean": -5.076556408312172e-05, + "rewards/progression_diversity/std": 0.0011486934963613749, + "rewards/symbolic_reward_accuracy/mean": 0.759765625, + "rewards/symbolic_reward_accuracy/std": 0.4276435375213623, + "rewards/symbolic_reward_partial_score/mean": 0.91259765625, + "rewards/symbolic_reward_partial_score/std": 0.17004802823066711, + "rewards/tag_count_reward/mean": 0.0, + 
"rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0771366357803345, + "sampling/importance_sampling_ratio/min": 6.993500778484929e-10, + "sampling/sampling_logp_difference/max": 21.080869674682617, + "sampling/sampling_logp_difference/mean": 0.14611807465553284, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.3046809434890747, + "epoch": 2.7736842105263158, + "grad_norm": 0.005133381113409996, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1054 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.29959002137184143, + "epoch": 2.776315789473684, + "grad_norm": 0.009239893406629562, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1055 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3024829030036926, + "epoch": 2.7789473684210524, + "grad_norm": 0.008001086302101612, + "learning_rate": 1e-06, + "loss": 0.0017, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 366.3203125, + "completions/mean_terminated_length": 334.97454833984375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.29493679106235504, + "epoch": 2.781578947368421, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.006747152656316757, + "learning_rate": 
1e-06, + "loss": 0.027, + "num_tokens": 232852942.0, + "reward": 0.6987649202346802, + "reward_std": 0.0806761085987091, + "rewards/progression_diversity/mean": -0.0014349485281854868, + "rewards/progression_diversity/std": 0.03182978555560112, + "rewards/symbolic_reward_accuracy/mean": 0.720703125, + "rewards/symbolic_reward_accuracy/std": 0.44909247756004333, + "rewards/symbolic_reward_partial_score/mean": 0.8878580927848816, + "rewards/symbolic_reward_partial_score/std": 0.20636910200119019, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0740900039672852, + "sampling/importance_sampling_ratio/min": 0.0002785032265819609, + "sampling/sampling_logp_difference/max": 8.186080932617188, + "sampling/sampling_logp_difference/mean": 0.1437458097934723, + "step": 1057 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.3012588322162628, + "epoch": 2.7842105263157895, + "grad_norm": 0.006597322411835194, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.30101022124290466, + "epoch": 2.786842105263158, + "grad_norm": 0.008424767293035984, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.30317388474941254, + "epoch": 2.7894736842105265, + "grad_norm": 0.0033637969754636288, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 333.837890625, + "completions/mean_terminated_length": 333.837890625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.29957854747772217, + "epoch": 2.792105263157895, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0058438642881810665, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 233455707.0, + "reward": 0.6902344226837158, + "reward_std": 0.06666649132966995, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.693359375, + "rewards/symbolic_reward_accuracy/std": 0.4615498185157776, + "rewards/symbolic_reward_partial_score/mean": 0.9140625, + "rewards/symbolic_reward_partial_score/std": 0.1394774317741394, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0762736797332764, + "sampling/importance_sampling_ratio/min": 0.0014469054294750094, + "sampling/sampling_logp_difference/max": 6.538328170776367, + "sampling/sampling_logp_difference/mean": 0.14617058634757996, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.29650571942329407, + "epoch": 2.794736842105263, + "grad_norm": 0.003479498904198408, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.3057336062192917, + "epoch": 2.7973684210526315, + "grad_norm": 
0.01066509634256363, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1063 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.3051034063100815, + "epoch": 2.8, + "grad_norm": 0.004155632574111223, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 358.43359375, + "completions/mean_terminated_length": 327.0724182128906, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.3072111904621124, + "epoch": 2.8026315789473686, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.0020747000817209482, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 234053881.0, + "reward": 0.6687940955162048, + "reward_std": 0.04760359972715378, + "rewards/progression_diversity/mean": -0.000473738502478227, + "rewards/progression_diversity/std": 0.00999171007424593, + "rewards/symbolic_reward_accuracy/mean": 0.6796875, + "rewards/symbolic_reward_accuracy/std": 0.4670529365539551, + "rewards/symbolic_reward_partial_score/mean": 0.87060546875, + "rewards/symbolic_reward_partial_score/std": 0.21114623546600342, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0746687650680542, + "sampling/importance_sampling_ratio/min": 0.006617438979446888, + "sampling/sampling_logp_difference/max": 5.018046855926514, + "sampling/sampling_logp_difference/mean": 0.14455817639827728, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.75, + 
"clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.3023747056722641, + "epoch": 2.805263157894737, + "grad_norm": 0.004723989870399237, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1066 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.3038836717605591, + "epoch": 2.807894736842105, + "grad_norm": 0.0013983779354020953, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.2940693497657776, + "epoch": 2.8105263157894735, + "grad_norm": 0.0014218107098713517, + "learning_rate": 1e-06, + "loss": 0.0293, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 325.5859375, + "completions/mean_terminated_length": 325.5859375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.30475354194641113, + "epoch": 2.8131578947368423, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.0021612788550555706, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 234609189.0, + "reward": 0.7227051258087158, + "reward_std": 0.0341927669942379, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.74609375, + "rewards/symbolic_reward_accuracy/std": 0.43567025661468506, + "rewards/symbolic_reward_partial_score/mean": 0.9168294668197632, + 
"rewards/symbolic_reward_partial_score/std": 0.15962786972522736, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0750136375427246, + "sampling/importance_sampling_ratio/min": 0.0031064762733876705, + "sampling/sampling_logp_difference/max": 5.774266242980957, + "sampling/sampling_logp_difference/mean": 0.14548389613628387, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2990523725748062, + "epoch": 2.8157894736842106, + "grad_norm": 0.006843502167612314, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.2972310781478882, + "epoch": 2.818421052631579, + "grad_norm": 0.003569354536011815, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.30257561802864075, + "epoch": 2.8210526315789473, + "grad_norm": 0.0011047772131860256, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1141.0, + "completions/max_terminated_length": 1141.0, + "completions/mean_length": 342.23828125, + "completions/mean_terminated_length": 342.23828125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.29948727786540985, + "epoch": 
2.8236842105263156, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.004726483020931482, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 235189567.0, + "reward": 0.7009764909744263, + "reward_std": 0.09700538963079453, + "rewards/progression_diversity/mean": -1.0271490282320883e-05, + "rewards/progression_diversity/std": 0.00023241728194989264, + "rewards/symbolic_reward_accuracy/mean": 0.716796875, + "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, + "rewards/symbolic_reward_partial_score/mean": 0.9029948115348816, + "rewards/symbolic_reward_partial_score/std": 0.17346768081188202, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0757352113723755, + "sampling/importance_sampling_ratio/min": 0.00030262480140663683, + "sampling/sampling_logp_difference/max": 8.10301685333252, + "sampling/sampling_logp_difference/mean": 0.1474587321281433, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.29904985427856445, + "epoch": 2.8263157894736843, + "grad_norm": 0.005854406394064426, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1074 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3006422519683838, + "epoch": 2.8289473684210527, + "grad_norm": 0.002233149018138647, + "learning_rate": 1e-06, + "loss": -0.0016, + "step": 1075 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.30377885699272156, + "epoch": 2.831578947368421, + "grad_norm": 0.002246702089905739, + "learning_rate": 1e-06, 
+ "loss": 0.0018, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 330.958984375, + "completions/mean_terminated_length": 330.958984375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.3022722750902176, + "epoch": 2.8342105263157897, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.006821685470640659, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 235734794.0, + "reward": 0.7696281671524048, + "reward_std": 0.053564704954624176, + "rewards/progression_diversity/mean": -7.848920358810574e-05, + "rewards/progression_diversity/std": 0.0012581268092617393, + "rewards/symbolic_reward_accuracy/mean": 0.814453125, + "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, + "rewards/symbolic_reward_partial_score/mean": 0.9365234375, + "rewards/symbolic_reward_partial_score/std": 0.15159018337726593, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0741266012191772, + "sampling/importance_sampling_ratio/min": 0.002895012963563204, + "sampling/sampling_logp_difference/max": 5.844765663146973, + "sampling/sampling_logp_difference/mean": 0.14536406099796295, + "step": 1077 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.2940616309642792, + "epoch": 2.836842105263158, + "grad_norm": 0.0018197406316176057, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.046875, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.2982686161994934, + "epoch": 2.8394736842105264, + "grad_norm": 0.0015975015703588724, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1079 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.2993506193161011, + "epoch": 2.8421052631578947, + "grad_norm": 0.0066468799486756325, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1101.0, + "completions/max_terminated_length": 1101.0, + "completions/mean_length": 334.35546875, + "completions/mean_terminated_length": 334.35546875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.29797013103961945, + "epoch": 2.844736842105263, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.007863621227443218, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 236330464.0, + "reward": 0.7291990518569946, + "reward_std": 0.08173226565122604, + "rewards/progression_diversity/mean": -1.640707705519162e-05, + "rewards/progression_diversity/std": 0.00037124980008229613, + "rewards/symbolic_reward_accuracy/mean": 0.755859375, + "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, + "rewards/symbolic_reward_partial_score/mean": 0.9189453125, + "rewards/symbolic_reward_partial_score/std": 0.1701214611530304, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0763745307922363, + "sampling/importance_sampling_ratio/min": 0.0045486390590667725, + "sampling/sampling_logp_difference/max": 5.392927169799805, 
+ "sampling/sampling_logp_difference/mean": 0.14487287402153015, + "step": 1081 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.2998821288347244, + "epoch": 2.8473684210526313, + "grad_norm": 0.012033012695610523, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3000533878803253, + "epoch": 2.85, + "grad_norm": 0.004561115987598896, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1083 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.29871946573257446, + "epoch": 2.8526315789473684, + "grad_norm": 0.002735659945756197, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 324.82421875, + "completions/mean_terminated_length": 324.82421875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.29053474962711334, + "epoch": 2.8552631578947367, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.004544893279671669, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 236888230.0, + "reward": 0.7167476415634155, + "reward_std": 0.08542241156101227, + "rewards/progression_diversity/mean": -4.313869430916384e-05, + "rewards/progression_diversity/std": 0.0007985467091202736, + "rewards/symbolic_reward_accuracy/mean": 
0.732421875, + "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, + "rewards/symbolic_reward_partial_score/mean": 0.92431640625, + "rewards/symbolic_reward_partial_score/std": 0.14619389176368713, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0729988813400269, + "sampling/importance_sampling_ratio/min": 0.010550652630627155, + "sampling/sampling_logp_difference/max": 4.551567554473877, + "sampling/sampling_logp_difference/mean": 0.14252474904060364, + "step": 1085 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.29088248312473297, + "epoch": 2.8578947368421055, + "grad_norm": 0.004123357590287924, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1086 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.2879483252763748, + "epoch": 2.860526315789474, + "grad_norm": 0.007720407098531723, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.2954999506473541, + "epoch": 2.863157894736842, + "grad_norm": 0.0022629539016634226, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 331.248046875, + "completions/mean_terminated_length": 331.248046875, + 
"completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.29624035954475403, + "epoch": 2.8657894736842104, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.009815968573093414, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 237464645.0, + "reward": 0.759228527545929, + "reward_std": 0.08564972877502441, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.80078125, + "rewards/symbolic_reward_accuracy/std": 0.39980348944664, + "rewards/symbolic_reward_partial_score/mean": 0.92919921875, + "rewards/symbolic_reward_partial_score/std": 0.162931889295578, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.074161410331726, + "sampling/importance_sampling_ratio/min": 1.4037870641914196e-05, + "sampling/sampling_logp_difference/max": 11.173751831054688, + "sampling/sampling_logp_difference/mean": 0.14445924758911133, + "step": 1089 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.2950233966112137, + "epoch": 2.8684210526315788, + "grad_norm": 0.0048875752836465836, + "learning_rate": 1e-06, + "loss": -0.0021, + "step": 1090 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.2931629717350006, + "epoch": 2.8710526315789475, + "grad_norm": 0.0026949893217533827, + "learning_rate": 1e-06, + "loss": 0.002, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.29851092398166656, + "epoch": 
2.873684210526316, + "grad_norm": 0.004008137155324221, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 332.041015625, + "completions/mean_terminated_length": 332.041015625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.2946535646915436, + "epoch": 2.876315789473684, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.003926341887563467, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 238041626.0, + "reward": 0.749999463558197, + "reward_std": 0.073408342897892, + "rewards/progression_diversity/mean": -5.6169395975302905e-05, + "rewards/progression_diversity/std": 0.00099645322188735, + "rewards/symbolic_reward_accuracy/mean": 0.779296875, + "rewards/symbolic_reward_accuracy/std": 0.4151262938976288, + "rewards/symbolic_reward_partial_score/mean": 0.94140625, + "rewards/symbolic_reward_partial_score/std": 0.1294398009777069, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0737342834472656, + "sampling/importance_sampling_ratio/min": 0.002836739644408226, + "sampling/sampling_logp_difference/max": 5.865099906921387, + "sampling/sampling_logp_difference/mean": 0.14539852738380432, + "step": 1093 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.29140058159828186, + "epoch": 2.8789473684210525, + "grad_norm": 0.003920792136341333, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 1094 + }, + { + "clip_ratio/high_max": 
0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.2940357029438019, + "epoch": 2.8815789473684212, + "grad_norm": 0.008678693324327469, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1095 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.29278014600276947, + "epoch": 2.8842105263157896, + "grad_norm": 0.0035982029512524605, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 326.4765625, + "completions/mean_terminated_length": 326.4765625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.2947344183921814, + "epoch": 2.886842105263158, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.007296448573470116, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 238592558.0, + "reward": 0.7330079078674316, + "reward_std": 0.05293193459510803, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.75390625, + "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, + "rewards/symbolic_reward_partial_score/mean": 0.935546875, + "rewards/symbolic_reward_partial_score/std": 0.12586505711078644, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0746783018112183, + "sampling/importance_sampling_ratio/min": 0.0004666089080274105, + 
"sampling/sampling_logp_difference/max": 7.670019149780273, + "sampling/sampling_logp_difference/mean": 0.1425882875919342, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.2921011298894882, + "epoch": 2.889473684210526, + "grad_norm": 0.00487733306363225, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.29850101470947266, + "epoch": 2.8921052631578945, + "grad_norm": 0.003617974929511547, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.2937781363725662, + "epoch": 2.8947368421052633, + "grad_norm": 0.002885440830141306, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1201.0, + "completions/max_terminated_length": 1201.0, + "completions/mean_length": 336.349609375, + "completions/mean_terminated_length": 336.349609375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.292828232049942, + "epoch": 2.8973684210526316, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.010650069452822208, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 239156225.0, + "reward": 0.7841760516166687, + "reward_std": 0.08980467170476913, + "rewards/progression_diversity/mean": -0.0003705902199726552, + "rewards/progression_diversity/std": 
0.008385499007999897, + "rewards/symbolic_reward_accuracy/mean": 0.83203125, + "rewards/symbolic_reward_accuracy/std": 0.374204158782959, + "rewards/symbolic_reward_partial_score/mean": 0.9498697519302368, + "rewards/symbolic_reward_partial_score/std": 0.12645751237869263, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0728933811187744, + "sampling/importance_sampling_ratio/min": 0.0003060997696593404, + "sampling/sampling_logp_difference/max": 8.091599464416504, + "sampling/sampling_logp_difference/mean": 0.1435360312461853, + "step": 1101 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.2941160798072815, + "epoch": 2.9, + "grad_norm": 0.004061248153448105, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1102 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2895169258117676, + "epoch": 2.9026315789473687, + "grad_norm": 0.002520245499908924, + "learning_rate": 1e-06, + "loss": 0.0017, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.2936854809522629, + "epoch": 2.905263157894737, + "grad_norm": 0.0031915016006678343, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 322.76171875, + 
"completions/mean_terminated_length": 322.76171875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.2948418855667114, + "epoch": 2.9078947368421053, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.007212890312075615, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 239736295.0, + "reward": 0.7425780296325684, + "reward_std": 0.08646564930677414, + "rewards/progression_diversity/mean": -1.5266872651409358e-05, + "rewards/progression_diversity/std": 0.00034544989466667175, + "rewards/symbolic_reward_accuracy/mean": 0.771484375, + "rewards/symbolic_reward_accuracy/std": 0.4202871024608612, + "rewards/symbolic_reward_partial_score/mean": 0.9322916269302368, + "rewards/symbolic_reward_partial_score/std": 0.13940434157848358, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.073547601699829, + "sampling/importance_sampling_ratio/min": 0.0009037154377438128, + "sampling/sampling_logp_difference/max": 7.00899600982666, + "sampling/sampling_logp_difference/mean": 0.14504487812519073, + "step": 1105 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2899184674024582, + "epoch": 2.9105263157894736, + "grad_norm": 0.0034633921459317207, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1106 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.29138121008872986, + "epoch": 2.913157894736842, + "grad_norm": 0.0033394438214600086, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0234375, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.292303666472435, + "epoch": 2.9157894736842103, + "grad_norm": 0.004198621492832899, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 327.3125, + "completions/mean_terminated_length": 327.3125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.29678307473659515, + "epoch": 2.918421052631579, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.00880721490830183, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 240292327.0, + "reward": 0.7222640514373779, + "reward_std": 0.05592235177755356, + "rewards/progression_diversity/mean": -0.00015940384764689952, + "rewards/progression_diversity/std": 0.00271405978128314, + "rewards/symbolic_reward_accuracy/mean": 0.748046875, + "rewards/symbolic_reward_accuracy/std": 0.43455907702445984, + "rewards/symbolic_reward_partial_score/mean": 0.9114583134651184, + "rewards/symbolic_reward_partial_score/std": 0.1814991533756256, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.073778510093689, + "sampling/importance_sampling_ratio/min": 6.552357808686793e-05, + "sampling/sampling_logp_difference/max": 9.633100509643555, + "sampling/sampling_logp_difference/mean": 0.14584583044052124, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.29646173119544983, + "epoch": 2.9210526315789473, + "grad_norm": 
0.0014886661665514112, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1110 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.29257290065288544, + "epoch": 2.9236842105263157, + "grad_norm": 0.00437309592962265, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.2957308143377304, + "epoch": 2.9263157894736844, + "grad_norm": 0.0028557151090353727, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 327.962890625, + "completions/mean_terminated_length": 327.962890625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.2977665513753891, + "epoch": 2.9289473684210527, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.0022349138744175434, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 240855892.0, + "reward": 0.7024399042129517, + "reward_std": 0.05233702063560486, + "rewards/progression_diversity/mean": -0.00014923579874448478, + "rewards/progression_diversity/std": 0.0023670888040214777, + "rewards/symbolic_reward_accuracy/mean": 0.71875, + "rewards/symbolic_reward_accuracy/std": 0.45004892349243164, + "rewards/symbolic_reward_partial_score/mean": 0.9039713740348816, + "rewards/symbolic_reward_partial_score/std": 0.1682942807674408, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 
2.0, + "sampling/importance_sampling_ratio/mean": 1.0742888450622559, + "sampling/importance_sampling_ratio/min": 0.0002084570296574384, + "sampling/sampling_logp_difference/max": 8.475777626037598, + "sampling/sampling_logp_difference/mean": 0.14589810371398926, + "step": 1113 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.29716482758522034, + "epoch": 2.931578947368421, + "grad_norm": 0.003905137535184622, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 1114 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.29154519736766815, + "epoch": 2.9342105263157894, + "grad_norm": 0.0021905223838984966, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1115 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.29408158361911774, + "epoch": 2.9368421052631577, + "grad_norm": 0.004286495503038168, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 328.890625, + "completions/mean_terminated_length": 328.890625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.2955780029296875, + "epoch": 2.9394736842105265, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.009008477441966534, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 241443612.0, + "reward": 0.6625969409942627, + 
"reward_std": 0.04375801980495453, + "rewards/progression_diversity/mean": -7.874615403125063e-05, + "rewards/progression_diversity/std": 0.0017818220658227801, + "rewards/symbolic_reward_accuracy/mean": 0.66015625, + "rewards/symbolic_reward_accuracy/std": 0.4741191864013672, + "rewards/symbolic_reward_partial_score/mean": 0.8883463144302368, + "rewards/symbolic_reward_partial_score/std": 0.17636752128601074, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0732474327087402, + "sampling/importance_sampling_ratio/min": 0.0015634673181921244, + "sampling/sampling_logp_difference/max": 6.460849285125732, + "sampling/sampling_logp_difference/mean": 0.14395904541015625, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.28961172699928284, + "epoch": 2.942105263157895, + "grad_norm": 0.0067161088809370995, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1118 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.2924419641494751, + "epoch": 2.944736842105263, + "grad_norm": 0.004618462640792131, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.29112420976161957, + "epoch": 2.9473684210526314, + "grad_norm": 0.0027965642511844635, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 321.8125, + "completions/mean_terminated_length": 321.8125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.2882765978574753, + "epoch": 2.95, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.003613464767113328, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 242024220.0, + "reward": 0.6606934070587158, + "reward_std": 0.06243230402469635, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.65625, + "rewards/symbolic_reward_accuracy/std": 0.4754233956336975, + "rewards/symbolic_reward_partial_score/mean": 0.8898111581802368, + "rewards/symbolic_reward_partial_score/std": 0.17068804800510406, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0739712715148926, + "sampling/importance_sampling_ratio/min": 0.00012918752327095717, + "sampling/sampling_logp_difference/max": 8.954245567321777, + "sampling/sampling_logp_difference/mean": 0.14313524961471558, + "step": 1121 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.2939921021461487, + "epoch": 2.9526315789473685, + "grad_norm": 0.0043477327562868595, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.28974662721157074, + "epoch": 2.955263157894737, + "grad_norm": 0.005428643431514502, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1123 + }, + { + "clip_ratio/high_max": 1.0, + 
"clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.2926223576068878, + "epoch": 2.957894736842105, + "grad_norm": 0.0011934597278013825, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 315.794921875, + "completions/mean_terminated_length": 315.794921875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.28804731369018555, + "epoch": 2.9605263157894735, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.007428732700645924, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 242597395.0, + "reward": 0.7770508527755737, + "reward_std": 0.08177044987678528, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.8203125, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.9495442509651184, + "rewards/symbolic_reward_partial_score/std": 0.1216139942407608, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0729912519454956, + "sampling/importance_sampling_ratio/min": 0.000365082873031497, + "sampling/sampling_logp_difference/max": 7.915386199951172, + "sampling/sampling_logp_difference/mean": 0.1422921121120453, + "step": 1125 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.2904288172721863, + "epoch": 
2.963157894736842, + "grad_norm": 0.006175660528242588, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1126 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.28833121061325073, + "epoch": 2.9657894736842105, + "grad_norm": 0.0024276673793792725, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.29089853167533875, + "epoch": 2.968421052631579, + "grad_norm": 0.002199800219386816, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 382.857421875, + "completions/mean_terminated_length": 320.10784912109375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.28097280859947205, + "epoch": 2.9710526315789476, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0047841547057032585, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 243197226.0, + "reward": 0.6922287940979004, + "reward_std": 0.10817395895719528, + "rewards/progression_diversity/mean": -0.0007603330886922777, + "rewards/progression_diversity/std": 0.015662960708141327, + "rewards/symbolic_reward_accuracy/mean": 0.708984375, + "rewards/symbolic_reward_accuracy/std": 0.45467492938041687, + "rewards/symbolic_reward_partial_score/mean": 0.8907877802848816, + "rewards/symbolic_reward_partial_score/std": 0.19231903553009033, + "rewards/tag_count_reward/mean": -0.00390625, + 
"rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.071429967880249, + "sampling/importance_sampling_ratio/min": 0.001035291119478643, + "sampling/sampling_logp_difference/max": 6.873072624206543, + "sampling/sampling_logp_difference/mean": 0.1391676664352417, + "step": 1129 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.29636819660663605, + "epoch": 2.973684210526316, + "grad_norm": 0.005953166633844376, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1130 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.2874244600534439, + "epoch": 2.9763157894736842, + "grad_norm": 0.0069323936477303505, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1131 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.292653352022171, + "epoch": 2.9789473684210526, + "grad_norm": 0.008772444911301136, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 655.0, + "completions/mean_length": 381.43359375, + "completions/mean_terminated_length": 318.6784362792969, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.2889240086078644, + "epoch": 2.981578947368421, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.01059815101325512, + 
"learning_rate": 1e-06, + "loss": 0.0317, + "num_tokens": 243830536.0, + "reward": 0.6876782178878784, + "reward_std": 0.1107964962720871, + "rewards/progression_diversity/mean": -0.0017093454953283072, + "rewards/progression_diversity/std": 0.03062625229358673, + "rewards/symbolic_reward_accuracy/mean": 0.6953125, + "rewards/symbolic_reward_accuracy/std": 0.4607250988483429, + "rewards/symbolic_reward_partial_score/mean": 0.9029947519302368, + "rewards/symbolic_reward_partial_score/std": 0.1694251000881195, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0704015493392944, + "sampling/importance_sampling_ratio/min": 0.00027761273668147624, + "sampling/sampling_logp_difference/max": 8.18928337097168, + "sampling/sampling_logp_difference/mean": 0.1361396610736847, + "step": 1133 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2885236442089081, + "epoch": 2.984210526315789, + "grad_norm": 0.0028568825218826532, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1134 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.28880971670150757, + "epoch": 2.986842105263158, + "grad_norm": 0.0019433270208537579, + "learning_rate": 1e-06, + "loss": -0.0019, + "step": 1135 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.28411930799484253, + "epoch": 2.9894736842105263, + "grad_norm": 0.006811541970819235, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 317.9765625, + "completions/mean_terminated_length": 317.9765625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.29524092376232147, + "epoch": 2.9921052631578946, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0021421704441308975, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 244400316.0, + "reward": 0.8205077648162842, + "reward_std": 0.03574543446302414, + "rewards/progression_diversity/mean": -6.387043413269566e-06, + "rewards/progression_diversity/std": 0.0001445223024347797, + "rewards/symbolic_reward_accuracy/mean": 0.88671875, + "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, + "rewards/symbolic_reward_partial_score/mean": 0.9615885019302368, + "rewards/symbolic_reward_partial_score/std": 0.13740219175815582, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0735294818878174, + "sampling/importance_sampling_ratio/min": 0.0010848650708794594, + "sampling/sampling_logp_difference/max": 6.826299667358398, + "sampling/sampling_logp_difference/mean": 0.14254550635814667, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.29247477650642395, + "epoch": 2.9947368421052634, + "grad_norm": 0.0035705517511814833, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + 
"entropy": 0.29122480750083923, + "epoch": 2.9973684210526317, + "grad_norm": 0.003994923550635576, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.2921169400215149, + "epoch": 3.0, + "grad_norm": 0.0015517091378569603, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1140 + }, + { + "epoch": 3.0, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.000244140625, + "eval_completions/max_length": 1206.28125, + "eval_completions/max_terminated_length": 711.5625, + "eval_completions/mean_length": 320.849609375, + "eval_completions/mean_terminated_length": 316.92619037628174, + "eval_completions/min_length": 157.03125, + "eval_completions/min_terminated_length": 157.03125, + "eval_entropy": 0.28882430028170347, + "eval_frac_reward_zero_std": 0.77734375, + "eval_loss": 0.0005815211334265769, + "eval_num_tokens": 244400316.0, + "eval_reward": 0.7942132484167814, + "eval_reward_std": 0.044389907372533344, + "eval_rewards/progression_diversity/mean": -6.53385166060616e-05, + "eval_rewards/progression_diversity/std": 0.0007133460931072477, + "eval_rewards/symbolic_reward_accuracy/mean": 0.85400390625, + "eval_rewards/symbolic_reward_accuracy/std": 0.3091083026956767, + "eval_rewards/symbolic_reward_partial_score/mean": 0.942626953125, + "eval_rewards/symbolic_reward_partial_score/std": 0.13389483519131318, + "eval_rewards/tag_count_reward/mean": -0.009765625, + "eval_rewards/tag_count_reward/std": 0.04943125694990158, + "eval_runtime": 168.3957, + "eval_samples_per_second": 1.485, + "eval_sampling/importance_sampling_ratio/max": 2.0, + "eval_sampling/importance_sampling_ratio/mean": 1.071236602962017, + 
"eval_sampling/importance_sampling_ratio/min": 0.003390098211092703, + "eval_sampling/sampling_logp_difference/max": 20.573874562978745, + "eval_sampling/sampling_logp_difference/mean": 0.14651945093646646, + "eval_steps_per_second": 0.012, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 315.07421875, + "completions/mean_terminated_length": 315.07421875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.28790298104286194, + "epoch": 3.0026315789473683, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.010073988698422909, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 244965442.0, + "reward": 0.741455078125, + "reward_std": 0.10208778083324432, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.771484375, + "rewards/symbolic_reward_accuracy/std": 0.4202871024608612, + "rewards/symbolic_reward_partial_score/mean": 0.9285481572151184, + "rewards/symbolic_reward_partial_score/std": 0.14986184239387512, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0723671913146973, + "sampling/importance_sampling_ratio/min": 0.0002522034337744117, + "sampling/sampling_logp_difference/max": 8.285274505615234, + "sampling/sampling_logp_difference/mean": 0.14196112751960754, + "step": 1141 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2821872681379318, + "epoch": 3.0052631578947366, + "grad_norm": 
0.010949229821562767, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1142 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.29011546075344086, + "epoch": 3.0078947368421054, + "grad_norm": 0.01058491412550211, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1143 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.28657861053943634, + "epoch": 3.0105263157894737, + "grad_norm": 0.006004285998642445, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 322.423828125, + "completions/mean_terminated_length": 322.423828125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.28619617223739624, + "epoch": 3.013157894736842, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.005281769670546055, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 245537499.0, + "reward": 0.7984375357627869, + "reward_std": 0.04122573137283325, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.85546875, + "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, + "rewards/symbolic_reward_partial_score/mean": 0.9505208730697632, + "rewards/symbolic_reward_partial_score/std": 0.12660686671733856, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0715934038162231, + "sampling/importance_sampling_ratio/min": 0.0029423898085951805, + "sampling/sampling_logp_difference/max": 5.828533172607422, + "sampling/sampling_logp_difference/mean": 0.14253807067871094, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.2869596779346466, + "epoch": 3.0157894736842104, + "grad_norm": 0.0015528675867244601, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.2859642207622528, + "epoch": 3.018421052631579, + "grad_norm": 0.009877209551632404, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1147 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.2844521105289459, + "epoch": 3.0210526315789474, + "grad_norm": 0.00118832488078624, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 319.44921875, + "completions/mean_terminated_length": 319.44921875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.2846902459859848, + "epoch": 3.0236842105263158, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.003452679142355919, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 246112705.0, + "reward": 0.7942871451377869, + 
"reward_std": 0.06831326335668564, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.85546875, + "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, + "rewards/symbolic_reward_partial_score/mean": 0.9366861581802368, + "rewards/symbolic_reward_partial_score/std": 0.1584509164094925, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0727237462997437, + "sampling/importance_sampling_ratio/min": 0.00042281081550754607, + "sampling/sampling_logp_difference/max": 7.768585681915283, + "sampling/sampling_logp_difference/mean": 0.142282634973526, + "step": 1149 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.28898128867149353, + "epoch": 3.026315789473684, + "grad_norm": 0.002851970260962844, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.28988291323184967, + "epoch": 3.028947368421053, + "grad_norm": 0.0016596262576058507, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.28207847476005554, + "epoch": 3.031578947368421, + "grad_norm": 0.0025477732997387648, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 320.615234375, + "completions/mean_terminated_length": 320.615234375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.2825598418712616, + "epoch": 3.0342105263157895, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.003103437600657344, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 246672988.0, + "reward": 0.810498058795929, + "reward_std": 0.0731193870306015, + "rewards/progression_diversity/mean": -1.2725857914119842e-06, + "rewards/progression_diversity/std": 2.8795329853892326e-05, + "rewards/symbolic_reward_accuracy/mean": 0.880859375, + "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, + "rewards/symbolic_reward_partial_score/mean": 0.93994140625, + "rewards/symbolic_reward_partial_score/std": 0.17745040357112885, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.071824550628662, + "sampling/importance_sampling_ratio/min": 0.0028972846921533346, + "sampling/sampling_logp_difference/max": 5.8439812660217285, + "sampling/sampling_logp_difference/mean": 0.14137542247772217, + "step": 1153 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.28350427746772766, + "epoch": 3.036842105263158, + "grad_norm": 0.0032226620241999626, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1154 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.2870677411556244, + "epoch": 3.039473684210526, + "grad_norm": 0.00388253852725029, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1155 + }, + { + 
"clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2872246205806732, + "epoch": 3.042105263157895, + "grad_norm": 0.0034006177447736263, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 314.228515625, + "completions/mean_terminated_length": 314.228515625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.28370508551597595, + "epoch": 3.044736842105263, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.00431436114013195, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 247245201.0, + "reward": 0.7706543207168579, + "reward_std": 0.07343325018882751, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.8203125, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.92822265625, + "rewards/symbolic_reward_partial_score/std": 0.16191674768924713, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0711597204208374, + "sampling/importance_sampling_ratio/min": 7.088058919180185e-05, + "sampling/sampling_logp_difference/max": 9.554513931274414, + "sampling/sampling_logp_difference/mean": 0.1412719488143921, + "step": 1157 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 
0.2819001376628876, + "epoch": 3.0473684210526315, + "grad_norm": 0.005949938669800758, + "learning_rate": 1e-06, + "loss": 0.0016, + "step": 1158 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.2845749258995056, + "epoch": 3.05, + "grad_norm": 0.009011217392981052, + "learning_rate": 1e-06, + "loss": 0.0034, + "step": 1159 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.28015366196632385, + "epoch": 3.0526315789473686, + "grad_norm": 0.0038384415674954653, + "learning_rate": 1e-06, + "loss": -0.0022, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 315.591796875, + "completions/mean_terminated_length": 315.591796875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.27949464321136475, + "epoch": 3.055263157894737, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.004537342581897974, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 247801568.0, + "reward": 0.7544918060302734, + "reward_std": 0.05182567238807678, + "rewards/progression_diversity/mean": -4.503194213612005e-05, + "rewards/progression_diversity/std": 0.0010189565364271402, + "rewards/symbolic_reward_accuracy/mean": 0.796875, + "rewards/symbolic_reward_accuracy/std": 0.4027182459831238, + "rewards/symbolic_reward_partial_score/mean": 0.9212239980697632, + "rewards/symbolic_reward_partial_score/std": 0.1719035506248474, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, 
+ "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0718927383422852, + "sampling/importance_sampling_ratio/min": 1.6995967598631978e-05, + "sampling/sampling_logp_difference/max": 10.982534408569336, + "sampling/sampling_logp_difference/mean": 0.14235100150108337, + "step": 1161 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.2826980799436569, + "epoch": 3.057894736842105, + "grad_norm": 0.00524579593911767, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.2870752066373825, + "epoch": 3.0605263157894735, + "grad_norm": 0.005643745418637991, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 1163 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.2850530445575714, + "epoch": 3.0631578947368423, + "grad_norm": 0.007966126315295696, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 712.0, + "completions/max_terminated_length": 712.0, + "completions/mean_length": 321.208984375, + "completions/mean_terminated_length": 321.208984375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.2806635946035385, + "epoch": 3.0657894736842106, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.004814115818589926, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 
248375691.0, + "reward": 0.7567381858825684, + "reward_std": 0.047071587294340134, + "rewards/progression_diversity/mean": -1.0009222933149431e-05, + "rewards/progression_diversity/std": 0.00022648285084869713, + "rewards/symbolic_reward_accuracy/mean": 0.80078125, + "rewards/symbolic_reward_accuracy/std": 0.39980348944664, + "rewards/symbolic_reward_partial_score/mean": 0.9208984375, + "rewards/symbolic_reward_partial_score/std": 0.16742649674415588, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.071966528892517, + "sampling/importance_sampling_ratio/min": 0.00016959384083747864, + "sampling/sampling_logp_difference/max": 8.682104110717773, + "sampling/sampling_logp_difference/mean": 0.1426159143447876, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.28873564302921295, + "epoch": 3.068421052631579, + "grad_norm": 0.0025189740117639303, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.2868616282939911, + "epoch": 3.0710526315789473, + "grad_norm": 0.0013349374057725072, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.28100864589214325, + "epoch": 3.0736842105263156, + "grad_norm": 0.002027245005592704, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 
0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 315.712890625, + "completions/mean_terminated_length": 315.712890625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.285733699798584, + "epoch": 3.0763157894736843, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.0012603303184732795, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 248913112.0, + "reward": 0.800585150718689, + "reward_std": 0.02160678058862686, + "rewards/progression_diversity/mean": -8.479159441776574e-05, + "rewards/progression_diversity/std": 0.0017334631411358714, + "rewards/symbolic_reward_accuracy/mean": 0.865234375, + "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, + "rewards/symbolic_reward_partial_score/mean": 0.9381510615348816, + "rewards/symbolic_reward_partial_score/std": 0.16102522611618042, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.071446418762207, + "sampling/importance_sampling_ratio/min": 1.3583598246214024e-08, + "sampling/sampling_logp_difference/max": 18.114402770996094, + "sampling/sampling_logp_difference/mean": 0.14119446277618408, + "step": 1169 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.28660933673381805, + "epoch": 3.0789473684210527, + "grad_norm": 0.0005353147862479091, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2914620041847229, + "epoch": 3.081578947368421, + "grad_norm": 
0.0039962842129170895, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2793559432029724, + "epoch": 3.0842105263157893, + "grad_norm": 0.004289968870580196, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 313.99609375, + "completions/mean_terminated_length": 313.99609375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.2798050791025162, + "epoch": 3.086842105263158, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.004182688891887665, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 249467158.0, + "reward": 0.8178223371505737, + "reward_std": 0.051801957190036774, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.88671875, + "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, + "rewards/symbolic_reward_partial_score/mean": 0.95263671875, + "rewards/symbolic_reward_partial_score/std": 0.13850493729114532, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0719172954559326, + "sampling/importance_sampling_ratio/min": 4.298911881051026e-05, + "sampling/sampling_logp_difference/max": 10.054563522338867, + "sampling/sampling_logp_difference/mean": 0.1404031664133072, + "step": 1173 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 
0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.28199082612991333, + "epoch": 3.0894736842105264, + "grad_norm": 0.0009707204881124198, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.28277716040611267, + "epoch": 3.0921052631578947, + "grad_norm": 0.006554114166647196, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1175 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.27743278443813324, + "epoch": 3.094736842105263, + "grad_norm": 0.001463320222683251, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 317.107421875, + "completions/mean_terminated_length": 317.107421875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.27970318496227264, + "epoch": 3.0973684210526318, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.0029366682283580303, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 250034989.0, + "reward": 0.7665039300918579, + "reward_std": 0.03309086337685585, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.80859375, + "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, + "rewards/symbolic_reward_partial_score/mean": 0.9378255009651184, + "rewards/symbolic_reward_partial_score/std": 
0.14253705739974976, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.071006178855896, + "sampling/importance_sampling_ratio/min": 0.00015880883438512683, + "sampling/sampling_logp_difference/max": 8.747809410095215, + "sampling/sampling_logp_difference/mean": 0.14218075573444366, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.28671663999557495, + "epoch": 3.1, + "grad_norm": 0.007308136206120253, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1178 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.28224754333496094, + "epoch": 3.1026315789473684, + "grad_norm": 0.0011736652813851833, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.28825029730796814, + "epoch": 3.1052631578947367, + "grad_norm": 0.0011161916190758348, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 806.0, + "completions/max_terminated_length": 806.0, + "completions/mean_length": 316.990234375, + "completions/mean_terminated_length": 316.990234375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.2790067791938782, + "epoch": 3.1078947368421055, + "frac_reward_zero_std": 0.6875, + "grad_norm": 
0.0015674019232392311, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 250602600.0, + "reward": 0.7606445550918579, + "reward_std": 0.05273447930812836, + "rewards/progression_diversity/mean": -2.454866489642882e-06, + "rewards/progression_diversity/std": 5.5547287047374994e-05, + "rewards/symbolic_reward_accuracy/mean": 0.806640625, + "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, + "rewards/symbolic_reward_partial_score/mean": 0.9222005605697632, + "rewards/symbolic_reward_partial_score/std": 0.17716993391513824, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0702251195907593, + "sampling/importance_sampling_ratio/min": 0.0002612802491057664, + "sampling/sampling_logp_difference/max": 8.249917030334473, + "sampling/sampling_logp_difference/mean": 0.13987314701080322, + "step": 1181 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.28073863685131073, + "epoch": 3.110526315789474, + "grad_norm": 0.001168043352663517, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1182 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.27858008444309235, + "epoch": 3.113157894736842, + "grad_norm": 0.0010491248685866594, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 1183 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.28045469522476196, + "epoch": 3.1157894736842104, + "grad_norm": 0.007387771271169186, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, 
+ "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 343.73828125, + "completions/mean_terminated_length": 312.34832763671875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.2847536504268646, + "epoch": 3.1184210526315788, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.001716139609925449, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 251185730.0, + "reward": 0.7362680435180664, + "reward_std": 0.028277598321437836, + "rewards/progression_diversity/mean": -0.0011275196447968483, + "rewards/progression_diversity/std": 0.02525327354669571, + "rewards/symbolic_reward_accuracy/mean": 0.767578125, + "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, + "rewards/symbolic_reward_partial_score/mean": 0.9191080927848816, + "rewards/symbolic_reward_partial_score/std": 0.16660796105861664, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0696632862091064, + "sampling/importance_sampling_ratio/min": 4.833166167372838e-05, + "sampling/sampling_logp_difference/max": 9.937423706054688, + "sampling/sampling_logp_difference/mean": 0.13752613961696625, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.27466967701911926, + "epoch": 3.1210526315789475, + "grad_norm": 0.003341392381116748, + "learning_rate": 1e-06, + "loss": 0.0292, + "step": 1186 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
0.09375, + "entropy": 0.2820413112640381, + "epoch": 3.123684210526316, + "grad_norm": 0.0033147488720715046, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1187 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.278613805770874, + "epoch": 3.126315789473684, + "grad_norm": 0.0009309493470937014, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 350.95703125, + "completions/mean_terminated_length": 319.5812072753906, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.2692241817712784, + "epoch": 3.1289473684210525, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.007036528550088406, + "learning_rate": 1e-06, + "loss": 0.0301, + "num_tokens": 251774220.0, + "reward": 0.7931602001190186, + "reward_std": 0.043564073741436005, + "rewards/progression_diversity/mean": -0.00038891550502739847, + "rewards/progression_diversity/std": 0.008800153620541096, + "rewards/symbolic_reward_accuracy/mean": 0.853515625, + "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, + "rewards/symbolic_reward_partial_score/mean": 0.9368489980697632, + "rewards/symbolic_reward_partial_score/std": 0.165929913520813, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0689858198165894, + "sampling/importance_sampling_ratio/min": 0.0024728423450142145, + "sampling/sampling_logp_difference/max": 6.002387046813965, + "sampling/sampling_logp_difference/mean": 
0.13437007367610931, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.28147004544734955, + "epoch": 3.1315789473684212, + "grad_norm": 0.004228492267429829, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.27740536630153656, + "epoch": 3.1342105263157896, + "grad_norm": 0.002545455237850547, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.275810107588768, + "epoch": 3.136842105263158, + "grad_norm": 0.0027159368619322777, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 311.162109375, + "completions/mean_terminated_length": 311.162109375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.277396023273468, + "epoch": 3.139473684210526, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.002241811016574502, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 252345503.0, + "reward": 0.7347656488418579, + "reward_std": 0.04471675306558609, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.767578125, + "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, + 
"rewards/symbolic_reward_partial_score/mean": 0.9140625, + "rewards/symbolic_reward_partial_score/std": 0.17476648092269897, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.069706916809082, + "sampling/importance_sampling_ratio/min": 0.0019460662733763456, + "sampling/sampling_logp_difference/max": 6.241945266723633, + "sampling/sampling_logp_difference/mean": 0.13861989974975586, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.27622032165527344, + "epoch": 3.1421052631578945, + "grad_norm": 0.008059216663241386, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.2781420797109604, + "epoch": 3.1447368421052633, + "grad_norm": 0.003802333725616336, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.27944231033325195, + "epoch": 3.1473684210526316, + "grad_norm": 0.0019333260133862495, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 315.603515625, + "completions/mean_terminated_length": 315.603515625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 
187.0, + "entropy": 0.2817239761352539, + "epoch": 3.15, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.0066985213197767735, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 252910900.0, + "reward": 0.7115720510482788, + "reward_std": 0.038549572229385376, + "rewards/progression_diversity/mean": -1.895956665975973e-05, + "rewards/progression_diversity/std": 0.0004290060023777187, + "rewards/symbolic_reward_accuracy/mean": 0.73828125, + "rewards/symbolic_reward_accuracy/std": 0.44000017642974854, + "rewards/symbolic_reward_partial_score/mean": 0.8953450918197632, + "rewards/symbolic_reward_partial_score/std": 0.18525949120521545, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0701593160629272, + "sampling/importance_sampling_ratio/min": 0.0006146501400507987, + "sampling/sampling_logp_difference/max": 7.3944573402404785, + "sampling/sampling_logp_difference/mean": 0.14031648635864258, + "step": 1197 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.28350114822387695, + "epoch": 3.1526315789473682, + "grad_norm": 0.0015183095820248127, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.27260275185108185, + "epoch": 3.155263157894737, + "grad_norm": 0.003470479976385832, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.2772819846868515, + "epoch": 3.1578947368421053, + "grad_norm": 
0.007544191554188728, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 311.41796875, + "completions/mean_terminated_length": 311.41796875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.2722645252943039, + "epoch": 3.1605263157894736, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0009831043425947428, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 253476138.0, + "reward": 0.696484386920929, + "reward_std": 0.02265625074505806, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.71484375, + "rewards/symbolic_reward_accuracy/std": 0.45193037390708923, + "rewards/symbolic_reward_partial_score/mean": 0.8919271230697632, + "rewards/symbolic_reward_partial_score/std": 0.19320917129516602, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.069796085357666, + "sampling/importance_sampling_ratio/min": 0.0006912931567057967, + "sampling/sampling_logp_difference/max": 7.276946544647217, + "sampling/sampling_logp_difference/mean": 0.1394611895084381, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.280326783657074, + "epoch": 3.163157894736842, + "grad_norm": 0.0007917368202470243, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 
0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.27794717252254486, + "epoch": 3.1657894736842107, + "grad_norm": 0.0003407469193916768, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1203 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.27963490784168243, + "epoch": 3.168421052631579, + "grad_norm": 0.0004433590220287442, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 346.998046875, + "completions/mean_terminated_length": 315.6144714355469, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.2839759588241577, + "epoch": 3.1710526315789473, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.003669463098049164, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 254036937.0, + "reward": 0.8009728789329529, + "reward_std": 0.04420147091150284, + "rewards/progression_diversity/mean": -0.00037003474426455796, + "rewards/progression_diversity/std": 0.007489512208849192, + "rewards/symbolic_reward_accuracy/mean": 0.869140625, + "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, + "rewards/symbolic_reward_partial_score/mean": 0.9322916865348816, + "rewards/symbolic_reward_partial_score/std": 0.17563912272453308, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0679644346237183, + "sampling/importance_sampling_ratio/min": 6.135964940767735e-05, + 
"sampling/sampling_logp_difference/max": 9.698758125305176, + "sampling/sampling_logp_difference/mean": 0.1385321319103241, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.28549641370773315, + "epoch": 3.1736842105263157, + "grad_norm": 0.006869847420603037, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1206 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.2799648195505142, + "epoch": 3.1763157894736844, + "grad_norm": 0.002433902584016323, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1207 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.27347370982170105, + "epoch": 3.1789473684210527, + "grad_norm": 0.0075611830689013, + "learning_rate": 1e-06, + "loss": 0.006, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 951.0, + "completions/max_terminated_length": 951.0, + "completions/mean_length": 309.787109375, + "completions/mean_terminated_length": 309.787109375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.27822238206863403, + "epoch": 3.181578947368421, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.009059883654117584, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 254579324.0, + "reward": 0.728661298751831, + "reward_std": 0.05693569406867027, + "rewards/progression_diversity/mean": -8.019153028726578e-05, + "rewards/progression_diversity/std": 
0.0014274234417825937, + "rewards/symbolic_reward_accuracy/mean": 0.7578125, + "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, + "rewards/symbolic_reward_partial_score/mean": 0.9132487177848816, + "rewards/symbolic_reward_partial_score/std": 0.1662641316652298, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.070951223373413, + "sampling/importance_sampling_ratio/min": 0.00033222284400835633, + "sampling/sampling_logp_difference/max": 8.00970458984375, + "sampling/sampling_logp_difference/mean": 0.14075878262519836, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.28292274475097656, + "epoch": 3.1842105263157894, + "grad_norm": 0.0033197884913533926, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1210 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.2789665162563324, + "epoch": 3.1868421052631577, + "grad_norm": 0.0019438323797658086, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.28156426548957825, + "epoch": 3.1894736842105265, + "grad_norm": 0.006589422933757305, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + 
"completions/mean_length": 310.138671875, + "completions/mean_terminated_length": 310.138671875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.27788078784942627, + "epoch": 3.192105263157895, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0016114837490022182, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 255143587.0, + "reward": 0.7333984971046448, + "reward_std": 0.02498510479927063, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.76953125, + "rewards/symbolic_reward_accuracy/std": 0.42154473066329956, + "rewards/symbolic_reward_partial_score/mean": 0.9055989384651184, + "rewards/symbolic_reward_partial_score/std": 0.18524064123630524, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0696802139282227, + "sampling/importance_sampling_ratio/min": 7.757533353469626e-07, + "sampling/sampling_logp_difference/max": 14.06943130493164, + "sampling/sampling_logp_difference/mean": 0.1414531171321869, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.2840680330991745, + "epoch": 3.194736842105263, + "grad_norm": 0.0007972043240442872, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.27601639926433563, + "epoch": 3.1973684210526314, + "grad_norm": 0.00040065511711873114, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1215 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.2749394178390503, + "epoch": 3.2, + "grad_norm": 0.005288100801408291, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 310.4296875, + "completions/mean_terminated_length": 310.4296875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.2765820473432541, + "epoch": 3.2026315789473685, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.002937893383204937, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 255726687.0, + "reward": 0.7673335075378418, + "reward_std": 0.03701866418123245, + "rewards/progression_diversity/mean": -5.3788204240845516e-05, + "rewards/progression_diversity/std": 0.00121708819642663, + "rewards/symbolic_reward_accuracy/mean": 0.810546875, + "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, + "rewards/symbolic_reward_partial_score/mean": 0.9366862177848816, + "rewards/symbolic_reward_partial_score/std": 0.1429397314786911, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0701409578323364, + "sampling/importance_sampling_ratio/min": 1.0667208698578179e-06, + "sampling/sampling_logp_difference/max": 13.750921249389648, + "sampling/sampling_logp_difference/mean": 0.13888150453567505, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.2759207487106323, + "epoch": 3.205263157894737, + "grad_norm": 
0.0068762172013521194, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.27509351074695587, + "epoch": 3.207894736842105, + "grad_norm": 0.003071358660236001, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.27888280153274536, + "epoch": 3.2105263157894735, + "grad_norm": 0.0011286369990557432, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 308.841796875, + "completions/mean_terminated_length": 308.841796875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.2750840187072754, + "epoch": 3.213157894736842, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.004304279573261738, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 256278798.0, + "reward": 0.7308099269866943, + "reward_std": 0.051743410527706146, + "rewards/progression_diversity/mean": -6.387459143297747e-05, + "rewards/progression_diversity/std": 0.0013390847016125917, + "rewards/symbolic_reward_accuracy/mean": 0.755859375, + "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, + "rewards/symbolic_reward_partial_score/mean": 0.92431640625, + "rewards/symbolic_reward_partial_score/std": 0.14684316515922546, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 
2.0, + "sampling/importance_sampling_ratio/mean": 1.0694196224212646, + "sampling/importance_sampling_ratio/min": 0.0019063284853473306, + "sampling/sampling_logp_difference/max": 6.262576103210449, + "sampling/sampling_logp_difference/mean": 0.14004170894622803, + "step": 1221 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.27862994372844696, + "epoch": 3.2157894736842105, + "grad_norm": 0.002798795234411955, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1222 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2836599797010422, + "epoch": 3.218421052631579, + "grad_norm": 0.0013353745453059673, + "learning_rate": 1e-06, + "loss": -0.0013, + "step": 1223 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.28007832169532776, + "epoch": 3.221052631578947, + "grad_norm": 0.00800714548677206, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1284.0, + "completions/max_terminated_length": 1284.0, + "completions/mean_length": 305.09765625, + "completions/mean_terminated_length": 305.09765625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.2768820822238922, + "epoch": 3.223684210526316, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.007991495542228222, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 256848000.0, + "reward": 0.8162106275558472, + 
"reward_std": 0.038142770528793335, + "rewards/progression_diversity/mean": -3.331673360662535e-05, + "rewards/progression_diversity/std": 0.0007538715726695955, + "rewards/symbolic_reward_accuracy/mean": 0.8828125, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.955078125, + "rewards/symbolic_reward_partial_score/std": 0.1319531947374344, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0698317289352417, + "sampling/importance_sampling_ratio/min": 4.832327613257803e-05, + "sampling/sampling_logp_difference/max": 9.937597274780273, + "sampling/sampling_logp_difference/mean": 0.13788989186286926, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.274690717458725, + "epoch": 3.2263157894736842, + "grad_norm": 0.006914569530636072, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.27500303089618683, + "epoch": 3.2289473684210526, + "grad_norm": 0.0019397508585825562, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.277148962020874, + "epoch": 3.231578947368421, + "grad_norm": 0.002220799447968602, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 297.98828125, + "completions/mean_terminated_length": 297.98828125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.2688996493816376, + "epoch": 3.2342105263157896, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.005563766695559025, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 257413402.0, + "reward": 0.7893544435501099, + "reward_std": 0.037740692496299744, + "rewards/progression_diversity/mean": -0.00010110770381288603, + "rewards/progression_diversity/std": 0.0022878062445670366, + "rewards/symbolic_reward_accuracy/mean": 0.841796875, + "rewards/symbolic_reward_accuracy/std": 0.36528825759887695, + "rewards/symbolic_reward_partial_score/mean": 0.9475911855697632, + "rewards/symbolic_reward_partial_score/std": 0.1305176019668579, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.069476842880249, + "sampling/importance_sampling_ratio/min": 0.00020935774955432862, + "sampling/sampling_logp_difference/max": 8.471466064453125, + "sampling/sampling_logp_difference/mean": 0.1374514102935791, + "step": 1229 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.2761152386665344, + "epoch": 3.236842105263158, + "grad_norm": 0.004724735394120216, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.27613112330436707, + "epoch": 3.2394736842105263, + "grad_norm": 0.0010733611416071653, + "learning_rate": 
1e-06, + "loss": 0.0009, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.276006281375885, + "epoch": 3.2421052631578946, + "grad_norm": 0.004941441584378481, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 300.60546875, + "completions/mean_terminated_length": 300.60546875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.2708422541618347, + "epoch": 3.2447368421052634, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.0021209348924458027, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 257974288.0, + "reward": 0.8088867664337158, + "reward_std": 0.04001738876104355, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.86328125, + "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, + "rewards/symbolic_reward_partial_score/mean": 0.9697265625, + "rewards/symbolic_reward_partial_score/std": 0.07978670299053192, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0682693719863892, + "sampling/importance_sampling_ratio/min": 0.0019491848070174456, + "sampling/sampling_logp_difference/max": 6.240344047546387, + "sampling/sampling_logp_difference/mean": 0.13673865795135498, + "step": 1233 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.1328125, + "entropy": 0.27098171412944794, + "epoch": 3.2473684210526317, + "grad_norm": 0.005656884983181953, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1234 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.27767522633075714, + "epoch": 3.25, + "grad_norm": 0.0009761822293512523, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 1235 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.26963022351264954, + "epoch": 3.2526315789473683, + "grad_norm": 0.0069908928126096725, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 300.423828125, + "completions/mean_terminated_length": 300.423828125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.2717629671096802, + "epoch": 3.2552631578947366, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.006111988332122564, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 258553929.0, + "reward": 0.7229492664337158, + "reward_std": 0.02226562611758709, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.75, + "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, + "rewards/symbolic_reward_partial_score/mean": 0.9098306894302368, + "rewards/symbolic_reward_partial_score/std": 0.17206500470638275, + "rewards/tag_count_reward/mean": 0.0, + 
"rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0691936016082764, + "sampling/importance_sampling_ratio/min": 0.00034812605008482933, + "sampling/sampling_logp_difference/max": 7.962945938110352, + "sampling/sampling_logp_difference/mean": 0.13795682787895203, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.26943157613277435, + "epoch": 3.2578947368421054, + "grad_norm": 0.0070730592124164104, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2729327529668808, + "epoch": 3.2605263157894737, + "grad_norm": 0.000548347074072808, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.27310749888420105, + "epoch": 3.263157894736842, + "grad_norm": 0.0007199924439191818, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 299.822265625, + "completions/mean_terminated_length": 299.822265625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.2718695104122162, + "epoch": 3.2657894736842104, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.007086692377924919, + "learning_rate": 1e-06, 
+ "loss": 0.0001, + "num_tokens": 259099214.0, + "reward": 0.7880857586860657, + "reward_std": 0.02265714481472969, + "rewards/progression_diversity/mean": -2.2376687411451712e-05, + "rewards/progression_diversity/std": 0.0005063266144134104, + "rewards/symbolic_reward_accuracy/mean": 0.8359375, + "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, + "rewards/symbolic_reward_partial_score/mean": 0.955078125, + "rewards/symbolic_reward_partial_score/std": 0.1139356791973114, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0676381587982178, + "sampling/importance_sampling_ratio/min": 0.0004350260423962027, + "sampling/sampling_logp_difference/max": 7.740104675292969, + "sampling/sampling_logp_difference/mean": 0.13587021827697754, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.2723766714334488, + "epoch": 3.268421052631579, + "grad_norm": 0.000981443445198238, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1242 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.2734517604112625, + "epoch": 3.2710526315789474, + "grad_norm": 0.0007923005032353103, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.2730279564857483, + "epoch": 3.2736842105263158, + "grad_norm": 0.01129809208214283, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, 
+ "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 296.421875, + "completions/mean_terminated_length": 296.421875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.2747436612844467, + "epoch": 3.276315789473684, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.011831044219434261, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 259648294.0, + "reward": 0.6915037631988525, + "reward_std": 0.07890917360782623, + "rewards/progression_diversity/mean": -1.475131830375176e-05, + "rewards/progression_diversity/std": 0.00033378423540852964, + "rewards/symbolic_reward_accuracy/mean": 0.7109375, + "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, + "rewards/symbolic_reward_partial_score/mean": 0.8831380605697632, + "rewards/symbolic_reward_partial_score/std": 0.19753031432628632, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0684700012207031, + "sampling/importance_sampling_ratio/min": 6.384307926055044e-05, + "sampling/sampling_logp_difference/max": 9.659082412719727, + "sampling/sampling_logp_difference/mean": 0.13689962029457092, + "step": 1245 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.27058111131191254, + "epoch": 3.2789473684210524, + "grad_norm": 0.0020035523921251297, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1246 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.2713269144296646, + "epoch": 3.281578947368421, + 
"grad_norm": 0.004604648798704147, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1247 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2719510495662689, + "epoch": 3.2842105263157895, + "grad_norm": 0.0033603121992200613, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 297.6171875, + "completions/mean_terminated_length": 297.6171875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.26710009574890137, + "epoch": 3.286842105263158, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.000993250752799213, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 260195138.0, + "reward": 0.7834961414337158, + "reward_std": 0.012673637829720974, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.837890625, + "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, + "rewards/symbolic_reward_partial_score/mean": 0.9358723759651184, + "rewards/symbolic_reward_partial_score/std": 0.16824129223823547, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0676732063293457, + "sampling/importance_sampling_ratio/min": 4.49665293444923e-07, + "sampling/sampling_logp_difference/max": 14.614762306213379, + "sampling/sampling_logp_difference/mean": 0.13557732105255127, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + 
"clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2676374465227127, + "epoch": 3.2894736842105265, + "grad_norm": 0.0007264050655066967, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.270687460899353, + "epoch": 3.292105263157895, + "grad_norm": 0.0006257990025915205, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1251 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.26940611004829407, + "epoch": 3.294736842105263, + "grad_norm": 0.0004618112579919398, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 300.1953125, + "completions/mean_terminated_length": 300.1953125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.2769618481397629, + "epoch": 3.2973684210526315, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0007221846026368439, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 260752646.0, + "reward": 0.7583496570587158, + "reward_std": 0.01035156287252903, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.80859375, + "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, + "rewards/symbolic_reward_partial_score/mean": 0.91064453125, + "rewards/symbolic_reward_partial_score/std": 
0.2116525024175644, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0673704147338867, + "sampling/importance_sampling_ratio/min": 0.0001698398555163294, + "sampling/sampling_logp_difference/max": 8.680654525756836, + "sampling/sampling_logp_difference/mean": 0.13527140021324158, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2640591859817505, + "epoch": 3.3, + "grad_norm": 0.0006942602340131998, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2654099464416504, + "epoch": 3.3026315789473686, + "grad_norm": 0.0003709284064825624, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2712586522102356, + "epoch": 3.305263157894737, + "grad_norm": 0.006947053596377373, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 301.51953125, + "completions/mean_terminated_length": 301.51953125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.26971058547496796, + "epoch": 3.307894736842105, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0009350181207992136, + 
"learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 261279792.0, + "reward": 0.782421886920929, + "reward_std": 0.0234375037252903, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.8359375, + "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, + "rewards/symbolic_reward_partial_score/mean": 0.9361978769302368, + "rewards/symbolic_reward_partial_score/std": 0.1689293086528778, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0669386386871338, + "sampling/importance_sampling_ratio/min": 5.623637093776779e-07, + "sampling/sampling_logp_difference/max": 14.391117095947266, + "sampling/sampling_logp_difference/mean": 0.13500767946243286, + "step": 1257 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.2693052887916565, + "epoch": 3.3105263157894735, + "grad_norm": 0.0008573352824896574, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.2674746811389923, + "epoch": 3.3131578947368423, + "grad_norm": 0.008638602681457996, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2724864184856415, + "epoch": 3.3157894736842106, + "grad_norm": 0.0008081798441708088, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, 
+ "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 852.0, + "completions/mean_length": 331.048828125, + "completions/mean_terminated_length": 299.6340637207031, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.26643824577331543, + "epoch": 3.318421052631579, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.003980845678597689, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 261877801.0, + "reward": 0.764533281326294, + "reward_std": 0.02937212586402893, + "rewards/progression_diversity/mean": -0.001752678770571947, + "rewards/progression_diversity/std": 0.03948704153299332, + "rewards/symbolic_reward_accuracy/mean": 0.810546875, + "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, + "rewards/symbolic_reward_partial_score/mean": 0.9280599355697632, + "rewards/symbolic_reward_partial_score/std": 0.15806357562541962, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0653882026672363, + "sampling/importance_sampling_ratio/min": 0.0031156677287071943, + "sampling/sampling_logp_difference/max": 5.7713117599487305, + "sampling/sampling_logp_difference/mean": 0.12959060072898865, + "step": 1261 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.2636767327785492, + "epoch": 3.3210526315789473, + "grad_norm": 0.0007575162453576922, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 
0.26859790086746216, + "epoch": 3.3236842105263156, + "grad_norm": 0.0006437603733502328, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.26694221794605255, + "epoch": 3.3263157894736843, + "grad_norm": 0.004774495959281921, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 299.962890625, + "completions/mean_terminated_length": 299.962890625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.26413773000240326, + "epoch": 3.3289473684210527, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.009386946447193623, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 262445878.0, + "reward": 0.710888683795929, + "reward_std": 0.07148764282464981, + "rewards/progression_diversity/mean": -2.774343556666281e-06, + "rewards/progression_diversity/std": 6.277623469941318e-05, + "rewards/symbolic_reward_accuracy/mean": 0.734375, + "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, + "rewards/symbolic_reward_partial_score/mean": 0.90087890625, + "rewards/symbolic_reward_partial_score/std": 0.18419794738292694, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.067664623260498, + "sampling/importance_sampling_ratio/min": 0.00022712373174726963, + "sampling/sampling_logp_difference/max": 8.390015602111816, + "sampling/sampling_logp_difference/mean": 0.13395264744758606, + "step": 
1265 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.2680993378162384, + "epoch": 3.331578947368421, + "grad_norm": 0.004663961473852396, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.2699938416481018, + "epoch": 3.3342105263157893, + "grad_norm": 0.006105378735810518, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1267 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.26628097891807556, + "epoch": 3.336842105263158, + "grad_norm": 0.001197861391119659, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 388.677734375, + "completions/mean_terminated_length": 294.40277099609375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.2528943419456482, + "epoch": 3.3394736842105264, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.0033977432176470757, + "learning_rate": 1e-06, + "loss": 0.0351, + "num_tokens": 263023825.0, + "reward": 0.7753678560256958, + "reward_std": 0.027289744466543198, + "rewards/progression_diversity/mean": -0.0022837440483272076, + "rewards/progression_diversity/std": 0.029835714027285576, + "rewards/symbolic_reward_accuracy/mean": 0.8203125, + "rewards/symbolic_reward_accuracy/std": 
0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.9453125, + "rewards/symbolic_reward_partial_score/std": 0.13062210381031036, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0619099140167236, + "sampling/importance_sampling_ratio/min": 1.0306473086529877e-05, + "sampling/sampling_logp_difference/max": 11.482738494873047, + "sampling/sampling_logp_difference/mean": 0.12313318997621536, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.2718654125928879, + "epoch": 3.3421052631578947, + "grad_norm": 0.0010849793907254934, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2673253118991852, + "epoch": 3.344736842105263, + "grad_norm": 0.0015741854440420866, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.26140786707401276, + "epoch": 3.3473684210526318, + "grad_norm": 0.006531618069857359, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 291.61328125, + "completions/mean_terminated_length": 291.61328125, + 
"completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.26315978169441223, + "epoch": 3.35, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.008593485690653324, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 263592299.0, + "reward": 0.7869629263877869, + "reward_std": 0.03779665380716324, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.84375, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.9357096552848816, + "rewards/symbolic_reward_partial_score/std": 0.1584857553243637, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0645825862884521, + "sampling/importance_sampling_ratio/min": 8.691700713825412e-06, + "sampling/sampling_logp_difference/max": 11.653141975402832, + "sampling/sampling_logp_difference/mean": 0.1318080574274063, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.2618803530931473, + "epoch": 3.3526315789473684, + "grad_norm": 0.0023191396612674, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.25944313406944275, + "epoch": 3.3552631578947367, + "grad_norm": 0.0014273182023316622, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.26387016475200653, + "epoch": 
3.3578947368421055, + "grad_norm": 0.001026469049975276, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 285.99609375, + "completions/mean_terminated_length": 285.99609375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.26820504665374756, + "epoch": 3.360526315789474, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0026411775033921003, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 264145865.0, + "reward": 0.8200682401657104, + "reward_std": 0.048733148723840714, + "rewards/progression_diversity/mean": -1.5301797247957438e-05, + "rewards/progression_diversity/std": 0.000346240121871233, + "rewards/symbolic_reward_accuracy/mean": 0.88671875, + "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, + "rewards/symbolic_reward_partial_score/mean": 0.9601237177848816, + "rewards/symbolic_reward_partial_score/std": 0.11702742427587509, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0657628774642944, + "sampling/importance_sampling_ratio/min": 1.4597096651414176e-06, + "sampling/sampling_logp_difference/max": 13.437273025512695, + "sampling/sampling_logp_difference/mean": 0.13426843285560608, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.26382070779800415, + "epoch": 3.363157894736842, + "grad_norm": 0.0014408208662644029, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1278 + }, + { + 
"clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.2653789073228836, + "epoch": 3.3657894736842104, + "grad_norm": 0.003974873572587967, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1279 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.26832902431488037, + "epoch": 3.3684210526315788, + "grad_norm": 0.0066981143318116665, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 289.771484375, + "completions/mean_terminated_length": 289.771484375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.26377448439598083, + "epoch": 3.3710526315789475, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.0039009610190987587, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 264702388.0, + "reward": 0.7869141101837158, + "reward_std": 0.05072927474975586, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.8359375, + "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, + "rewards/symbolic_reward_partial_score/mean": 0.951171875, + "rewards/symbolic_reward_partial_score/std": 0.12138548493385315, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.066290259361267, + "sampling/importance_sampling_ratio/min": 
4.976352101948578e-06, + "sampling/sampling_logp_difference/max": 12.210813522338867, + "sampling/sampling_logp_difference/mean": 0.13322612643241882, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.26462262868881226, + "epoch": 3.373684210526316, + "grad_norm": 0.003261660458520055, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.2681008577346802, + "epoch": 3.376315789473684, + "grad_norm": 0.0013276836834847927, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.26759590208530426, + "epoch": 3.3789473684210525, + "grad_norm": 0.0027356266509741545, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 293.91796875, + "completions/mean_terminated_length": 293.91796875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.26821961998939514, + "epoch": 3.3815789473684212, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0032564904540777206, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 265264362.0, + "reward": 0.8073240518569946, + "reward_std": 0.05884532630443573, + "rewards/progression_diversity/mean": -1.5981662727426738e-05, + 
"rewards/progression_diversity/std": 0.0003616237663663924, + "rewards/symbolic_reward_accuracy/mean": 0.875, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.9410806894302368, + "rewards/symbolic_reward_partial_score/std": 0.17385253310203552, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0658769607543945, + "sampling/importance_sampling_ratio/min": 5.955379037914099e-06, + "sampling/sampling_logp_difference/max": 12.03121566772461, + "sampling/sampling_logp_difference/mean": 0.13309326767921448, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.2667364180088043, + "epoch": 3.3842105263157896, + "grad_norm": 0.006176314316689968, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1286 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.2665433883666992, + "epoch": 3.386842105263158, + "grad_norm": 0.0025707941967993975, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1287 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2672065645456314, + "epoch": 3.389473684210526, + "grad_norm": 0.0027666857931762934, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 
524.0, + "completions/mean_length": 291.099609375, + "completions/mean_terminated_length": 291.099609375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.26022814214229584, + "epoch": 3.3921052631578945, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.0038764202035963535, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 265834237.0, + "reward": 0.7874018549919128, + "reward_std": 0.04602134972810745, + "rewards/progression_diversity/mean": -5.456768849398941e-05, + "rewards/progression_diversity/std": 0.0012347258161753416, + "rewards/symbolic_reward_accuracy/mean": 0.841796875, + "rewards/symbolic_reward_accuracy/std": 0.36528825759887695, + "rewards/symbolic_reward_partial_score/mean": 0.9410807490348816, + "rewards/symbolic_reward_partial_score/std": 0.14504456520080566, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.065362572669983, + "sampling/importance_sampling_ratio/min": 9.59261305979453e-05, + "sampling/sampling_logp_difference/max": 9.251932144165039, + "sampling/sampling_logp_difference/mean": 0.13144491612911224, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2617509812116623, + "epoch": 3.3947368421052633, + "grad_norm": 0.0021487956400960684, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.261522576212883, + "epoch": 3.3973684210526316, + "grad_norm": 0.006694071926176548, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, 
+ "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.26264551281929016, + "epoch": 3.4, + "grad_norm": 0.001856036949902773, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 288.107421875, + "completions/mean_terminated_length": 288.107421875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.2683214247226715, + "epoch": 3.4026315789473682, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.0031836843118071556, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 266343668.0, + "reward": 0.8083491325378418, + "reward_std": 0.03162518888711929, + "rewards/progression_diversity/mean": -4.605756112141535e-05, + "rewards/progression_diversity/std": 0.0007501619402319193, + "rewards/symbolic_reward_accuracy/mean": 0.86328125, + "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, + "rewards/symbolic_reward_partial_score/mean": 0.9679361581802368, + "rewards/symbolic_reward_partial_score/std": 0.08607691526412964, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0654386281967163, + "sampling/importance_sampling_ratio/min": 0.007917046546936035, + "sampling/sampling_logp_difference/max": 4.8387370109558105, + "sampling/sampling_logp_difference/mean": 0.13370326161384583, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.27221499383449554, + "epoch": 
3.405263157894737, + "grad_norm": 0.002062569372355938, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.2661845535039902, + "epoch": 3.4078947368421053, + "grad_norm": 0.0034429843071848154, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2656361758708954, + "epoch": 3.4105263157894736, + "grad_norm": 0.0014259653398767114, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 316.431640625, + "completions/mean_terminated_length": 284.9882507324219, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.26236018538475037, + "epoch": 3.413157894736842, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0020829113200306892, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 266900465.0, + "reward": 0.7546237707138062, + "reward_std": 0.02533288300037384, + "rewards/progression_diversity/mean": -0.001499977894127369, + "rewards/progression_diversity/std": 0.0339406281709671, + "rewards/symbolic_reward_accuracy/mean": 0.7890625, + "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, + "rewards/symbolic_reward_partial_score/mean": 0.9373372793197632, + "rewards/symbolic_reward_partial_score/std": 0.1312447488307953, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0632517337799072, + "sampling/importance_sampling_ratio/min": 9.111730037147936e-07, + "sampling/sampling_logp_difference/max": 13.908533096313477, + "sampling/sampling_logp_difference/mean": 0.12851907312870026, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.26197686791419983, + "epoch": 3.4157894736842107, + "grad_norm": 0.001598592847585678, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1298 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.25969552993774414, + "epoch": 3.418421052631579, + "grad_norm": 0.005127459764480591, + "learning_rate": 1e-06, + "loss": 0.0276, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2652065008878708, + "epoch": 3.4210526315789473, + "grad_norm": 0.0016107510309666395, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 292.716796875, + "completions/mean_terminated_length": 292.716796875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.27297018468379974, + "epoch": 3.4236842105263157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.004471481777727604, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 267449312.0, + 
"reward": 0.73178631067276, + "reward_std": 0.05781938135623932, + "rewards/progression_diversity/mean": -8.318589971167967e-05, + "rewards/progression_diversity/std": 0.0018822819693014026, + "rewards/symbolic_reward_accuracy/mean": 0.759765625, + "rewards/symbolic_reward_accuracy/std": 0.4276435375213623, + "rewards/symbolic_reward_partial_score/mean": 0.9197591543197632, + "rewards/symbolic_reward_partial_score/std": 0.1563287228345871, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0674350261688232, + "sampling/importance_sampling_ratio/min": 0.0004979167133569717, + "sampling/sampling_logp_difference/max": 7.605077743530273, + "sampling/sampling_logp_difference/mean": 0.13303379714488983, + "step": 1301 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.2664797157049179, + "epoch": 3.4263157894736844, + "grad_norm": 0.0027420800179243088, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 1302 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.2613489329814911, + "epoch": 3.4289473684210527, + "grad_norm": 0.002885916270315647, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.2659630626440048, + "epoch": 3.431578947368421, + "grad_norm": 0.0036468280013650656, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 287.36328125, + "completions/mean_terminated_length": 287.36328125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.26053567230701447, + "epoch": 3.4342105263157894, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.005564328748732805, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 268009434.0, + "reward": 0.7170894742012024, + "reward_std": 0.02636650763452053, + "rewards/progression_diversity/mean": -4.2535000829957426e-05, + "rewards/progression_diversity/std": 0.0006822184659540653, + "rewards/symbolic_reward_accuracy/mean": 0.740234375, + "rewards/symbolic_reward_accuracy/std": 0.4389347732067108, + "rewards/symbolic_reward_partial_score/mean": 0.9098306894302368, + "rewards/symbolic_reward_partial_score/std": 0.16496829688549042, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0655784606933594, + "sampling/importance_sampling_ratio/min": 5.745126145484392e-06, + "sampling/sampling_logp_difference/max": 12.067158699035645, + "sampling/sampling_logp_difference/mean": 0.13085705041885376, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.26320306956768036, + "epoch": 3.4368421052631577, + "grad_norm": 0.004667403642088175, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 1306 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.261497437953949, + "epoch": 3.4394736842105265, + "grad_norm": 
0.0008139449637383223, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.25740575790405273, + "epoch": 3.442105263157895, + "grad_norm": 0.001273100497201085, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 286.341796875, + "completions/mean_terminated_length": 286.341796875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.26350516080856323, + "epoch": 3.444736842105263, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.008655845187604427, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 268549001.0, + "reward": 0.810546875, + "reward_std": 0.01718750223517418, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.869140625, + "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, + "rewards/symbolic_reward_partial_score/mean": 0.9635416269302368, + "rewards/symbolic_reward_partial_score/std": 0.1107163056731224, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0654571056365967, + "sampling/importance_sampling_ratio/min": 2.639793456182815e-05, + "sampling/sampling_logp_difference/max": 10.542224884033203, + "sampling/sampling_logp_difference/mean": 0.13366496562957764, + "step": 1309 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.26279450953006744, + "epoch": 3.4473684210526314, + "grad_norm": 0.00044046566472388804, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.26690690219402313, + "epoch": 3.45, + "grad_norm": 0.0007240689010359347, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2649247348308563, + "epoch": 3.4526315789473685, + "grad_norm": 0.0007236701203510165, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 289.13671875, + "completions/mean_terminated_length": 289.13671875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.26410022377967834, + "epoch": 3.455263157894737, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.004910608287900686, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 269110191.0, + "reward": 0.803564190864563, + "reward_std": 0.04589978978037834, + "rewards/progression_diversity/mean": -3.375462256371975e-05, + "rewards/progression_diversity/std": 0.0007637799135409296, + "rewards/symbolic_reward_accuracy/mean": 0.86328125, + "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, + "rewards/symbolic_reward_partial_score/mean": 0.9519856572151184, + "rewards/symbolic_reward_partial_score/std": 0.13935703039169312, 
+ "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0666524171829224, + "sampling/importance_sampling_ratio/min": 0.0037739586550742388, + "sampling/sampling_logp_difference/max": 5.5796308517456055, + "sampling/sampling_logp_difference/mean": 0.1313626617193222, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.2641823887825012, + "epoch": 3.457894736842105, + "grad_norm": 0.0013223750283941627, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1314 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.2643107920885086, + "epoch": 3.4605263157894735, + "grad_norm": 0.007209242787212133, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.2640030235052109, + "epoch": 3.463157894736842, + "grad_norm": 0.004676418378949165, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 284.884765625, + "completions/mean_terminated_length": 284.884765625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.2607775628566742, + "epoch": 3.4657894736842105, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.000587392773013562, + 
"learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 269647188.0, + "reward": 0.7455061674118042, + "reward_std": 0.005475577898323536, + "rewards/progression_diversity/mean": -0.0001706962939351797, + "rewards/progression_diversity/std": 0.002271100878715515, + "rewards/symbolic_reward_accuracy/mean": 0.779296875, + "rewards/symbolic_reward_accuracy/std": 0.4151262938976288, + "rewards/symbolic_reward_partial_score/mean": 0.9264323115348816, + "rewards/symbolic_reward_partial_score/std": 0.1475026160478592, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.065974473953247, + "sampling/importance_sampling_ratio/min": 0.00015449296915903687, + "sampling/sampling_logp_difference/max": 8.775362014770508, + "sampling/sampling_logp_difference/mean": 0.13090203702449799, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2620484530925751, + "epoch": 3.468421052631579, + "grad_norm": 0.0020306191872805357, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1318 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.262591689825058, + "epoch": 3.4710526315789476, + "grad_norm": 0.00032434010063298047, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.26363037526607513, + "epoch": 3.473684210526316, + "grad_norm": 0.0005222202162258327, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 286.896484375, + "completions/mean_terminated_length": 286.896484375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.2641294151544571, + "epoch": 3.4763157894736842, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.005580322351306677, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 270190207.0, + "reward": 0.759863018989563, + "reward_std": 0.06166636943817139, + "rewards/progression_diversity/mean": -2.578865496616345e-05, + "rewards/progression_diversity/std": 0.0005835306365042925, + "rewards/symbolic_reward_accuracy/mean": 0.806640625, + "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, + "rewards/symbolic_reward_partial_score/mean": 0.9195963144302368, + "rewards/symbolic_reward_partial_score/std": 0.17569199204444885, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0666582584381104, + "sampling/importance_sampling_ratio/min": 0.00020486714493017644, + "sampling/sampling_logp_difference/max": 8.493148803710938, + "sampling/sampling_logp_difference/mean": 0.13263538479804993, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.26649659872055054, + "epoch": 3.4789473684210526, + "grad_norm": 0.004749960731714964, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + 
"entropy": 0.2594447731971741, + "epoch": 3.481578947368421, + "grad_norm": 0.0007304165628738701, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.26071953773498535, + "epoch": 3.4842105263157896, + "grad_norm": 0.005144048947840929, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 612.0, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 283.166015625, + "completions/mean_terminated_length": 283.166015625, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.26111868023872375, + "epoch": 3.486842105263158, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.013651230372488499, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 270757684.0, + "reward": 0.7337884902954102, + "reward_std": 0.054217878729104996, + "rewards/progression_diversity/mean": -6.13354059169069e-05, + "rewards/progression_diversity/std": 0.001387861673720181, + "rewards/symbolic_reward_accuracy/mean": 0.76171875, + "rewards/symbolic_reward_accuracy/std": 0.42644867300987244, + "rewards/symbolic_reward_partial_score/mean": 0.9225260019302368, + "rewards/symbolic_reward_partial_score/std": 0.16891297698020935, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0644073486328125, + "sampling/importance_sampling_ratio/min": 4.552837395976894e-08, + "sampling/sampling_logp_difference/max": 16.904930114746094, + "sampling/sampling_logp_difference/mean": 0.1341029703617096, + 
"step": 1325 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.2623433470726013, + "epoch": 3.4894736842105263, + "grad_norm": 0.004322811029851437, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1326 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.256848469376564, + "epoch": 3.4921052631578946, + "grad_norm": 0.007806100882589817, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1327 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.25882723927497864, + "epoch": 3.4947368421052634, + "grad_norm": 0.00808742269873619, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 281.03515625, + "completions/mean_terminated_length": 281.03515625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.25531578063964844, + "epoch": 3.4973684210526317, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.001909639686346054, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 271302054.0, + "reward": 0.8040038347244263, + "reward_std": 0.03667568787932396, + "rewards/progression_diversity/mean": -7.209268460428575e-06, + "rewards/progression_diversity/std": 0.0001631271152291447, + "rewards/symbolic_reward_accuracy/mean": 0.859375, + "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, 
+ "rewards/symbolic_reward_partial_score/mean": 0.9612630605697632, + "rewards/symbolic_reward_partial_score/std": 0.09996785968542099, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0645685195922852, + "sampling/importance_sampling_ratio/min": 0.0025024518836289644, + "sampling/sampling_logp_difference/max": 5.990484237670898, + "sampling/sampling_logp_difference/mean": 0.1278265118598938, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.25595368444919586, + "epoch": 3.5, + "grad_norm": 0.0006472188397310674, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.2598939687013626, + "epoch": 3.5026315789473683, + "grad_norm": 0.0017913918709382415, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 1331 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.25719694793224335, + "epoch": 3.5052631578947366, + "grad_norm": 0.002060194732621312, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 283.7265625, + "completions/mean_terminated_length": 283.7265625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + 
"entropy": 0.2666303217411041, + "epoch": 3.5078947368421054, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.00912644062191248, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 271841626.0, + "reward": 0.7412598133087158, + "reward_std": 0.023930229246616364, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.775390625, + "rewards/symbolic_reward_accuracy/std": 0.41773295402526855, + "rewards/symbolic_reward_partial_score/mean": 0.9200846552848816, + "rewards/symbolic_reward_partial_score/std": 0.1616220623254776, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0666561126708984, + "sampling/importance_sampling_ratio/min": 0.0017295932630077004, + "sampling/sampling_logp_difference/max": 6.359869003295898, + "sampling/sampling_logp_difference/mean": 0.13231700658798218, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.26160600781440735, + "epoch": 3.5105263157894737, + "grad_norm": 0.0008754784357734025, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.2645406574010849, + "epoch": 3.513157894736842, + "grad_norm": 0.00632048211991787, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1335 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.26721756160259247, + "epoch": 3.515789473684211, + "grad_norm": 0.0029721923638135195, + 
"learning_rate": 1e-06, + "loss": 0.0006, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 280.01953125, + "completions/mean_terminated_length": 280.01953125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.26314379274845123, + "epoch": 3.518421052631579, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0004468945844564587, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 272389124.0, + "reward": 0.7737305164337158, + "reward_std": 0.005078125279396772, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.810546875, + "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, + "rewards/symbolic_reward_partial_score/mean": 0.9580078125, + "rewards/symbolic_reward_partial_score/std": 0.09104800969362259, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0662033557891846, + "sampling/importance_sampling_ratio/min": 0.0034528106916695833, + "sampling/sampling_logp_difference/max": 5.668566703796387, + "sampling/sampling_logp_difference/mean": 0.13059526681900024, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.262877956032753, + "epoch": 3.5210526315789474, + "grad_norm": 0.005230678245425224, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.015625, + "entropy": 0.25840798020362854, + "epoch": 3.5236842105263158, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2623683661222458, + "epoch": 3.526315789473684, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 281.23828125, + "completions/mean_terminated_length": 281.23828125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.26039692759513855, + "epoch": 3.5289473684210524, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.0022722186986356974, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 272901054.0, + "reward": 0.7767575979232788, + "reward_std": 0.0289163701236248, + "rewards/progression_diversity/mean": -2.1483450836967677e-05, + "rewards/progression_diversity/std": 0.00048611496458761394, + "rewards/symbolic_reward_accuracy/mean": 0.82421875, + "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, + "rewards/symbolic_reward_partial_score/mean": 0.9407552480697632, + "rewards/symbolic_reward_partial_score/std": 0.13769857585430145, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.064164400100708, + "sampling/importance_sampling_ratio/min": 0.00021925149485468864, + "sampling/sampling_logp_difference/max": 8.425291061401367, + "sampling/sampling_logp_difference/mean": 
0.12925145030021667, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.26014891266822815, + "epoch": 3.531578947368421, + "grad_norm": 0.0019025214714929461, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.26306456327438354, + "epoch": 3.5342105263157895, + "grad_norm": 0.00047899127821438015, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.25312456488609314, + "epoch": 3.536842105263158, + "grad_norm": 0.006323935464024544, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 272.009765625, + "completions/mean_terminated_length": 272.009765625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.26376163959503174, + "epoch": 3.5394736842105265, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.0009184160153381526, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 273438275.0, + "reward": 0.79150390625, + "reward_std": 0.019352849572896957, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.84765625, + "rewards/symbolic_reward_accuracy/std": 
0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.9430338144302368, + "rewards/symbolic_reward_partial_score/std": 0.15738774836063385, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0648845434188843, + "sampling/importance_sampling_ratio/min": 0.004287427756935358, + "sampling/sampling_logp_difference/max": 5.452068328857422, + "sampling/sampling_logp_difference/mean": 0.130482017993927, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.26225438714027405, + "epoch": 3.542105263157895, + "grad_norm": 0.0004132894682697952, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2599750906229019, + "epoch": 3.544736842105263, + "grad_norm": 0.0004837804299313575, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.2639758139848709, + "epoch": 3.5473684210526315, + "grad_norm": 0.003910145256668329, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 282.12109375, + "completions/mean_terminated_length": 282.12109375, + "completions/min_length": 153.0, + 
"completions/min_terminated_length": 153.0, + "entropy": 0.2593047469854355, + "epoch": 3.55, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.0070974016562104225, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 273977185.0, + "reward": 0.7821289300918579, + "reward_std": 0.03948177397251129, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.837890625, + "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, + "rewards/symbolic_reward_partial_score/mean": 0.9313150644302368, + "rewards/symbolic_reward_partial_score/std": 0.17086125910282135, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0641109943389893, + "sampling/importance_sampling_ratio/min": 0.0002722894714679569, + "sampling/sampling_logp_difference/max": 8.20864486694336, + "sampling/sampling_logp_difference/mean": 0.13093721866607666, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.26446548104286194, + "epoch": 3.5526315789473686, + "grad_norm": 0.0035018729977309704, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.25814053416252136, + "epoch": 3.555263157894737, + "grad_norm": 0.00219136499799788, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.2600250542163849, + "epoch": 3.557894736842105, + "grad_norm": 
0.0010674957884475589, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 276.349609375, + "completions/mean_terminated_length": 276.349609375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.2562939524650574, + "epoch": 3.5605263157894735, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0012435732642188668, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 274526996.0, + "reward": 0.7630362510681152, + "reward_std": 0.016148678958415985, + "rewards/progression_diversity/mean": -8.660568710183725e-05, + "rewards/progression_diversity/std": 0.0017533308127894998, + "rewards/symbolic_reward_accuracy/mean": 0.802734375, + "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, + "rewards/symbolic_reward_partial_score/mean": 0.93798828125, + "rewards/symbolic_reward_partial_score/std": 0.14083407819271088, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0641676187515259, + "sampling/importance_sampling_ratio/min": 7.245798315125285e-06, + "sampling/sampling_logp_difference/max": 11.835088729858398, + "sampling/sampling_logp_difference/mean": 0.13085180521011353, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.26552921533584595, + "epoch": 3.5631578947368423, + "grad_norm": 0.0011110203340649605, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.75, + 
"clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.2580649256706238, + "epoch": 3.5657894736842106, + "grad_norm": 0.0030603648629039526, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1355 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.26225745677948, + "epoch": 3.568421052631579, + "grad_norm": 0.0005105588352307677, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15014.0, + "completions/max_terminated_length": 15014.0, + "completions/mean_length": 303.72265625, + "completions/mean_terminated_length": 303.72265625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.2583850473165512, + "epoch": 3.5710526315789473, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.002601654501631856, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 275084966.0, + "reward": 0.7722633481025696, + "reward_std": 0.02485058829188347, + "rewards/progression_diversity/mean": -0.00022908755636308342, + "rewards/progression_diversity/std": 0.005183659493923187, + "rewards/symbolic_reward_accuracy/mean": 0.822265625, + "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, + "rewards/symbolic_reward_partial_score/mean": 0.9303385615348816, + "rewards/symbolic_reward_partial_score/std": 0.16154132783412933, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0623509883880615, + 
"sampling/importance_sampling_ratio/min": 0.0005308397230692208, + "sampling/sampling_logp_difference/max": 7.541050434112549, + "sampling/sampling_logp_difference/mean": 0.12857502698898315, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.257750928401947, + "epoch": 3.5736842105263156, + "grad_norm": 0.007370724342763424, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.25655198097229004, + "epoch": 3.5763157894736843, + "grad_norm": 0.0012657454935833812, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.2600477933883667, + "epoch": 3.5789473684210527, + "grad_norm": 0.0008869823068380356, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 271.412109375, + "completions/mean_terminated_length": 271.412109375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.25901901721954346, + "epoch": 3.581578947368421, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.004224944394081831, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 275619737.0, + "reward": 0.738232433795929, + "reward_std": 0.052099138498306274, + "rewards/progression_diversity/mean": 
0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.767578125, + "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, + "rewards/symbolic_reward_partial_score/mean": 0.9256184697151184, + "rewards/symbolic_reward_partial_score/std": 0.15123844146728516, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0638682842254639, + "sampling/importance_sampling_ratio/min": 0.0003394426894374192, + "sampling/sampling_logp_difference/max": 7.988205432891846, + "sampling/sampling_logp_difference/mean": 0.13007181882858276, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2613148093223572, + "epoch": 3.5842105263157897, + "grad_norm": 0.0035895437467843294, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.26094360649585724, + "epoch": 3.586842105263158, + "grad_norm": 0.002421436831355095, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.2611810863018036, + "epoch": 3.5894736842105264, + "grad_norm": 0.004130725748836994, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 
594.0, + "completions/mean_length": 276.9140625, + "completions/mean_terminated_length": 276.9140625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.262939915060997, + "epoch": 3.5921052631578947, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.003355985274538398, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 276174669.0, + "reward": 0.7457025051116943, + "reward_std": 0.07194074988365173, + "rewards/progression_diversity/mean": -6.150803528726101e-05, + "rewards/progression_diversity/std": 0.0010658144019544125, + "rewards/symbolic_reward_accuracy/mean": 0.783203125, + "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, + "rewards/symbolic_reward_partial_score/mean": 0.9192708730697632, + "rewards/symbolic_reward_partial_score/std": 0.16867269575595856, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0644850730895996, + "sampling/importance_sampling_ratio/min": 0.0005842425744049251, + "sampling/sampling_logp_difference/max": 7.445194244384766, + "sampling/sampling_logp_difference/mean": 0.13014239072799683, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.2584049552679062, + "epoch": 3.594736842105263, + "grad_norm": 0.008518553338944912, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1366 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.2618335336446762, + "epoch": 3.5973684210526313, + "grad_norm": 0.0025971720460802317, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.125, + 
"clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.26184357702732086, + "epoch": 3.6, + "grad_norm": 0.001905369688756764, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 279.400390625, + "completions/mean_terminated_length": 279.400390625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.26188020408153534, + "epoch": 3.6026315789473684, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.004297807812690735, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 276700858.0, + "reward": 0.7915037870407104, + "reward_std": 0.039244234561920166, + "rewards/progression_diversity/mean": -1.5363497368525714e-05, + "rewards/progression_diversity/std": 0.00034763626172207296, + "rewards/symbolic_reward_accuracy/mean": 0.849609375, + "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, + "rewards/symbolic_reward_partial_score/mean": 0.9391275644302368, + "rewards/symbolic_reward_partial_score/std": 0.15407440066337585, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.065136194229126, + "sampling/importance_sampling_ratio/min": 0.0005955998785793781, + "sampling/sampling_logp_difference/max": 7.425941467285156, + "sampling/sampling_logp_difference/mean": 0.13098467886447906, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.26105809211730957, + "epoch": 
3.6052631578947367, + "grad_norm": 0.004089301452040672, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.2629578411579132, + "epoch": 3.6078947368421055, + "grad_norm": 0.006167134270071983, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.25889579951763153, + "epoch": 3.610526315789474, + "grad_norm": 0.006159425713121891, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 282.240234375, + "completions/mean_terminated_length": 282.240234375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.2573448568582535, + "epoch": 3.613157894736842, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.011007736437022686, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 277228821.0, + "reward": 0.7931638956069946, + "reward_std": 0.0751810073852539, + "rewards/progression_diversity/mean": -1.619287286303006e-05, + "rewards/progression_diversity/std": 0.00036640287726186216, + "rewards/symbolic_reward_accuracy/mean": 0.8515625, + "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, + "rewards/symbolic_reward_partial_score/mean": 0.9407552480697632, + "rewards/symbolic_reward_partial_score/std": 0.15025249123573303, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0642564296722412, + "sampling/importance_sampling_ratio/min": 9.635853348299861e-05, + "sampling/sampling_logp_difference/max": 9.247434616088867, + "sampling/sampling_logp_difference/mean": 0.1293666511774063, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.26405245065689087, + "epoch": 3.6157894736842104, + "grad_norm": 0.00313938083127141, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.26752831041812897, + "epoch": 3.6184210526315788, + "grad_norm": 0.0031525518279522657, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.2577017545700073, + "epoch": 3.6210526315789475, + "grad_norm": 0.002522657625377178, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 279.568359375, + "completions/mean_terminated_length": 279.568359375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.26175713539123535, + "epoch": 3.623684210526316, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.004775851499289274, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 
277769592.0, + "reward": 0.8418935537338257, + "reward_std": 0.05553803592920303, + "rewards/progression_diversity/mean": -9.922643948812038e-05, + "rewards/progression_diversity/std": 0.0015981434844434261, + "rewards/symbolic_reward_accuracy/mean": 0.916015625, + "rewards/symbolic_reward_accuracy/std": 0.2776356339454651, + "rewards/symbolic_reward_partial_score/mean": 0.9742838144302368, + "rewards/symbolic_reward_partial_score/std": 0.09493894129991531, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0631273984909058, + "sampling/importance_sampling_ratio/min": 0.0002650808892212808, + "sampling/sampling_logp_difference/max": 8.235475540161133, + "sampling/sampling_logp_difference/mean": 0.1295655369758606, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.25658373534679413, + "epoch": 3.626315789473684, + "grad_norm": 0.0063101500272750854, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1378 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.25902336835861206, + "epoch": 3.6289473684210525, + "grad_norm": 0.0057982527650892735, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.26077018678188324, + "epoch": 3.6315789473684212, + "grad_norm": 0.005432313308119774, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 280.951171875, + "completions/mean_terminated_length": 280.951171875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.25891734659671783, + "epoch": 3.6342105263157896, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.005559414625167847, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 278318751.0, + "reward": 0.7943359613418579, + "reward_std": 0.03447442501783371, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.853515625, + "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, + "rewards/symbolic_reward_partial_score/mean": 0.9407551884651184, + "rewards/symbolic_reward_partial_score/std": 0.1546209156513214, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0632920265197754, + "sampling/importance_sampling_ratio/min": 5.388126373873092e-05, + "sampling/sampling_logp_difference/max": 9.828727722167969, + "sampling/sampling_logp_difference/mean": 0.12783315777778625, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.25600631535053253, + "epoch": 3.636842105263158, + "grad_norm": 0.005643780808895826, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1382 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.2576565742492676, + "epoch": 3.639473684210526, + "grad_norm": 0.0014839650830253959, + 
"learning_rate": 1e-06, + "loss": 0.0002, + "step": 1383 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.25885675847530365, + "epoch": 3.6421052631578945, + "grad_norm": 0.001807388965971768, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 284.384765625, + "completions/mean_terminated_length": 284.384765625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.25956813991069794, + "epoch": 3.6447368421052633, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.005248944275081158, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 278891204.0, + "reward": 0.7840330004692078, + "reward_std": 0.05429323390126228, + "rewards/progression_diversity/mean": -2.221354043285828e-05, + "rewards/progression_diversity/std": 0.000469754304504022, + "rewards/symbolic_reward_accuracy/mean": 0.833984375, + "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, + "rewards/symbolic_reward_partial_score/mean": 0.9454752206802368, + "rewards/symbolic_reward_partial_score/std": 0.13321354985237122, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.063035488128662, + "sampling/importance_sampling_ratio/min": 0.00014696984726469964, + "sampling/sampling_logp_difference/max": 8.82528305053711, + "sampling/sampling_logp_difference/mean": 0.1286298781633377, + "step": 1385 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + 
"clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.25760167837142944, + "epoch": 3.6473684210526316, + "grad_norm": 0.0028908655513077974, + "learning_rate": 1e-06, + "loss": 0.0017, + "step": 1386 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.2583495229482651, + "epoch": 3.65, + "grad_norm": 0.0017066035652533174, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1387 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2530466616153717, + "epoch": 3.6526315789473687, + "grad_norm": 0.0022362645249813795, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 288.814453125, + "completions/mean_terminated_length": 288.814453125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.2583690285682678, + "epoch": 3.655263157894737, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.005710822995752096, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 279461253.0, + "reward": 0.7866692543029785, + "reward_std": 0.05079422518610954, + "rewards/progression_diversity/mean": -7.545309927081689e-05, + "rewards/progression_diversity/std": 0.0013873358257114887, + "rewards/symbolic_reward_accuracy/mean": 0.83203125, + "rewards/symbolic_reward_accuracy/std": 0.374204158782959, + "rewards/symbolic_reward_partial_score/mean": 0.9581705331802368, + 
"rewards/symbolic_reward_partial_score/std": 0.10960590839385986, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0640332698822021, + "sampling/importance_sampling_ratio/min": 0.0019857485312968493, + "sampling/sampling_logp_difference/max": 6.22175931930542, + "sampling/sampling_logp_difference/mean": 0.12767165899276733, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.257061704993248, + "epoch": 3.6578947368421053, + "grad_norm": 0.004931934643536806, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1390 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.2629953771829605, + "epoch": 3.6605263157894736, + "grad_norm": 0.0034470842219889164, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1391 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.26158788800239563, + "epoch": 3.663157894736842, + "grad_norm": 0.0022328526247292757, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 286.408203125, + "completions/mean_terminated_length": 286.408203125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.2523958906531334, + "epoch": 
3.6657894736842103, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.005374122876673937, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 279998006.0, + "reward": 0.8294917345046997, + "reward_std": 0.03202075511217117, + "rewards/progression_diversity/mean": -4.53010288765654e-05, + "rewards/progression_diversity/std": 0.0010250452905893326, + "rewards/symbolic_reward_accuracy/mean": 0.90234375, + "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, + "rewards/symbolic_reward_partial_score/mean": 0.9602864384651184, + "rewards/symbolic_reward_partial_score/std": 0.13371798396110535, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0621941089630127, + "sampling/importance_sampling_ratio/min": 0.00307788816280663, + "sampling/sampling_logp_difference/max": 5.783511638641357, + "sampling/sampling_logp_difference/mean": 0.12583598494529724, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2517199218273163, + "epoch": 3.668421052631579, + "grad_norm": 0.0023138711694628, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1394 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.2528526932001114, + "epoch": 3.6710526315789473, + "grad_norm": 0.003128192387521267, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.25596775114536285, + "epoch": 3.6736842105263157, + "grad_norm": 0.0009576270822435617, + "learning_rate": 1e-06, + 
"loss": -0.0009, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 868.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 293.642578125, + "completions/mean_terminated_length": 293.642578125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.25471819937229156, + "epoch": 3.6763157894736844, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.0031971256248652935, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 280548831.0, + "reward": 0.8655747175216675, + "reward_std": 0.0322401225566864, + "rewards/progression_diversity/mean": -0.0001476671895943582, + "rewards/progression_diversity/std": 0.0032431327272206545, + "rewards/symbolic_reward_accuracy/mean": 0.951171875, + "rewards/symbolic_reward_accuracy/std": 0.2157193273305893, + "rewards/symbolic_reward_partial_score/mean": 0.98291015625, + "rewards/symbolic_reward_partial_score/std": 0.0817250907421112, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.062921404838562, + "sampling/importance_sampling_ratio/min": 0.0002203639887738973, + "sampling/sampling_logp_difference/max": 8.4202299118042, + "sampling/sampling_logp_difference/mean": 0.1252584308385849, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2561066001653671, + "epoch": 3.6789473684210527, + "grad_norm": 0.004226773511618376, + "learning_rate": 1e-06, + "loss": 0.0026, + "step": 1398 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.25608038902282715, + "epoch": 3.681578947368421, + "grad_norm": 0.0008029827149584889, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1399 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.24976499378681183, + "epoch": 3.6842105263157894, + "grad_norm": 0.0008318249019794166, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 291.421875, + "completions/mean_terminated_length": 291.421875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.2544937878847122, + "epoch": 3.6868421052631577, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.00313924765214324, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 281088311.0, + "reward": 0.8208984732627869, + "reward_std": 0.05074112489819527, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.890625, + "rewards/symbolic_reward_accuracy/std": 0.31241437792778015, + "rewards/symbolic_reward_partial_score/mean": 0.955078125, + "rewards/symbolic_reward_partial_score/std": 0.14979958534240723, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0617702007293701, + "sampling/importance_sampling_ratio/min": 1.9943448933190666e-05, + "sampling/sampling_logp_difference/max": 10.822609901428223, + "sampling/sampling_logp_difference/mean": 
0.12716618180274963, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.2559308558702469, + "epoch": 3.6894736842105265, + "grad_norm": 0.001412192708812654, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.25313878059387207, + "epoch": 3.692105263157895, + "grad_norm": 0.0063199615105986595, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2549208104610443, + "epoch": 3.694736842105263, + "grad_norm": 0.004130657762289047, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 291.791015625, + "completions/mean_terminated_length": 291.791015625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.25114137679338455, + "epoch": 3.6973684210526314, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.0008531836792826653, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 281645004.0, + "reward": 0.8064441084861755, + "reward_std": 0.011723745614290237, + "rewards/progression_diversity/mean": -0.00012486957712098956, + "rewards/progression_diversity/std": 0.002825475763529539, + "rewards/symbolic_reward_accuracy/mean": 0.87109375, + 
"rewards/symbolic_reward_accuracy/std": 0.33542385697364807, + "rewards/symbolic_reward_partial_score/mean": 0.9459635615348816, + "rewards/symbolic_reward_partial_score/std": 0.14375825226306915, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0619075298309326, + "sampling/importance_sampling_ratio/min": 0.00024697749176993966, + "sampling/sampling_logp_difference/max": 8.30621337890625, + "sampling/sampling_logp_difference/mean": 0.12634673714637756, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.25604428350925446, + "epoch": 3.7, + "grad_norm": 0.00038085339474491775, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.25144171714782715, + "epoch": 3.7026315789473685, + "grad_norm": 0.0005420586676336825, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.25223299860954285, + "epoch": 3.705263157894737, + "grad_norm": 0.0004907060065306723, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 291.05078125, + "completions/mean_terminated_length": 291.05078125, + 
"completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.24966774135828018, + "epoch": 3.707894736842105, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.002532323356717825, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 282182150.0, + "reward": 0.8462873697280884, + "reward_std": 0.05212379992008209, + "rewards/progression_diversity/mean": -0.00016891930135898292, + "rewards/progression_diversity/std": 0.0038222072180360556, + "rewards/symbolic_reward_accuracy/mean": 0.923828125, + "rewards/symbolic_reward_accuracy/std": 0.26553234457969666, + "rewards/symbolic_reward_partial_score/mean": 0.9733072519302368, + "rewards/symbolic_reward_partial_score/std": 0.10344868153333664, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.06183922290802, + "sampling/importance_sampling_ratio/min": 0.0015853223158046603, + "sampling/sampling_logp_difference/max": 6.446967601776123, + "sampling/sampling_logp_difference/mean": 0.12602993845939636, + "step": 1409 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.2533850073814392, + "epoch": 3.7105263157894735, + "grad_norm": 0.00075558852404356, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.25219717621803284, + "epoch": 3.713157894736842, + "grad_norm": 0.004231073893606663, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 1411 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 
0.25498561561107635, + "epoch": 3.7157894736842105, + "grad_norm": 0.0037051881663501263, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 289.005859375, + "completions/mean_terminated_length": 289.005859375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.24799177050590515, + "epoch": 3.718421052631579, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.003672688500955701, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 282737257.0, + "reward": 0.7589839696884155, + "reward_std": 0.0520077683031559, + "rewards/progression_diversity/mean": -3.9446502341888845e-05, + "rewards/progression_diversity/std": 0.0007376011344604194, + "rewards/symbolic_reward_accuracy/mean": 0.796875, + "rewards/symbolic_reward_accuracy/std": 0.4027182459831238, + "rewards/symbolic_reward_partial_score/mean": 0.9361979365348816, + "rewards/symbolic_reward_partial_score/std": 0.14094945788383484, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0618805885314941, + "sampling/importance_sampling_ratio/min": 0.00025599630316719413, + "sampling/sampling_logp_difference/max": 8.270347595214844, + "sampling/sampling_logp_difference/mean": 0.12418229877948761, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.2508721351623535, + "epoch": 3.7210526315789476, + "grad_norm": 0.0015133292181417346, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 
1414 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.2473483458161354, + "epoch": 3.723684210526316, + "grad_norm": 0.003211608622223139, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1415 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.2483832836151123, + "epoch": 3.7263157894736842, + "grad_norm": 0.001318311900831759, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 290.224609375, + "completions/mean_terminated_length": 290.224609375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.2575960010290146, + "epoch": 3.7289473684210526, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.0016101644141599536, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 283283164.0, + "reward": 0.8690425753593445, + "reward_std": 0.04031773656606674, + "rewards/progression_diversity/mean": -4.095973417861387e-05, + "rewards/progression_diversity/std": 0.0009268129942938685, + "rewards/symbolic_reward_accuracy/mean": 0.958984375, + "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, + "rewards/symbolic_reward_partial_score/mean": 0.9788411855697632, + "rewards/symbolic_reward_partial_score/std": 0.10289039462804794, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.061889886856079, + 
"sampling/importance_sampling_ratio/min": 6.194894922373351e-06, + "sampling/sampling_logp_difference/max": 11.991785049438477, + "sampling/sampling_logp_difference/mean": 0.12483149766921997, + "step": 1417 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.2505711019039154, + "epoch": 3.731578947368421, + "grad_norm": 0.0009705891134217381, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.25299596786499023, + "epoch": 3.734210526315789, + "grad_norm": 0.004146168474107981, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.25464165210723877, + "epoch": 3.736842105263158, + "grad_norm": 0.0012070556404069066, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 294.47265625, + "completions/mean_terminated_length": 294.47265625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.2556218057870865, + "epoch": 3.7394736842105263, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.0047198995016515255, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 283829902.0, + "reward": 0.7914550304412842, + "reward_std": 0.037014544010162354, + "rewards/progression_diversity/mean": 
-1.3498356565833092e-05, + "rewards/progression_diversity/std": 0.00030543291359208524, + "rewards/symbolic_reward_accuracy/mean": 0.853515625, + "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, + "rewards/symbolic_reward_partial_score/mean": 0.93115234375, + "rewards/symbolic_reward_partial_score/std": 0.17769792675971985, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.062889814376831, + "sampling/importance_sampling_ratio/min": 0.00024652990396134555, + "sampling/sampling_logp_difference/max": 8.308027267456055, + "sampling/sampling_logp_difference/mean": 0.12772433459758759, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.2585349977016449, + "epoch": 3.7421052631578946, + "grad_norm": 0.001490709139034152, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1422 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.2560439109802246, + "epoch": 3.7447368421052634, + "grad_norm": 0.0030670189298689365, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.2598079591989517, + "epoch": 3.7473684210526317, + "grad_norm": 0.006503027398139238, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, 
+ "completions/max_terminated_length": 479.0, + "completions/mean_length": 316.50390625, + "completions/mean_terminated_length": 285.0606689453125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.24654557555913925, + "epoch": 3.75, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0053245374001562595, + "learning_rate": 1e-06, + "loss": 0.0145, + "num_tokens": 284389744.0, + "reward": 0.8325069546699524, + "reward_std": 0.039236586540937424, + "rewards/progression_diversity/mean": -0.0012605031952261925, + "rewards/progression_diversity/std": 0.028521930798888206, + "rewards/symbolic_reward_accuracy/mean": 0.900390625, + "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, + "rewards/symbolic_reward_partial_score/mean": 0.9749348759651184, + "rewards/symbolic_reward_partial_score/std": 0.09453998506069183, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0596258640289307, + "sampling/importance_sampling_ratio/min": 0.0023435140028595924, + "sampling/sampling_logp_difference/max": 6.056103706359863, + "sampling/sampling_logp_difference/mean": 0.12060326337814331, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.25124719738960266, + "epoch": 3.7526315789473683, + "grad_norm": 0.0019301900174468756, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1426 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.2514963522553444, + "epoch": 3.7552631578947366, + "grad_norm": 0.0022078719921410084, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1427 + }, + { + 
"clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2516591548919678, + "epoch": 3.7578947368421054, + "grad_norm": 0.0018776309443637729, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 288.1328125, + "completions/mean_terminated_length": 288.1328125, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.2531207203865051, + "epoch": 3.7605263157894737, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.0015241794753819704, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 284912020.0, + "reward": 0.852148175239563, + "reward_std": 0.033069413155317307, + "rewards/progression_diversity/mean": -2.7848042009281926e-05, + "rewards/progression_diversity/std": 0.0006301292451098561, + "rewards/symbolic_reward_accuracy/mean": 0.93359375, + "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, + "rewards/symbolic_reward_partial_score/mean": 0.9733073115348816, + "rewards/symbolic_reward_partial_score/std": 0.10501327365636826, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.060740351676941, + "sampling/importance_sampling_ratio/min": 1.1205984264961444e-06, + "sampling/sampling_logp_difference/max": 13.701647758483887, + "sampling/sampling_logp_difference/mean": 0.1257631629705429, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0703125, + "entropy": 0.2528574913740158, + "epoch": 3.763157894736842, + "grad_norm": 0.0046994988806545734, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2516646832227707, + "epoch": 3.765789473684211, + "grad_norm": 0.0015605302760377526, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.25141212344169617, + "epoch": 3.768421052631579, + "grad_norm": 0.0009051641100086272, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 286.068359375, + "completions/mean_terminated_length": 286.068359375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.24804198741912842, + "epoch": 3.7710526315789474, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.004949395544826984, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 285463479.0, + "reward": 0.7871561646461487, + "reward_std": 0.07125456631183624, + "rewards/progression_diversity/mean": -0.0002128158521372825, + "rewards/progression_diversity/std": 0.004131709225475788, + "rewards/symbolic_reward_accuracy/mean": 0.833984375, + "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, + "rewards/symbolic_reward_partial_score/mean": 0.9558919668197632, + "rewards/symbolic_reward_partial_score/std": 0.1025310531258583, + 
"rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.060426950454712, + "sampling/importance_sampling_ratio/min": 0.000660174002405256, + "sampling/sampling_logp_difference/max": 7.323007106781006, + "sampling/sampling_logp_difference/mean": 0.12435492873191833, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.24799808859825134, + "epoch": 3.7736842105263158, + "grad_norm": 0.011806968599557877, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 1434 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.24940943717956543, + "epoch": 3.776315789473684, + "grad_norm": 0.003585018450394273, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.24540776759386063, + "epoch": 3.7789473684210524, + "grad_norm": 0.006892574485391378, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 284.197265625, + "completions/mean_terminated_length": 284.197265625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.2505408376455307, + "epoch": 3.781578947368421, + "frac_reward_zero_std": 0.90625, + "grad_norm": 
0.0029755232390016317, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 286036476.0, + "reward": 0.7453613877296448, + "reward_std": 0.03177817165851593, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.78125, + "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, + "rewards/symbolic_reward_partial_score/mean": 0.9220377206802368, + "rewards/symbolic_reward_partial_score/std": 0.16653913259506226, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0615108013153076, + "sampling/importance_sampling_ratio/min": 0.0015313836047425866, + "sampling/sampling_logp_difference/max": 6.481583595275879, + "sampling/sampling_logp_difference/mean": 0.12580080330371857, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.24840565025806427, + "epoch": 3.7842105263157895, + "grad_norm": 0.001804483006708324, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.25283022224903107, + "epoch": 3.786842105263158, + "grad_norm": 0.002521629212424159, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2502717077732086, + "epoch": 3.7894736842105265, + "grad_norm": 0.0006864585448056459, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 281.96484375, + "completions/mean_terminated_length": 281.96484375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.2550983428955078, + "epoch": 3.792105263157895, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.0033912304788827896, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 286587978.0, + "reward": 0.7921856641769409, + "reward_std": 0.04257185012102127, + "rewards/progression_diversity/mean": -0.00018314375483896583, + "rewards/progression_diversity/std": 0.0032831577118486166, + "rewards/symbolic_reward_accuracy/mean": 0.84375, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.953125, + "rewards/symbolic_reward_partial_score/std": 0.11922687292098999, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.061973214149475, + "sampling/importance_sampling_ratio/min": 0.0007712719962000847, + "sampling/sampling_logp_difference/max": 7.167469501495361, + "sampling/sampling_logp_difference/mean": 0.12495287507772446, + "step": 1441 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.2525986135005951, + "epoch": 3.794736842105263, + "grad_norm": 0.0032344043720513582, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2523139417171478, + "epoch": 
3.7973684210526315, + "grad_norm": 0.003199146594852209, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.24989847838878632, + "epoch": 3.8, + "grad_norm": 0.0028786209877580404, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 281.888671875, + "completions/mean_terminated_length": 281.888671875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.24845003336668015, + "epoch": 3.8026315789473686, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.0037182525265961885, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 287115921.0, + "reward": 0.8478025197982788, + "reward_std": 0.02462989278137684, + "rewards/progression_diversity/mean": -2.4990213205455802e-05, + "rewards/progression_diversity/std": 0.0005654640262946486, + "rewards/symbolic_reward_accuracy/mean": 0.92578125, + "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, + "rewards/symbolic_reward_partial_score/mean": 0.9744466543197632, + "rewards/symbolic_reward_partial_score/std": 0.09773314744234085, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0622966289520264, + "sampling/importance_sampling_ratio/min": 0.0008701475453563035, + "sampling/sampling_logp_difference/max": 7.046847820281982, + "sampling/sampling_logp_difference/mean": 0.12369303405284882, + "step": 1445 + }, + { + "clip_ratio/high_max": 
1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.2537457197904587, + "epoch": 3.805263157894737, + "grad_norm": 0.000904034823179245, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1446 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.24874110519886017, + "epoch": 3.807894736842105, + "grad_norm": 0.0007011942798271775, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1447 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.24791067093610764, + "epoch": 3.8105263157894735, + "grad_norm": 0.0011079704854637384, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 308.1796875, + "completions/mean_terminated_length": 276.72015380859375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.24869465827941895, + "epoch": 3.8131578947368423, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.006853095255792141, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 287657805.0, + "reward": 0.8094131350517273, + "reward_std": 0.04773065820336342, + "rewards/progression_diversity/mean": -0.001070742728188634, + "rewards/progression_diversity/std": 0.024228140711784363, + "rewards/symbolic_reward_accuracy/mean": 0.876953125, + "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, + 
"rewards/symbolic_reward_partial_score/mean": 0.9441732168197632, + "rewards/symbolic_reward_partial_score/std": 0.15558461844921112, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0597963333129883, + "sampling/importance_sampling_ratio/min": 0.003406539326533675, + "sampling/sampling_logp_difference/max": 5.682058334350586, + "sampling/sampling_logp_difference/mean": 0.11979185044765472, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.2493317946791649, + "epoch": 3.8157894736842106, + "grad_norm": 0.0018536460120230913, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.24266890436410904, + "epoch": 3.818421052631579, + "grad_norm": 0.00839342549443245, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.24180525541305542, + "epoch": 3.8210526315789473, + "grad_norm": 0.0011780494824051857, + "learning_rate": 1e-06, + "loss": 0.0278, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 283.44140625, + "completions/mean_terminated_length": 283.44140625, + "completions/min_length": 166.0, + 
"completions/min_terminated_length": 166.0, + "entropy": 0.2568906396627426, + "epoch": 3.8236842105263156, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.0014739191392436624, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 288214255.0, + "reward": 0.8007813096046448, + "reward_std": 0.04135853052139282, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.85546875, + "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, + "rewards/symbolic_reward_partial_score/mean": 0.9583333134651184, + "rewards/symbolic_reward_partial_score/std": 0.11808153241872787, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0631709098815918, + "sampling/importance_sampling_ratio/min": 0.0010668974136933684, + "sampling/sampling_logp_difference/max": 6.843000411987305, + "sampling/sampling_logp_difference/mean": 0.1276545524597168, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.25487686693668365, + "epoch": 3.8263157894736843, + "grad_norm": 0.007138120010495186, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2546128034591675, + "epoch": 3.8289473684210527, + "grad_norm": 0.001334945554845035, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1455 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.2549200505018234, + "epoch": 3.831578947368421, + "grad_norm": 
0.0048521822318434715, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 277.6171875, + "completions/mean_terminated_length": 277.6171875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.2548879086971283, + "epoch": 3.8342105263157897, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.0043244436383247375, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 288753867.0, + "reward": 0.76708984375, + "reward_std": 0.04385879635810852, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.8125, + "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, + "rewards/symbolic_reward_partial_score/mean": 0.9319661259651184, + "rewards/symbolic_reward_partial_score/std": 0.15669545531272888, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.062173843383789, + "sampling/importance_sampling_ratio/min": 0.004508232232183218, + "sampling/sampling_logp_difference/max": 5.40185022354126, + "sampling/sampling_logp_difference/mean": 0.12663015723228455, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.24473460018634796, + "epoch": 3.836842105263158, + "grad_norm": 0.001730249379761517, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1458 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0078125, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.2515050768852234, + "epoch": 3.8394736842105264, + "grad_norm": 0.002126506296917796, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1459 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.2557496428489685, + "epoch": 3.8421052631578947, + "grad_norm": 0.004039853345602751, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 278.25390625, + "completions/mean_terminated_length": 278.25390625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.2558761090040207, + "epoch": 3.844736842105263, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.003505394095554948, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 289297133.0, + "reward": 0.8289056420326233, + "reward_std": 0.05454582720994949, + "rewards/progression_diversity/mean": -6.592483259737492e-05, + "rewards/progression_diversity/std": 0.0011662401957437396, + "rewards/symbolic_reward_accuracy/mean": 0.900390625, + "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, + "rewards/symbolic_reward_partial_score/mean": 0.9622395634651184, + "rewards/symbolic_reward_partial_score/std": 0.12198054045438766, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0614289045333862, + "sampling/importance_sampling_ratio/min": 0.0010322880698367953, + "sampling/sampling_logp_difference/max": 6.875977516174316, + 
"sampling/sampling_logp_difference/mean": 0.1257316619157791, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.2523561343550682, + "epoch": 3.8473684210526313, + "grad_norm": 0.006407232955098152, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.24898836761713028, + "epoch": 3.85, + "grad_norm": 0.006468737497925758, + "learning_rate": 1e-06, + "loss": 0.0016, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.25089330971241, + "epoch": 3.8526315789473684, + "grad_norm": 0.002999432384967804, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 276.68359375, + "completions/mean_terminated_length": 276.68359375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.2475588619709015, + "epoch": 3.8552631578947367, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.004880095832049847, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 289843947.0, + "reward": 0.8241690397262573, + "reward_std": 0.03976859897375107, + "rewards/progression_diversity/mean": -8.7072177848313e-05, + "rewards/progression_diversity/std": 0.001970218727365136, + "rewards/symbolic_reward_accuracy/mean": 0.890625, + 
"rewards/symbolic_reward_accuracy/std": 0.31241437792778015, + "rewards/symbolic_reward_partial_score/mean": 0.9659830331802368, + "rewards/symbolic_reward_partial_score/std": 0.11252343654632568, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0622788667678833, + "sampling/importance_sampling_ratio/min": 0.00028998416382819414, + "sampling/sampling_logp_difference/max": 8.145684242248535, + "sampling/sampling_logp_difference/mean": 0.126849964261055, + "step": 1465 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.2497614100575447, + "epoch": 3.8578947368421055, + "grad_norm": 0.0029560329858213663, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.2516980916261673, + "epoch": 3.860526315789474, + "grad_norm": 0.004405204672366381, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.25012845546007156, + "epoch": 3.863157894736842, + "grad_norm": 0.0012873845407739282, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 278.521484375, + "completions/mean_terminated_length": 278.521484375, + 
"completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.2574310898780823, + "epoch": 3.8657894736842104, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.008448783308267593, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 290384342.0, + "reward": 0.82958984375, + "reward_std": 0.09687106311321259, + "rewards/progression_diversity/mean": -4.6260124690888915e-06, + "rewards/progression_diversity/std": 0.00010467470565345138, + "rewards/symbolic_reward_accuracy/mean": 0.90234375, + "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, + "rewards/symbolic_reward_partial_score/mean": 0.9606119394302368, + "rewards/symbolic_reward_partial_score/std": 0.13187071681022644, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0626296997070312, + "sampling/importance_sampling_ratio/min": 0.0021899458952248096, + "sampling/sampling_logp_difference/max": 6.123878479003906, + "sampling/sampling_logp_difference/mean": 0.1256691813468933, + "step": 1469 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.24842996895313263, + "epoch": 3.8684210526315788, + "grad_norm": 0.0027846877928823233, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1470 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.24932969361543655, + "epoch": 3.8710526315789475, + "grad_norm": 0.004896063357591629, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + 
"entropy": 0.24926616251468658, + "epoch": 3.873684210526316, + "grad_norm": 0.0034752595238387585, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 278.984375, + "completions/mean_terminated_length": 278.984375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.24850311875343323, + "epoch": 3.876315789473684, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.004292353987693787, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 290960366.0, + "reward": 0.7541015148162842, + "reward_std": 0.06326388567686081, + "rewards/progression_diversity/mean": -1.2574851098179352e-05, + "rewards/progression_diversity/std": 0.0002845363924279809, + "rewards/symbolic_reward_accuracy/mean": 0.796875, + "rewards/symbolic_reward_accuracy/std": 0.4027182459831238, + "rewards/symbolic_reward_partial_score/mean": 0.919921875, + "rewards/symbolic_reward_partial_score/std": 0.1701854020357132, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0626215934753418, + "sampling/importance_sampling_ratio/min": 0.00040142651414498687, + "sampling/sampling_logp_difference/max": 7.820486068725586, + "sampling/sampling_logp_difference/mean": 0.12611128389835358, + "step": 1473 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.24769160896539688, + "epoch": 3.8789473684210525, + "grad_norm": 0.0028536503668874502, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1474 
+ }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.2475282847881317, + "epoch": 3.8815789473684212, + "grad_norm": 0.005000830627977848, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1475 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.25134460628032684, + "epoch": 3.8842105263157896, + "grad_norm": 0.0037172201555222273, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1059.0, + "completions/max_terminated_length": 1059.0, + "completions/mean_length": 285.150390625, + "completions/mean_terminated_length": 285.150390625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.2483498975634575, + "epoch": 3.886842105263158, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.00587662635371089, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 291525691.0, + "reward": 0.7465803027153015, + "reward_std": 0.07138986885547638, + "rewards/progression_diversity/mean": -0.00017707535880617797, + "rewards/progression_diversity/std": 0.004006757866591215, + "rewards/symbolic_reward_accuracy/mean": 0.78515625, + "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, + "rewards/symbolic_reward_partial_score/mean": 0.9182943105697632, + "rewards/symbolic_reward_partial_score/std": 0.1653488576412201, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0621373653411865, + 
"sampling/importance_sampling_ratio/min": 0.0009335778304375708, + "sampling/sampling_logp_difference/max": 6.9764862060546875, + "sampling/sampling_logp_difference/mean": 0.12466084212064743, + "step": 1477 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.2482881024479866, + "epoch": 3.889473684210526, + "grad_norm": 0.004389946348965168, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.24733516573905945, + "epoch": 3.8921052631578945, + "grad_norm": 0.002358887577429414, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1479 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.24862007796764374, + "epoch": 3.8947368421052633, + "grad_norm": 0.007056436035782099, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 312.63671875, + "completions/mean_terminated_length": 281.1859130859375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.24648213386535645, + "epoch": 3.8973684210526316, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.006561717949807644, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 292103585.0, + "reward": 0.8242520689964294, + "reward_std": 0.05909993499517441, + 
"rewards/progression_diversity/mean": -0.0015558208106085658, + "rewards/progression_diversity/std": 0.0352042093873024, + "rewards/symbolic_reward_accuracy/mean": 0.892578125, + "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, + "rewards/symbolic_reward_partial_score/mean": 0.9630534052848816, + "rewards/symbolic_reward_partial_score/std": 0.12470652163028717, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0595953464508057, + "sampling/importance_sampling_ratio/min": 0.0015703764511272311, + "sampling/sampling_logp_difference/max": 6.456439971923828, + "sampling/sampling_logp_difference/mean": 0.12068993598222733, + "step": 1481 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.24098284542560577, + "epoch": 3.9, + "grad_norm": 0.0017995714442804456, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 1482 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.2443038821220398, + "epoch": 3.9026315789473687, + "grad_norm": 0.004129770677536726, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 1483 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.2413162887096405, + "epoch": 3.905263157894737, + "grad_norm": 0.00310018309392035, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 280.509765625, + "completions/mean_terminated_length": 280.509765625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.2457173392176628, + "epoch": 3.9078947368421053, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.0024882822763174772, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 292652358.0, + "reward": 0.8500970005989075, + "reward_std": 0.050697989761829376, + "rewards/progression_diversity/mean": -7.189810276031494e-05, + "rewards/progression_diversity/std": 0.0016268682666122913, + "rewards/symbolic_reward_accuracy/mean": 0.927734375, + "rewards/symbolic_reward_accuracy/std": 0.2591804563999176, + "rewards/symbolic_reward_partial_score/mean": 0.9781900644302368, + "rewards/symbolic_reward_partial_score/std": 0.08413910120725632, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0623741149902344, + "sampling/importance_sampling_ratio/min": 0.004645559936761856, + "sampling/sampling_logp_difference/max": 5.371843338012695, + "sampling/sampling_logp_difference/mean": 0.12582653760910034, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.25171560794115067, + "epoch": 3.9105263157894736, + "grad_norm": 0.0059862262569367886, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.24698235094547272, + "epoch": 3.913157894736842, + "grad_norm": 0.0026288467925041914, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 
1487 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.2489272803068161, + "epoch": 3.9157894736842103, + "grad_norm": 0.0037700568791478872, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 280.005859375, + "completions/mean_terminated_length": 280.005859375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.24408536404371262, + "epoch": 3.918421052631579, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.004705451894551516, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 293172681.0, + "reward": 0.790283203125, + "reward_std": 0.05368012189865112, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.84765625, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.93896484375, + "rewards/symbolic_reward_partial_score/std": 0.15343523025512695, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.061496376991272, + "sampling/importance_sampling_ratio/min": 0.0012954623671248555, + "sampling/sampling_logp_difference/max": 6.648887634277344, + "sampling/sampling_logp_difference/mean": 0.12389793992042542, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + 
"entropy": 0.24940645694732666, + "epoch": 3.9210526315789473, + "grad_norm": 0.0035800114274024963, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.24737446755170822, + "epoch": 3.9236842105263157, + "grad_norm": 0.0017300564795732498, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1491 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.24045811593532562, + "epoch": 3.9263157894736844, + "grad_norm": 0.002610960975289345, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 346.404296875, + "completions/mean_terminated_length": 283.51177978515625, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.24617382884025574, + "epoch": 3.9289473684210527, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.007368352729827166, + "learning_rate": 1e-06, + "loss": -0.0017, + "num_tokens": 293738008.0, + "reward": 0.8140473961830139, + "reward_std": 0.11378496885299683, + "rewards/progression_diversity/mean": -0.0015145066427066922, + "rewards/progression_diversity/std": 0.03268362581729889, + "rewards/symbolic_reward_accuracy/mean": 0.88671875, + "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, + "rewards/symbolic_reward_partial_score/mean": 0.9407552480697632, + "rewards/symbolic_reward_partial_score/std": 0.17397615313529968, + "rewards/tag_count_reward/mean": 
-0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.058638572692871, + "sampling/importance_sampling_ratio/min": 0.0024584669154137373, + "sampling/sampling_logp_difference/max": 6.0082173347473145, + "sampling/sampling_logp_difference/mean": 0.1195145696401596, + "step": 1493 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.2411375194787979, + "epoch": 3.931578947368421, + "grad_norm": 0.008139763958752155, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 1494 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.24579878151416779, + "epoch": 3.9342105263157894, + "grad_norm": 0.006455949507653713, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1495 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.24035992473363876, + "epoch": 3.9368421052631577, + "grad_norm": 0.006683530285954475, + "learning_rate": 1e-06, + "loss": 0.0233, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 278.2734375, + "completions/mean_terminated_length": 278.2734375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.2454933151602745, + "epoch": 3.9394736842105265, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0044021462090313435, + 
"learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 294279620.0, + "reward": 0.809814453125, + "reward_std": 0.03176013380289078, + "rewards/progression_diversity/mean": -2.186113533753087e-06, + "rewards/progression_diversity/std": 4.94660998811014e-05, + "rewards/symbolic_reward_accuracy/mean": 0.8671875, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.9650065302848816, + "rewards/symbolic_reward_partial_score/std": 0.09345835447311401, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.061531662940979, + "sampling/importance_sampling_ratio/min": 0.0018202782375738025, + "sampling/sampling_logp_difference/max": 6.308765888214111, + "sampling/sampling_logp_difference/mean": 0.1247473657131195, + "step": 1497 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.2489335760474205, + "epoch": 3.942105263157895, + "grad_norm": 0.0007171111647039652, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1498 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.24430719763040543, + "epoch": 3.944736842105263, + "grad_norm": 0.0038494495674967766, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1499 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.24642930924892426, + "epoch": 3.9473684210526314, + "grad_norm": 0.0003605918900575489, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, 
+ "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 278.025390625, + "completions/mean_terminated_length": 278.025390625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.24727856367826462, + "epoch": 3.95, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.0015909909270703793, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 294838289.0, + "reward": 0.866308331489563, + "reward_std": 0.029097389429807663, + "rewards/progression_diversity/mean": -2.9835084205842577e-05, + "rewards/progression_diversity/std": 0.000618708087131381, + "rewards/symbolic_reward_accuracy/mean": 0.953125, + "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, + "rewards/symbolic_reward_partial_score/mean": 0.9814453125, + "rewards/symbolic_reward_partial_score/std": 0.08775121718645096, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0610827207565308, + "sampling/importance_sampling_ratio/min": 8.675010030856356e-05, + "sampling/sampling_logp_difference/max": 9.352478981018066, + "sampling/sampling_logp_difference/mean": 0.12416176497936249, + "step": 1501 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.24381491541862488, + "epoch": 3.9526315789473685, + "grad_norm": 0.0022116934414952993, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 1502 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.23999455571174622, + "epoch": 
3.955263157894737, + "grad_norm": 0.0011692576808854938, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1503 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.2409554198384285, + "epoch": 3.957894736842105, + "grad_norm": 0.0009114885469898582, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 277.619140625, + "completions/mean_terminated_length": 277.619140625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.24572695791721344, + "epoch": 3.9605263157894735, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.00424238620325923, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 295397230.0, + "reward": 0.8301756381988525, + "reward_std": 0.032026953995227814, + "rewards/progression_diversity/mean": -1.1566195098566823e-05, + "rewards/progression_diversity/std": 0.00026171313947997987, + "rewards/symbolic_reward_accuracy/mean": 0.896484375, + "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, + "rewards/symbolic_reward_partial_score/mean": 0.9742838144302368, + "rewards/symbolic_reward_partial_score/std": 0.08367852121591568, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0614919662475586, + "sampling/importance_sampling_ratio/min": 0.0001828877575462684, + "sampling/sampling_logp_difference/max": 8.606637954711914, + "sampling/sampling_logp_difference/mean": 0.12657146155834198, + "step": 1505 + }, + { + 
"clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.2439851313829422, + "epoch": 3.963157894736842, + "grad_norm": 0.004110608249902725, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 1506 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.24492160975933075, + "epoch": 3.9657894736842105, + "grad_norm": 0.0007968592108227313, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1507 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.2445434406399727, + "epoch": 3.968421052631579, + "grad_norm": 0.0011729674879461527, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 313.478515625, + "completions/mean_terminated_length": 282.02935791015625, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.24183562397956848, + "epoch": 3.9710526315789476, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.009790533222258091, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 295987907.0, + "reward": 0.7889963388442993, + "reward_std": 0.03885188698768616, + "rewards/progression_diversity/mean": -0.0017403773963451385, + "rewards/progression_diversity/std": 0.0372454933822155, + "rewards/symbolic_reward_accuracy/mean": 0.841796875, + "rewards/symbolic_reward_accuracy/std": 0.36528825759887695, + 
"rewards/symbolic_reward_partial_score/mean": 0.9471028447151184, + "rewards/symbolic_reward_partial_score/std": 0.13628406822681427, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0608797073364258, + "sampling/importance_sampling_ratio/min": 0.0006632054573856294, + "sampling/sampling_logp_difference/max": 7.31842565536499, + "sampling/sampling_logp_difference/mean": 0.12243205308914185, + "step": 1509 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.24036569148302078, + "epoch": 3.973684210526316, + "grad_norm": 0.0021023740991950035, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.24389879405498505, + "epoch": 3.9763157894736842, + "grad_norm": 0.003945766016840935, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 1511 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.24920684099197388, + "epoch": 3.9789473684210526, + "grad_norm": 0.0016033814754337072, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 280.58203125, + "completions/mean_terminated_length": 280.58203125, + "completions/min_length": 163.0, + 
"completions/min_terminated_length": 163.0, + "entropy": 0.24790653586387634, + "epoch": 3.981578947368421, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0030526912305504084, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 296541709.0, + "reward": 0.8328123092651367, + "reward_std": 0.06292681396007538, + "rewards/progression_diversity/mean": -1.7364176528644748e-05, + "rewards/progression_diversity/std": 0.000392906425986439, + "rewards/symbolic_reward_accuracy/mean": 0.90234375, + "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, + "rewards/symbolic_reward_partial_score/mean": 0.9713541865348816, + "rewards/symbolic_reward_partial_score/std": 0.09234537184238434, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0622069835662842, + "sampling/importance_sampling_ratio/min": 0.0017897688085213304, + "sampling/sampling_logp_difference/max": 6.325668811798096, + "sampling/sampling_logp_difference/mean": 0.12474418431520462, + "step": 1513 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.2466568797826767, + "epoch": 3.984210526315789, + "grad_norm": 0.0044132862240076065, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1514 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2421964779496193, + "epoch": 3.986842105263158, + "grad_norm": 0.003211502218618989, + "learning_rate": 1e-06, + "loss": 0.0016, + "step": 1515 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.24317516386508942, + 
"epoch": 3.9894736842105263, + "grad_norm": 0.0014716348377987742, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 281.439453125, + "completions/mean_terminated_length": 281.439453125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.24056368321180344, + "epoch": 3.9921052631578946, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.0032884979154914618, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 297095630.0, + "reward": 0.8452636003494263, + "reward_std": 0.03771102428436279, + "rewards/progression_diversity/mean": -1.1692754924297333e-05, + "rewards/progression_diversity/std": 0.00026457683998160064, + "rewards/symbolic_reward_accuracy/mean": 0.92578125, + "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, + "rewards/symbolic_reward_partial_score/mean": 0.9659830331802368, + "rewards/symbolic_reward_partial_score/std": 0.1378202587366104, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0619913339614868, + "sampling/importance_sampling_ratio/min": 0.0008337947074323893, + "sampling/sampling_logp_difference/max": 7.0895233154296875, + "sampling/sampling_logp_difference/mean": 0.12588346004486084, + "step": 1517 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.2486308068037033, + "epoch": 3.9947368421052634, + "grad_norm": 0.0011405708501115441, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1518 + }, + { + 
"clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.24828965216875076, + "epoch": 3.9973684210526317, + "grad_norm": 0.002011633710935712, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1519 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.2471499964594841, + "epoch": 4.0, + "grad_norm": 0.0017826375551521778, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1520 + }, + { + "epoch": 4.0, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0009765625, + "eval_completions/max_length": 1970.21875, + "eval_completions/max_terminated_length": 543.8125, + "eval_completions/mean_length": 294.76171875, + "eval_completions/mean_terminated_length": 279.04593563079834, + "eval_completions/min_length": 161.0625, + "eval_completions/min_terminated_length": 161.0625, + "eval_entropy": 0.2479579197242856, + "eval_frac_reward_zero_std": 0.79296875, + "eval_loss": 0.0009922023164108396, + "eval_num_tokens": 297095630.0, + "eval_reward": 0.8440247587859631, + "eval_reward_std": 0.04477399826329709, + "eval_rewards/progression_diversity/mean": -0.0006040489212182365, + "eval_rewards/progression_diversity/std": 0.005385569964801107, + "eval_rewards/symbolic_reward_accuracy/mean": 0.9228515625, + "eval_rewards/symbolic_reward_accuracy/std": 0.2023696736432612, + "eval_rewards/symbolic_reward_partial_score/mean": 0.9701741561293602, + "eval_rewards/symbolic_reward_partial_score/std": 0.08667883859016001, + "eval_rewards/tag_count_reward/mean": -0.00732421875, + "eval_rewards/tag_count_reward/std": 0.03229685686528683, + "eval_runtime": 251.604, + 
"eval_samples_per_second": 0.994, + "eval_sampling/importance_sampling_ratio/max": 2.0, + "eval_sampling/importance_sampling_ratio/mean": 1.061691664159298, + "eval_sampling/importance_sampling_ratio/min": 0.004686991564540222, + "eval_sampling/sampling_logp_difference/max": 14.263333037495613, + "eval_sampling/sampling_logp_difference/mean": 0.12779978360049427, + "eval_steps_per_second": 0.008, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 285.51171875, + "completions/mean_terminated_length": 285.51171875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.24478785693645477, + "epoch": 4.002631578947368, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.007620229385793209, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 297658292.0, + "reward": 0.7886228561401367, + "reward_std": 0.08386802673339844, + "rewards/progression_diversity/mean": -1.9863946363329887e-05, + "rewards/progression_diversity/std": 0.00044946977868676186, + "rewards/symbolic_reward_accuracy/mean": 0.84765625, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.9334310293197632, + "rewards/symbolic_reward_partial_score/std": 0.16388244926929474, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0625085830688477, + "sampling/importance_sampling_ratio/min": 0.0003958561283070594, + "sampling/sampling_logp_difference/max": 7.8344597816467285, + "sampling/sampling_logp_difference/mean": 0.1256425380706787, + "step": 1521 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 
0.09375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.2409704327583313, + "epoch": 4.005263157894737, + "grad_norm": 0.0029928318690508604, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1522 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.24776104092597961, + "epoch": 4.007894736842105, + "grad_norm": 0.009631451219320297, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1523 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.24515005946159363, + "epoch": 4.010526315789473, + "grad_norm": 0.002499483060091734, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 281.181640625, + "completions/mean_terminated_length": 281.181640625, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.24500173330307007, + "epoch": 4.0131578947368425, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.0030381688848137856, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 298184369.0, + "reward": 0.866503119468689, + "reward_std": 0.03898514062166214, + "rewards/progression_diversity/mean": -8.544186130166054e-05, + "rewards/progression_diversity/std": 0.001358157955110073, + "rewards/symbolic_reward_accuracy/mean": 0.953125, + "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, + "rewards/symbolic_reward_partial_score/mean": 0.9820963740348816, + 
"rewards/symbolic_reward_partial_score/std": 0.08295471221208572, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0618839263916016, + "sampling/importance_sampling_ratio/min": 0.00013820610183756799, + "sampling/sampling_logp_difference/max": 8.886764526367188, + "sampling/sampling_logp_difference/mean": 0.12790223956108093, + "step": 1525 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.2474457398056984, + "epoch": 4.015789473684211, + "grad_norm": 0.002514457330107689, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1526 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.24642843008041382, + "epoch": 4.018421052631579, + "grad_norm": 0.0025110999122262, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 1527 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.2506309896707535, + "epoch": 4.021052631578947, + "grad_norm": 0.004548352677375078, + "learning_rate": 1e-06, + "loss": 0.0019, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 284.740234375, + "completions/mean_terminated_length": 284.740234375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.2474454790353775, + "epoch": 4.023684210526316, 
+ "frac_reward_zero_std": 0.84375, + "grad_norm": 0.003918915521353483, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 298728972.0, + "reward": 0.8486324548721313, + "reward_std": 0.028091946616768837, + "rewards/progression_diversity/mean": -4.3650590669130906e-05, + "rewards/progression_diversity/std": 0.0009877000702545047, + "rewards/symbolic_reward_accuracy/mean": 0.921875, + "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, + "rewards/symbolic_reward_partial_score/mean": 0.9850260019302368, + "rewards/symbolic_reward_partial_score/std": 0.055599283427000046, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0621516704559326, + "sampling/importance_sampling_ratio/min": 0.0001810384273994714, + "sampling/sampling_logp_difference/max": 8.616801261901855, + "sampling/sampling_logp_difference/mean": 0.12489888072013855, + "step": 1529 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.23918018490076065, + "epoch": 4.026315789473684, + "grad_norm": 0.002114385599270463, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 1530 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.24100320041179657, + "epoch": 4.028947368421052, + "grad_norm": 0.000925762637052685, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 1531 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2428521066904068, + "epoch": 4.031578947368421, + "grad_norm": 0.0010113362222909927, + "learning_rate": 1e-06, + "loss": -0.0006, + 
"step": 1532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 283.44140625, + "completions/mean_terminated_length": 283.44140625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.24821541458368301, + "epoch": 4.03421052631579, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.003675073618069291, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 299262222.0, + "reward": 0.8415029644966125, + "reward_std": 0.03057749569416046, + "rewards/progression_diversity/mean": -0.00010043365909950808, + "rewards/progression_diversity/std": 0.0022725542075932026, + "rewards/symbolic_reward_accuracy/mean": 0.916015625, + "rewards/symbolic_reward_accuracy/std": 0.2776356339454651, + "rewards/symbolic_reward_partial_score/mean": 0.9729817509651184, + "rewards/symbolic_reward_partial_score/std": 0.10110417753458023, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0609973669052124, + "sampling/importance_sampling_ratio/min": 0.0004463788354769349, + "sampling/sampling_logp_difference/max": 7.7143425941467285, + "sampling/sampling_logp_difference/mean": 0.12613673508167267, + "step": 1533 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2426435500383377, + "epoch": 4.036842105263158, + "grad_norm": 0.0008600465371273458, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1534 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0078125, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.24244511872529984, + "epoch": 4.0394736842105265, + "grad_norm": 0.0012452512746676803, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.24515938013792038, + "epoch": 4.042105263157895, + "grad_norm": 0.004650202114135027, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 282.837890625, + "completions/mean_terminated_length": 282.837890625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.2445775270462036, + "epoch": 4.044736842105263, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.000899067847058177, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 299790811.0, + "reward": 0.8743164539337158, + "reward_std": 0.02148437686264515, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9609375, + "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, + "rewards/symbolic_reward_partial_score/mean": 0.9925130009651184, + "rewards/symbolic_reward_partial_score/std": 0.040022555738687515, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.061580777168274, + "sampling/importance_sampling_ratio/min": 0.0009474054677411914, + "sampling/sampling_logp_difference/max": 6.961783409118652, + 
"sampling/sampling_logp_difference/mean": 0.12686872482299805, + "step": 1537 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.2427464723587036, + "epoch": 4.0473684210526315, + "grad_norm": 0.0006039505824446678, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1538 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.24190915375947952, + "epoch": 4.05, + "grad_norm": 0.0006737467483617365, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1539 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.24564451724290848, + "epoch": 4.052631578947368, + "grad_norm": 0.0006470574298873544, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 286.171875, + "completions/mean_terminated_length": 286.171875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.24494989961385727, + "epoch": 4.0552631578947365, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.00690782256424427, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 300320787.0, + "reward": 0.8101066946983337, + "reward_std": 0.04871155321598053, + "rewards/progression_diversity/mean": -7.603035192005336e-05, + "rewards/progression_diversity/std": 0.0017203704919666052, + "rewards/symbolic_reward_accuracy/mean": 0.87109375, + 
"rewards/symbolic_reward_accuracy/std": 0.33542385697364807, + "rewards/symbolic_reward_partial_score/mean": 0.9581705331802368, + "rewards/symbolic_reward_partial_score/std": 0.11796627193689346, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0622797012329102, + "sampling/importance_sampling_ratio/min": 0.00044912920566275716, + "sampling/sampling_logp_difference/max": 7.708199977874756, + "sampling/sampling_logp_difference/mean": 0.12562555074691772, + "step": 1541 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.24104497581720352, + "epoch": 4.057894736842106, + "grad_norm": 0.0021957457065582275, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1542 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.250149168074131, + "epoch": 4.060526315789474, + "grad_norm": 0.0024236314930021763, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1543 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.24035638570785522, + "epoch": 4.063157894736842, + "grad_norm": 0.002043983433395624, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 281.3984375, + "completions/mean_terminated_length": 281.3984375, + 
"completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.24575192481279373, + "epoch": 4.065789473684211, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0030977753922343254, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 300856319.0, + "reward": 0.7493163347244263, + "reward_std": 0.05255415290594101, + "rewards/progression_diversity/mean": -1.282830362470122e-05, + "rewards/progression_diversity/std": 0.0002902713604271412, + "rewards/symbolic_reward_accuracy/mean": 0.78515625, + "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, + "rewards/symbolic_reward_partial_score/mean": 0.9274088144302368, + "rewards/symbolic_reward_partial_score/std": 0.1551593393087387, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.06221604347229, + "sampling/importance_sampling_ratio/min": 0.0009545637876726687, + "sampling/sampling_logp_difference/max": 6.954256057739258, + "sampling/sampling_logp_difference/mean": 0.12480530142784119, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.2471490129828453, + "epoch": 4.068421052631579, + "grad_norm": 0.007246929686516523, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1546 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.2453026995062828, + "epoch": 4.071052631578947, + "grad_norm": 0.0015278997598215938, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1547 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 
0.2455468699336052, + "epoch": 4.073684210526316, + "grad_norm": 0.006542537361383438, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 282.92578125, + "completions/mean_terminated_length": 282.92578125, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.2415642812848091, + "epoch": 4.076315789473684, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.0035267826169729233, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 301425337.0, + "reward": 0.8650867938995361, + "reward_std": 0.03978986665606499, + "rewards/progression_diversity/mean": -0.00011109215120086446, + "rewards/progression_diversity/std": 0.0012798713287338614, + "rewards/symbolic_reward_accuracy/mean": 0.94921875, + "rewards/symbolic_reward_accuracy/std": 0.21976542472839355, + "rewards/symbolic_reward_partial_score/mean": 0.9851887822151184, + "rewards/symbolic_reward_partial_score/std": 0.07398899644613266, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.060722827911377, + "sampling/importance_sampling_ratio/min": 0.00038356988807208836, + "sampling/sampling_logp_difference/max": 7.865988731384277, + "sampling/sampling_logp_difference/mean": 0.1256132274866104, + "step": 1549 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.24005647003650665, + "epoch": 4.078947368421052, + "grad_norm": 0.0025566022377461195, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 
1550 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.24505149573087692, + "epoch": 4.081578947368421, + "grad_norm": 0.0026614954695105553, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1551 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.24144987761974335, + "epoch": 4.08421052631579, + "grad_norm": 0.002677714917808771, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 285.47265625, + "completions/mean_terminated_length": 285.47265625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.24059565365314484, + "epoch": 4.086842105263158, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.004622668959200382, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 301972299.0, + "reward": 0.8511713743209839, + "reward_std": 0.07054747641086578, + "rewards/progression_diversity/mean": -5.1332837756490335e-05, + "rewards/progression_diversity/std": 0.0011615295661613345, + "rewards/symbolic_reward_accuracy/mean": 0.9296875, + "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, + "rewards/symbolic_reward_partial_score/mean": 0.9778646230697632, + "rewards/symbolic_reward_partial_score/std": 0.0910392627120018, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0624241828918457, + 
"sampling/importance_sampling_ratio/min": 7.295302202692255e-05, + "sampling/sampling_logp_difference/max": 9.525694847106934, + "sampling/sampling_logp_difference/mean": 0.12439781427383423, + "step": 1553 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.24092917144298553, + "epoch": 4.089473684210526, + "grad_norm": 0.0032965033315122128, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1554 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.24477685242891312, + "epoch": 4.092105263157895, + "grad_norm": 0.004428436979651451, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1555 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.24361905455589294, + "epoch": 4.094736842105263, + "grad_norm": 0.003907477483153343, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 320.65625, + "completions/mean_terminated_length": 289.22113037109375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.2397397980093956, + "epoch": 4.097368421052631, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0034273923374712467, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 302529755.0, + "reward": 0.8220639824867249, + "reward_std": 0.04169125109910965, + 
"rewards/progression_diversity/mean": -0.0006369180628098547, + "rewards/progression_diversity/std": 0.014411810785531998, + "rewards/symbolic_reward_accuracy/mean": 0.896484375, + "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, + "rewards/symbolic_reward_partial_score/mean": 0.9479166269302368, + "rewards/symbolic_reward_partial_score/std": 0.17478591203689575, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0619392395019531, + "sampling/importance_sampling_ratio/min": 0.0002244680217700079, + "sampling/sampling_logp_difference/max": 8.401777267456055, + "sampling/sampling_logp_difference/mean": 0.12284423410892487, + "step": 1557 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.24551428854465485, + "epoch": 4.1, + "grad_norm": 0.0029721097089350224, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1558 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2473151981830597, + "epoch": 4.102631578947369, + "grad_norm": 0.0037502862978726625, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1559 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.24894528090953827, + "epoch": 4.105263157894737, + "grad_norm": 0.004042802378535271, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 
0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 290.775390625, + "completions/mean_terminated_length": 290.775390625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.2489028424024582, + "epoch": 4.1078947368421055, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.004036257974803448, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 303060744.0, + "reward": 0.8253406882286072, + "reward_std": 0.043370090425014496, + "rewards/progression_diversity/mean": -0.00011065526632592082, + "rewards/progression_diversity/std": 0.0018014537636190653, + "rewards/symbolic_reward_accuracy/mean": 0.890625, + "rewards/symbolic_reward_accuracy/std": 0.31241437792778015, + "rewards/symbolic_reward_partial_score/mean": 0.9698892831802368, + "rewards/symbolic_reward_partial_score/std": 0.09768425673246384, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0613582134246826, + "sampling/importance_sampling_ratio/min": 1.6307029000017792e-05, + "sampling/sampling_logp_difference/max": 11.023914337158203, + "sampling/sampling_logp_difference/mean": 0.12509672343730927, + "step": 1561 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.24281351268291473, + "epoch": 4.110526315789474, + "grad_norm": 0.004401057027280331, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 1562 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.24352271109819412, + "epoch": 4.113157894736842, + "grad_norm": 0.002822014968842268, + "learning_rate": 1e-06, + "loss": -0.0001, + 
"step": 1563 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.2425449937582016, + "epoch": 4.11578947368421, + "grad_norm": 0.0014352428261190653, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 291.498046875, + "completions/mean_terminated_length": 291.498046875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.24744796752929688, + "epoch": 4.118421052631579, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.008502251468598843, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 303627815.0, + "reward": 0.7929680347442627, + "reward_std": 0.04630649834871292, + "rewards/progression_diversity/mean": -7.568293221993372e-05, + "rewards/progression_diversity/std": 0.001712509198114276, + "rewards/symbolic_reward_accuracy/mean": 0.85546875, + "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, + "rewards/symbolic_reward_partial_score/mean": 0.9322916865348816, + "rewards/symbolic_reward_partial_score/std": 0.17656517028808594, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0619150400161743, + "sampling/importance_sampling_ratio/min": 4.7659916162956506e-05, + "sampling/sampling_logp_difference/max": 9.951419830322266, + "sampling/sampling_logp_difference/mean": 0.12444620579481125, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0234375, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.2407190129160881, + "epoch": 4.121052631578947, + "grad_norm": 0.0018108681542798877, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1566 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.24351371824741364, + "epoch": 4.123684210526315, + "grad_norm": 0.0014526962768286467, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1567 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.24052952975034714, + "epoch": 4.126315789473685, + "grad_norm": 0.0031579281203448772, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 292.951171875, + "completions/mean_terminated_length": 292.951171875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.2511454373598099, + "epoch": 4.128947368421053, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.0025576853659003973, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 304146926.0, + "reward": 0.8509761691093445, + "reward_std": 0.013778343796730042, + "rewards/progression_diversity/mean": -4.358722799224779e-05, + "rewards/progression_diversity/std": 0.0006232060259208083, + "rewards/symbolic_reward_accuracy/mean": 0.9296875, + "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, + "rewards/symbolic_reward_partial_score/mean": 0.9772135615348816, + 
"rewards/symbolic_reward_partial_score/std": 0.08322879672050476, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0625452995300293, + "sampling/importance_sampling_ratio/min": 0.004068870563060045, + "sampling/sampling_logp_difference/max": 5.504389762878418, + "sampling/sampling_logp_difference/mean": 0.12552489340305328, + "step": 1569 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.24181534349918365, + "epoch": 4.131578947368421, + "grad_norm": 0.0026575892698019743, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1570 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.24096699804067612, + "epoch": 4.13421052631579, + "grad_norm": 0.0008206532220356166, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1571 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.24758878350257874, + "epoch": 4.136842105263158, + "grad_norm": 0.00010886161908274516, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 294.57421875, + "completions/mean_terminated_length": 294.57421875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "entropy": 0.24195092916488647, + "epoch": 
4.139473684210526, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.006056688260287046, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 304681204.0, + "reward": 0.8438476324081421, + "reward_std": 0.04485977441072464, + "rewards/progression_diversity/mean": -7.551363069069339e-06, + "rewards/progression_diversity/std": 0.00017086784646380693, + "rewards/symbolic_reward_accuracy/mean": 0.916015625, + "rewards/symbolic_reward_accuracy/std": 0.2776356339454651, + "rewards/symbolic_reward_partial_score/mean": 0.9807943105697632, + "rewards/symbolic_reward_partial_score/std": 0.06602808088064194, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0608805418014526, + "sampling/importance_sampling_ratio/min": 0.002050866838544607, + "sampling/sampling_logp_difference/max": 6.189492702484131, + "sampling/sampling_logp_difference/mean": 0.1235751360654831, + "step": 1573 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.2409624606370926, + "epoch": 4.1421052631578945, + "grad_norm": 0.0029652619268745184, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1574 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.24594729393720627, + "epoch": 4.144736842105263, + "grad_norm": 0.0016559745417907834, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1575 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.24038663506507874, + "epoch": 4.147368421052631, + "grad_norm": 0.002225496107712388, + "learning_rate": 1e-06, + 
"loss": -0.0003, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 289.96875, + "completions/mean_terminated_length": 289.96875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.24104474484920502, + "epoch": 4.15, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.0045051174238324165, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 305244484.0, + "reward": 0.8866699934005737, + "reward_std": 0.03702239692211151, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.982421875, + "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, + "rewards/symbolic_reward_partial_score/mean": 0.99072265625, + "rewards/symbolic_reward_partial_score/std": 0.07524821907281876, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.059924602508545, + "sampling/importance_sampling_ratio/min": 0.0008238049340434372, + "sampling/sampling_logp_difference/max": 7.101576805114746, + "sampling/sampling_logp_difference/mean": 0.12273959070444107, + "step": 1577 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.23388678580522537, + "epoch": 4.152631578947369, + "grad_norm": 0.0015148852253332734, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1578 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
0.0625, + "entropy": 0.24179504066705704, + "epoch": 4.155263157894737, + "grad_norm": 0.00393514521420002, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1579 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.24486732482910156, + "epoch": 4.157894736842105, + "grad_norm": 0.0008323483052663505, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 324.146484375, + "completions/mean_terminated_length": 292.71820068359375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "entropy": 0.2383657991886139, + "epoch": 4.160526315789474, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.007527621928602457, + "learning_rate": 1e-06, + "loss": 0.0112, + "num_tokens": 305845135.0, + "reward": 0.7925710082054138, + "reward_std": 0.04505772143602371, + "rewards/progression_diversity/mean": -0.0007171723991632462, + "rewards/progression_diversity/std": 0.01562389824539423, + "rewards/symbolic_reward_accuracy/mean": 0.845703125, + "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, + "rewards/symbolic_reward_partial_score/mean": 0.951171875, + "rewards/symbolic_reward_partial_score/std": 0.12982474267482758, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0593748092651367, + "sampling/importance_sampling_ratio/min": 4.811527674064564e-07, + "sampling/sampling_logp_difference/max": 14.547080993652344, + 
"sampling/sampling_logp_difference/mean": 0.11983726918697357, + "step": 1581 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.23435619473457336, + "epoch": 4.163157894736842, + "grad_norm": 0.0012159182224422693, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1582 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.23912250995635986, + "epoch": 4.16578947368421, + "grad_norm": 0.0016524328384548426, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1583 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.24548863619565964, + "epoch": 4.168421052631579, + "grad_norm": 0.0025580108631402254, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 317.92578125, + "completions/mean_terminated_length": 286.4853210449219, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.24263640493154526, + "epoch": 4.171052631578948, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.0040906872600317, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 306410377.0, + "reward": 0.8318202495574951, + "reward_std": 0.013329317793250084, + "rewards/progression_diversity/mean": -0.0015740538947284222, + "rewards/progression_diversity/std": 0.03561677411198616, + 
"rewards/symbolic_reward_accuracy/mean": 0.90234375, + "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, + "rewards/symbolic_reward_partial_score/mean": 0.9680989384651184, + "rewards/symbolic_reward_partial_score/std": 0.10613958537578583, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0580620765686035, + "sampling/importance_sampling_ratio/min": 1.972946165551548e-06, + "sampling/sampling_logp_difference/max": 13.135982513427734, + "sampling/sampling_logp_difference/mean": 0.12163875997066498, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.23799894750118256, + "epoch": 4.173684210526316, + "grad_norm": 0.0008383460226468742, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1586 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.23699834197759628, + "epoch": 4.176315789473684, + "grad_norm": 0.0012354847276583314, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1587 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.23717937618494034, + "epoch": 4.178947368421053, + "grad_norm": 0.000985459191724658, + "learning_rate": 1e-06, + "loss": 0.0277, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 287.171875, + 
"completions/mean_terminated_length": 287.171875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.24291031062602997, + "epoch": 4.181578947368421, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0010142133105546236, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 306943873.0, + "reward": 0.7869137525558472, + "reward_std": 0.011718656867742538, + "rewards/progression_diversity/mean": -3.545811341609806e-05, + "rewards/progression_diversity/std": 0.0008023255504667759, + "rewards/symbolic_reward_accuracy/mean": 0.83984375, + "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, + "rewards/symbolic_reward_partial_score/mean": 0.943359375, + "rewards/symbolic_reward_partial_score/std": 0.14044690132141113, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0599687099456787, + "sampling/importance_sampling_ratio/min": 6.863525049993768e-05, + "sampling/sampling_logp_difference/max": 9.58670425415039, + "sampling/sampling_logp_difference/mean": 0.1256382167339325, + "step": 1589 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2450830414891243, + "epoch": 4.184210526315789, + "grad_norm": 0.0008929637842811644, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2389954999089241, + "epoch": 4.186842105263158, + "grad_norm": 0.0004206775629427284, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1591 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0234375, + "entropy": 0.24132828414440155, + "epoch": 4.189473684210526, + "grad_norm": 0.000518023211043328, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 286.953125, + "completions/mean_terminated_length": 286.953125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.2418007254600525, + "epoch": 4.192105263157894, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0034422457683831453, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 307499113.0, + "reward": 0.8122069835662842, + "reward_std": 0.018359720706939697, + "rewards/progression_diversity/mean": -8.605420589447021e-06, + "rewards/progression_diversity/std": 0.00013983974349685013, + "rewards/symbolic_reward_accuracy/mean": 0.884765625, + "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, + "rewards/symbolic_reward_partial_score/mean": 0.9378255605697632, + "rewards/symbolic_reward_partial_score/std": 0.1756001114845276, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.062021017074585, + "sampling/importance_sampling_ratio/min": 9.0235989773646e-06, + "sampling/sampling_logp_difference/max": 11.615667343139648, + "sampling/sampling_logp_difference/mean": 0.12551844120025635, + "step": 1593 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.24229955673217773, + "epoch": 4.1947368421052635, + "grad_norm": 0.0018708358984440565, + "learning_rate": 
1e-06, + "loss": -0.0, + "step": 1594 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.24395088106393814, + "epoch": 4.197368421052632, + "grad_norm": 0.0030785081908106804, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1595 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.23724327981472015, + "epoch": 4.2, + "grad_norm": 0.001542367972433567, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 321.71875, + "completions/mean_terminated_length": 290.28570556640625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.24105414748191833, + "epoch": 4.2026315789473685, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0008341679349541664, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 308087993.0, + "reward": 0.7685401439666748, + "reward_std": 0.012891988269984722, + "rewards/progression_diversity/mean": -0.0014598432462662458, + "rewards/progression_diversity/std": 0.031166821718215942, + "rewards/symbolic_reward_accuracy/mean": 0.8125, + "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, + "rewards/symbolic_reward_partial_score/mean": 0.9375, + "rewards/symbolic_reward_partial_score/std": 0.1478261649608612, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.060511827468872, + "sampling/importance_sampling_ratio/min": 0.00011906772124348208, + "sampling/sampling_logp_difference/max": 9.035818099975586, + "sampling/sampling_logp_difference/mean": 0.12241839617490768, + "step": 1597 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.2469538226723671, + "epoch": 4.205263157894737, + "grad_norm": 0.0006272746832109988, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1598 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.23778608441352844, + "epoch": 4.207894736842105, + "grad_norm": 0.0050039771012961864, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1599 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2375095784664154, + "epoch": 4.2105263157894735, + "grad_norm": 0.0005078950780443847, + "learning_rate": 1e-06, + "loss": 0.0292, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 290.8203125, + "completions/mean_terminated_length": 290.8203125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.2439725548028946, + "epoch": 4.213157894736842, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.00519453315064311, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 308643709.0, + "reward": 0.7693848013877869, + 
"reward_std": 0.023179786279797554, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.806640625, + "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, + "rewards/symbolic_reward_partial_score/mean": 0.9513345956802368, + "rewards/symbolic_reward_partial_score/std": 0.11196061223745346, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0619571208953857, + "sampling/importance_sampling_ratio/min": 0.001721260487101972, + "sampling/sampling_logp_difference/max": 6.36469841003418, + "sampling/sampling_logp_difference/mean": 0.12522411346435547, + "step": 1601 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.24038860201835632, + "epoch": 4.215789473684211, + "grad_norm": 0.0011393935419619083, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1602 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.24182245135307312, + "epoch": 4.218421052631579, + "grad_norm": 0.0009637880139052868, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1603 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.24568534642457962, + "epoch": 4.221052631578948, + "grad_norm": 0.004586696624755859, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 1604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 290.912109375, + "completions/mean_terminated_length": 290.912109375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.24403854459524155, + "epoch": 4.223684210526316, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.0022931001149117947, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 309202480.0, + "reward": 0.8104981184005737, + "reward_std": 0.032899536192417145, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.876953125, + "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, + "rewards/symbolic_reward_partial_score/mean": 0.94775390625, + "rewards/symbolic_reward_partial_score/std": 0.14531104266643524, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0620423555374146, + "sampling/importance_sampling_ratio/min": 1.6925074305618182e-05, + "sampling/sampling_logp_difference/max": 10.986714363098145, + "sampling/sampling_logp_difference/mean": 0.1268502175807953, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2432192713022232, + "epoch": 4.226315789473684, + "grad_norm": 0.0034322640858590603, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1606 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2459331378340721, + "epoch": 4.228947368421053, + "grad_norm": 0.00626370171085, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1607 + }, + { + "clip_ratio/high_max": 1.0, + 
"clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.24618712812662125, + "epoch": 4.231578947368421, + "grad_norm": 0.0018219529883936048, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 285.6875, + "completions/mean_terminated_length": 285.6875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.24345583468675613, + "epoch": 4.234210526315789, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.003988837357610464, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 309748208.0, + "reward": 0.87353515625, + "reward_std": 0.031940147280693054, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.962890625, + "rewards/symbolic_reward_accuracy/std": 0.18921469151973724, + "rewards/symbolic_reward_partial_score/mean": 0.9860026240348816, + "rewards/symbolic_reward_partial_score/std": 0.0735032856464386, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0620702505111694, + "sampling/importance_sampling_ratio/min": 0.00010400653991382569, + "sampling/sampling_logp_difference/max": 9.171056747436523, + "sampling/sampling_logp_difference/mean": 0.12525449693202972, + "step": 1609 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2439560666680336, + "epoch": 
4.2368421052631575, + "grad_norm": 0.005749448202550411, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1610 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.242917962372303, + "epoch": 4.239473684210527, + "grad_norm": 0.0013718483969569206, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1611 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.24474342167377472, + "epoch": 4.242105263157895, + "grad_norm": 0.0005026358412578702, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 288.400390625, + "completions/mean_terminated_length": 288.400390625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.24127425998449326, + "epoch": 4.244736842105263, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.0018319800728932023, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 310305693.0, + "reward": 0.8312011957168579, + "reward_std": 0.03613283112645149, + "rewards/progression_diversity/mean": -4.250822485118988e-07, + "rewards/progression_diversity/std": 9.618513104214799e-06, + "rewards/symbolic_reward_accuracy/mean": 0.90625, + "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, + "rewards/symbolic_reward_partial_score/mean": 0.9581705331802368, + "rewards/symbolic_reward_partial_score/std": 0.13332508504390717, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.061629295349121, + "sampling/importance_sampling_ratio/min": 0.0001460511703044176, + "sampling/sampling_logp_difference/max": 8.83155345916748, + "sampling/sampling_logp_difference/mean": 0.12399622797966003, + "step": 1613 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.24228737503290176, + "epoch": 4.247368421052632, + "grad_norm": 0.0009619845659472048, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1614 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.23787134885787964, + "epoch": 4.25, + "grad_norm": 0.0031648194417357445, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.24200475215911865, + "epoch": 4.252631578947368, + "grad_norm": 0.0008313195430673659, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.0, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 285.142578125, + "completions/mean_terminated_length": 285.142578125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.23958918452262878, + "epoch": 4.255263157894737, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0013227357994765043, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 310838470.0, + 
"reward": 0.8195312023162842, + "reward_std": 0.00983283668756485, + "rewards/progression_diversity/mean": -6.325509730231715e-06, + "rewards/progression_diversity/std": 0.0001431299460818991, + "rewards/symbolic_reward_accuracy/mean": 0.8828125, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.9661458134651184, + "rewards/symbolic_reward_partial_score/std": 0.10655689239501953, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.060362696647644, + "sampling/importance_sampling_ratio/min": 0.00016181122919078916, + "sampling/sampling_logp_difference/max": 8.729080200195312, + "sampling/sampling_logp_difference/mean": 0.12306272983551025, + "step": 1617 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2417643666267395, + "epoch": 4.257894736842105, + "grad_norm": 0.001047020428813994, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.23926913738250732, + "epoch": 4.260526315789473, + "grad_norm": 0.0005425025010481477, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1619 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.24134934693574905, + "epoch": 4.2631578947368425, + "grad_norm": 2.3157368559623137e-05, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 279.1875, + "completions/mean_terminated_length": 279.1875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.24217140674591064, + "epoch": 4.265789473684211, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.008381368592381477, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 311348166.0, + "reward": 0.8707519769668579, + "reward_std": 0.02324218861758709, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9609375, + "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, + "rewards/symbolic_reward_partial_score/mean": 0.9806314706802368, + "rewards/symbolic_reward_partial_score/std": 0.10037314146757126, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.059480905532837, + "sampling/importance_sampling_ratio/min": 0.0010475642047822475, + "sampling/sampling_logp_difference/max": 6.861287593841553, + "sampling/sampling_logp_difference/mean": 0.12341621518135071, + "step": 1621 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2414199411869049, + "epoch": 4.268421052631579, + "grad_norm": 0.0014712604461237788, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1622 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.24441535025835037, + "epoch": 4.271052631578947, + "grad_norm": 0.001006963080726564, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 
1623 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.23892833292484283, + "epoch": 4.273684210526316, + "grad_norm": 0.0006697697681374848, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 279.873046875, + "completions/mean_terminated_length": 279.873046875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.2365439236164093, + "epoch": 4.276315789473684, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.0022215263452380896, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 311921157.0, + "reward": 0.8255848288536072, + "reward_std": 0.0127988550812006, + "rewards/progression_diversity/mean": -0.00011174208339070901, + "rewards/progression_diversity/std": 0.002458578674122691, + "rewards/symbolic_reward_accuracy/mean": 0.89453125, + "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, + "rewards/symbolic_reward_partial_score/mean": 0.962890625, + "rewards/symbolic_reward_partial_score/std": 0.1252562403678894, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0597436428070068, + "sampling/importance_sampling_ratio/min": 0.003232834627851844, + "sampling/sampling_logp_difference/max": 5.734395980834961, + "sampling/sampling_logp_difference/mean": 0.11962610483169556, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0390625, + "entropy": 0.23193275928497314, + "epoch": 4.278947368421052, + "grad_norm": 0.0013880907790735364, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1626 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2370048388838768, + "epoch": 4.281578947368421, + "grad_norm": 0.0017254366539418697, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1627 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.24163448810577393, + "epoch": 4.284210526315789, + "grad_norm": 0.002020586049184203, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 347.330078125, + "completions/mean_terminated_length": 284.4411926269531, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.23737385869026184, + "epoch": 4.286842105263158, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0008498894167132676, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 312490446.0, + "reward": 0.8547199368476868, + "reward_std": 0.013046052306890488, + "rewards/progression_diversity/mean": -0.0016431687399744987, + "rewards/progression_diversity/std": 0.0253975298255682, + "rewards/symbolic_reward_accuracy/mean": 0.93359375, + "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, + "rewards/symbolic_reward_partial_score/mean": 0.9832357168197632, + "rewards/symbolic_reward_partial_score/std": 0.07629074156284332, + 
"rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0584070682525635, + "sampling/importance_sampling_ratio/min": 5.360832915357605e-07, + "sampling/sampling_logp_difference/max": 14.438976287841797, + "sampling/sampling_logp_difference/mean": 0.1209835484623909, + "step": 1629 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2417454794049263, + "epoch": 4.2894736842105265, + "grad_norm": 0.0014678961597383022, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.24222751706838608, + "epoch": 4.292105263157895, + "grad_norm": 0.0007775401463732123, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1631 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.23408235609531403, + "epoch": 4.294736842105263, + "grad_norm": 0.00032593103242106736, + "learning_rate": 1e-06, + "loss": 0.0038, + "step": 1632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 279.724609375, + "completions/mean_terminated_length": 279.724609375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.23920771479606628, + "epoch": 4.2973684210526315, + "frac_reward_zero_std": 0.875, + 
"grad_norm": 0.0023012347519397736, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 313035969.0, + "reward": 0.7753905057907104, + "reward_std": 0.024877512827515602, + "rewards/progression_diversity/mean": -1.7009941075230017e-05, + "rewards/progression_diversity/std": 0.00038489102735184133, + "rewards/symbolic_reward_accuracy/mean": 0.826171875, + "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, + "rewards/symbolic_reward_partial_score/mean": 0.9322916269302368, + "rewards/symbolic_reward_partial_score/std": 0.16144797205924988, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0600590705871582, + "sampling/importance_sampling_ratio/min": 0.00021802130504511297, + "sampling/sampling_logp_difference/max": 8.430917739868164, + "sampling/sampling_logp_difference/mean": 0.1208389550447464, + "step": 1633 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.23538841307163239, + "epoch": 4.3, + "grad_norm": 0.000836519175209105, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1634 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.23597903549671173, + "epoch": 4.302631578947368, + "grad_norm": 0.002698976546525955, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.23745371401309967, + "epoch": 4.3052631578947365, + "grad_norm": 0.0009186923853121698, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1636 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9394.0, + "completions/mean_length": 328.09375, + "completions/mean_terminated_length": 296.6731872558594, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.23953492939472198, + "epoch": 4.307894736842106, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.008782876655459404, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 313556529.0, + "reward": 0.8749912977218628, + "reward_std": 0.01434220839291811, + "rewards/progression_diversity/mean": -0.0008723714272491634, + "rewards/progression_diversity/std": 0.012109385803341866, + "rewards/symbolic_reward_accuracy/mean": 0.96484375, + "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, + "rewards/symbolic_reward_partial_score/mean": 0.9876302480697632, + "rewards/symbolic_reward_partial_score/std": 0.07861035317182541, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0582417249679565, + "sampling/importance_sampling_ratio/min": 2.171735104639083e-05, + "sampling/sampling_logp_difference/max": 10.737399101257324, + "sampling/sampling_logp_difference/mean": 0.12100666016340256, + "step": 1637 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.24021229147911072, + "epoch": 4.310526315789474, + "grad_norm": 0.001116728177294135, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1638 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.23746858537197113, + "epoch": 4.313157894736842, + "grad_norm": 0.0025934746954590082, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 1639 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2366245537996292, + "epoch": 4.315789473684211, + "grad_norm": 0.0006494184490293264, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 282.509765625, + "completions/mean_terminated_length": 282.509765625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.2368258759379387, + "epoch": 4.318421052631579, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 314122326.0, + "reward": 0.8359375, + "reward_std": 0.0, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.90625, + "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, + "rewards/symbolic_reward_partial_score/mean": 0.9739583730697632, + "rewards/symbolic_reward_partial_score/std": 0.08454757928848267, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.059388518333435, + "sampling/importance_sampling_ratio/min": 0.005422016140073538, + "sampling/sampling_logp_difference/max": 5.217287540435791, + "sampling/sampling_logp_difference/mean": 0.12114188820123672, + "step": 1641 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.23350204527378082, + "epoch": 4.321052631578947, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.23761139810085297, + "epoch": 4.323684210526316, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.23665422201156616, + "epoch": 4.326315789473684, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 281.400390625, + "completions/mean_terminated_length": 281.400390625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.2357463464140892, + "epoch": 4.328947368421053, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.002676774049177766, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 314671395.0, + "reward": 0.8441406488418579, + "reward_std": 0.023877985775470734, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.919921875, + "rewards/symbolic_reward_accuracy/std": 0.271679550409317, + "rewards/symbolic_reward_partial_score/mean": 0.9739583730697632, + "rewards/symbolic_reward_partial_score/std": 0.09836674481630325, + 
"rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0592198371887207, + "sampling/importance_sampling_ratio/min": 2.6666658214935524e-08, + "sampling/sampling_logp_difference/max": 17.439851760864258, + "sampling/sampling_logp_difference/mean": 0.12135563790798187, + "step": 1645 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.23633001744747162, + "epoch": 4.331578947368421, + "grad_norm": 0.0014993188669905066, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1646 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.23138535022735596, + "epoch": 4.33421052631579, + "grad_norm": 0.0018792420160025358, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1647 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.23762395977973938, + "epoch": 4.336842105263158, + "grad_norm": 0.0003342384588904679, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 285.7578125, + "completions/mean_terminated_length": 285.7578125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.23968391120433807, + "epoch": 4.339473684210526, + "frac_reward_zero_std": 0.84375, + "grad_norm": 
0.0011085503501817584, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 315219847.0, + "reward": 0.8956515789031982, + "reward_std": 0.017393916845321655, + "rewards/progression_diversity/mean": -0.0002775713801383972, + "rewards/progression_diversity/std": 0.004437287338078022, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.9972330331802368, + "rewards/symbolic_reward_partial_score/std": 0.04610797390341759, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0596669912338257, + "sampling/importance_sampling_ratio/min": 3.726242721313611e-05, + "sampling/sampling_logp_difference/max": 10.197525024414062, + "sampling/sampling_logp_difference/mean": 0.12217455357313156, + "step": 1649 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.23926863819360733, + "epoch": 4.342105263157895, + "grad_norm": 0.0010118326172232628, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.23653405904769897, + "epoch": 4.344736842105263, + "grad_norm": 0.0055463118478655815, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1651 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.24093815684318542, + "epoch": 4.347368421052631, + "grad_norm": 0.0011176273692399263, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1652 + }, + { + "clip_ratio/high_max": 
0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 284.4453125, + "completions/mean_terminated_length": 284.4453125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.23320885747671127, + "epoch": 4.35, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.003535026917234063, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 315769131.0, + "reward": 0.8701168298721313, + "reward_std": 0.02188323438167572, + "rewards/progression_diversity/mean": -4.30387444794178e-05, + "rewards/progression_diversity/std": 0.0009738556109368801, + "rewards/symbolic_reward_accuracy/mean": 0.95703125, + "rewards/symbolic_reward_accuracy/std": 0.2029850035905838, + "rewards/symbolic_reward_partial_score/mean": 0.986328125, + "rewards/symbolic_reward_partial_score/std": 0.06698986887931824, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.059899926185608, + "sampling/importance_sampling_ratio/min": 1.0019117553383694e-06, + "sampling/sampling_logp_difference/max": 13.813600540161133, + "sampling/sampling_logp_difference/mean": 0.12067143619060516, + "step": 1653 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.2401559054851532, + "epoch": 4.352631578947369, + "grad_norm": 0.001625710865482688, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1654 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 
0.24083483219146729, + "epoch": 4.355263157894737, + "grad_norm": 0.0013069827109575272, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1655 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.23248496651649475, + "epoch": 4.3578947368421055, + "grad_norm": 0.003529660403728485, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 1656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 283.091796875, + "completions/mean_terminated_length": 283.091796875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.23519539088010788, + "epoch": 4.360526315789474, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0036818196531385183, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 316333722.0, + "reward": 0.872850775718689, + "reward_std": 0.014050932601094246, + "rewards/progression_diversity/mean": -8.317209722008556e-05, + "rewards/progression_diversity/std": 0.0014961487613618374, + "rewards/symbolic_reward_accuracy/mean": 0.9609375, + "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, + "rewards/symbolic_reward_partial_score/mean": 0.9876302480697632, + "rewards/symbolic_reward_partial_score/std": 0.06307155638933182, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0588300228118896, + "sampling/importance_sampling_ratio/min": 3.670607475214638e-05, + "sampling/sampling_logp_difference/max": 10.212568283081055, + "sampling/sampling_logp_difference/mean": 0.12060007452964783, + 
"step": 1657 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.23568804562091827, + "epoch": 4.363157894736842, + "grad_norm": 0.003883509198203683, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1658 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.2364446073770523, + "epoch": 4.36578947368421, + "grad_norm": 0.0005405032425187528, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1659 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.23525787144899368, + "epoch": 4.368421052631579, + "grad_norm": 0.0007799993618391454, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 283.634765625, + "completions/mean_terminated_length": 283.634765625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.23709560930728912, + "epoch": 4.371052631578947, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0018411829369142652, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 316892255.0, + "reward": 0.8917969465255737, + "reward_std": 0.01621941104531288, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.98828125, + "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, + 
"rewards/symbolic_reward_partial_score/mean": 0.99609375, + "rewards/symbolic_reward_partial_score/std": 0.04019329324364662, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.058568000793457, + "sampling/importance_sampling_ratio/min": 2.124160801031394e-06, + "sampling/sampling_logp_difference/max": 13.0621337890625, + "sampling/sampling_logp_difference/mean": 0.12094350159168243, + "step": 1661 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.23370085656642914, + "epoch": 4.373684210526315, + "grad_norm": 0.00029920003726147115, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1662 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.23229587078094482, + "epoch": 4.376315789473685, + "grad_norm": 0.00032023570383898914, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1663 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.23625388741493225, + "epoch": 4.378947368421053, + "grad_norm": 0.0019593036267906427, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 279.814453125, + "completions/mean_terminated_length": 279.814453125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + 
"entropy": 0.2396935299038887, + "epoch": 4.381578947368421, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0010560675291344523, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 317459520.0, + "reward": 0.8500972986221313, + "reward_std": 0.01595105230808258, + "rewards/progression_diversity/mean": -4.367587098386139e-05, + "rewards/progression_diversity/std": 0.0008068631868809462, + "rewards/symbolic_reward_accuracy/mean": 0.9296875, + "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, + "rewards/symbolic_reward_partial_score/mean": 0.9742838144302368, + "rewards/symbolic_reward_partial_score/std": 0.10590074211359024, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0598008632659912, + "sampling/importance_sampling_ratio/min": 5.395947368924681e-07, + "sampling/sampling_logp_difference/max": 14.43244743347168, + "sampling/sampling_logp_difference/mean": 0.12156838923692703, + "step": 1665 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.23540012538433075, + "epoch": 4.38421052631579, + "grad_norm": 9.158550528809428e-05, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1666 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.23555586487054825, + "epoch": 4.386842105263158, + "grad_norm": 0.0010763579048216343, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1667 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2318090721964836, + "epoch": 4.389473684210526, + "grad_norm": 
0.00030530893127433956, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 283.79296875, + "completions/mean_terminated_length": 283.79296875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.2331906408071518, + "epoch": 4.3921052631578945, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0010331433732062578, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 318019158.0, + "reward": 0.8382322788238525, + "reward_std": 0.009019151329994202, + "rewards/progression_diversity/mean": -1.49488914757967e-05, + "rewards/progression_diversity/std": 0.0002401837264187634, + "rewards/symbolic_reward_accuracy/mean": 0.91015625, + "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, + "rewards/symbolic_reward_partial_score/mean": 0.9737955331802368, + "rewards/symbolic_reward_partial_score/std": 0.09921777248382568, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.061126947402954, + "sampling/importance_sampling_ratio/min": 1.443103337805951e-05, + "sampling/sampling_logp_difference/max": 11.146129608154297, + "sampling/sampling_logp_difference/mean": 0.11966268718242645, + "step": 1669 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2346176952123642, + "epoch": 4.394736842105263, + "grad_norm": 0.0002629106165841222, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.5, + 
"clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.2403409779071808, + "epoch": 4.397368421052631, + "grad_norm": 0.0007090272847563028, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1671 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.2349977195262909, + "epoch": 4.4, + "grad_norm": 0.0034831841476261616, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 279.677734375, + "completions/mean_terminated_length": 279.677734375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.23757142573595047, + "epoch": 4.402631578947369, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0032499730587005615, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 318557297.0, + "reward": 0.8682129383087158, + "reward_std": 0.027213580906391144, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.953125, + "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, + "rewards/symbolic_reward_partial_score/mean": 0.98779296875, + "rewards/symbolic_reward_partial_score/std": 0.05615198239684105, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.058640480041504, + "sampling/importance_sampling_ratio/min": 3.7312423728508293e-07, + "sampling/sampling_logp_difference/max": 
14.80135440826416, + "sampling/sampling_logp_difference/mean": 0.1246851310133934, + "step": 1673 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.24083828926086426, + "epoch": 4.405263157894737, + "grad_norm": 0.001479274476878345, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1674 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.23393068462610245, + "epoch": 4.407894736842105, + "grad_norm": 0.0007833510753698647, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1675 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.23537438362836838, + "epoch": 4.410526315789474, + "grad_norm": 0.0011368156410753727, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 279.228515625, + "completions/mean_terminated_length": 279.228515625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.23278649151325226, + "epoch": 4.413157894736842, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0007415832951664925, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 319075878.0, + "reward": 0.8525390625, + "reward_std": 0.011718750931322575, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.93359375, + 
"rewards/symbolic_reward_accuracy/std": 0.2492343932390213, + "rewards/symbolic_reward_partial_score/mean": 0.974609375, + "rewards/symbolic_reward_partial_score/std": 0.10058461129665375, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.059082269668579, + "sampling/importance_sampling_ratio/min": 0.0008237397414632142, + "sampling/sampling_logp_difference/max": 7.101655960083008, + "sampling/sampling_logp_difference/mean": 0.11824500560760498, + "step": 1677 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.23778235167264938, + "epoch": 4.41578947368421, + "grad_norm": 0.0005469997995533049, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1678 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.23452699184417725, + "epoch": 4.418421052631579, + "grad_norm": 0.0004806320648640394, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1679 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.22962932288646698, + "epoch": 4.421052631578947, + "grad_norm": 0.006431492045521736, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 280.94921875, + "completions/mean_terminated_length": 280.94921875, + "completions/min_length": 
178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.2355402261018753, + "epoch": 4.423684210526316, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0020566831808537245, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 319644044.0, + "reward": 0.8218750357627869, + "reward_std": 0.02611433155834675, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.884765625, + "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, + "rewards/symbolic_reward_partial_score/mean": 0.9700520634651184, + "rewards/symbolic_reward_partial_score/std": 0.09696658700704575, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0602164268493652, + "sampling/importance_sampling_ratio/min": 0.0002465007419232279, + "sampling/sampling_logp_difference/max": 8.308145523071289, + "sampling/sampling_logp_difference/mean": 0.1220279335975647, + "step": 1681 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.24096575379371643, + "epoch": 4.426315789473684, + "grad_norm": 0.001502837985754013, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1682 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.2392711043357849, + "epoch": 4.428947368421053, + "grad_norm": 0.0013740418944507837, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1683 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.24290843307971954, + "epoch": 4.431578947368421, + 
"grad_norm": 0.0006701555685140193, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 280.751953125, + "completions/mean_terminated_length": 280.751953125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.23609529435634613, + "epoch": 4.434210526315789, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.004810694605112076, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 320206797.0, + "reward": 0.8401367664337158, + "reward_std": 0.00967772863805294, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9140625, + "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, + "rewards/symbolic_reward_partial_score/mean": 0.9723306894302368, + "rewards/symbolic_reward_partial_score/std": 0.10186579823493958, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0596872568130493, + "sampling/importance_sampling_ratio/min": 2.2254751002037665e-06, + "sampling/sampling_logp_difference/max": 13.01554012298584, + "sampling/sampling_logp_difference/mean": 0.1213376522064209, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2319614738225937, + "epoch": 4.436842105263158, + "grad_norm": 0.0014053754275664687, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2342107743024826, + "epoch": 4.439473684210526, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2317662015557289, + "epoch": 4.442105263157894, + "grad_norm": 0.0037717849481850863, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 278.859375, + "completions/mean_terminated_length": 278.859375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.2374890148639679, + "epoch": 4.4447368421052635, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.0034204961266368628, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 320742533.0, + "reward": 0.8681638836860657, + "reward_std": 0.01761283539235592, + "rewards/progression_diversity/mean": -2.218061126768589e-05, + "rewards/progression_diversity/std": 0.0005018899100832641, + "rewards/symbolic_reward_accuracy/mean": 0.951171875, + "rewards/symbolic_reward_accuracy/std": 0.2157193273305893, + "rewards/symbolic_reward_partial_score/mean": 0.9915364384651184, + "rewards/symbolic_reward_partial_score/std": 0.037724245339632034, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0595741271972656, + "sampling/importance_sampling_ratio/min": 1.0624612514220644e-05, + "sampling/sampling_logp_difference/max": 
11.452337265014648, + "sampling/sampling_logp_difference/mean": 0.12184470891952515, + "step": 1689 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.23680029064416885, + "epoch": 4.447368421052632, + "grad_norm": 0.00370118604041636, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.23859335482120514, + "epoch": 4.45, + "grad_norm": 0.0015089736552909017, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1691 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2354060262441635, + "epoch": 4.4526315789473685, + "grad_norm": 0.0017409600550308824, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 282.396484375, + "completions/mean_terminated_length": 282.396484375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.2397817000746727, + "epoch": 4.455263157894737, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.007454941049218178, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 321276912.0, + "reward": 0.8334946632385254, + "reward_std": 0.005865141749382019, + "rewards/progression_diversity/mean": -0.0001441855274606496, + "rewards/progression_diversity/std": 0.0024001228157430887, + 
"rewards/symbolic_reward_accuracy/mean": 0.908203125, + "rewards/symbolic_reward_accuracy/std": 0.289021372795105, + "rewards/symbolic_reward_partial_score/mean": 0.9619140625, + "rewards/symbolic_reward_partial_score/std": 0.12517951428890228, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.060306429862976, + "sampling/importance_sampling_ratio/min": 7.964307769725565e-06, + "sampling/sampling_logp_difference/max": 11.740540504455566, + "sampling/sampling_logp_difference/mean": 0.12180732935667038, + "step": 1693 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.23552466928958893, + "epoch": 4.457894736842105, + "grad_norm": 0.0003189202107023448, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1694 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.2354494035243988, + "epoch": 4.4605263157894735, + "grad_norm": 0.00015152331616263837, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1695 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.23567309230566025, + "epoch": 4.463157894736842, + "grad_norm": 0.00037868903018534184, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 281.587890625, + 
"completions/mean_terminated_length": 281.587890625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.2353179082274437, + "epoch": 4.465789473684211, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0016322329174727201, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 321834077.0, + "reward": 0.8449705839157104, + "reward_std": 0.01574588194489479, + "rewards/progression_diversity/mean": -1.4576362445950508e-05, + "rewards/progression_diversity/std": 0.0002419299999019131, + "rewards/symbolic_reward_accuracy/mean": 0.92578125, + "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, + "rewards/symbolic_reward_partial_score/mean": 0.9650065302848816, + "rewards/symbolic_reward_partial_score/std": 0.1250525414943695, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0596911907196045, + "sampling/importance_sampling_ratio/min": 0.0011949631152674556, + "sampling/sampling_logp_difference/max": 6.729640007019043, + "sampling/sampling_logp_difference/mean": 0.12178048491477966, + "step": 1697 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.23920729756355286, + "epoch": 4.468421052631579, + "grad_norm": 0.0006203987868502736, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1698 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.23722657561302185, + "epoch": 4.471052631578948, + "grad_norm": 0.0048148296773433685, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1699 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, 
+ "clip_ratio/region_mean": 0.078125, + "entropy": 0.24049177765846252, + "epoch": 4.473684210526316, + "grad_norm": 0.0043615782633423805, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 280.58984375, + "completions/mean_terminated_length": 280.58984375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.2388734519481659, + "epoch": 4.476315789473684, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.002722710371017456, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 322378539.0, + "reward": 0.8768066763877869, + "reward_std": 0.00659077987074852, + "rewards/progression_diversity/mean": -1.1165892601638916e-06, + "rewards/progression_diversity/std": 2.5265529984608293e-05, + "rewards/symbolic_reward_accuracy/mean": 0.966796875, + "rewards/symbolic_reward_accuracy/std": 0.17934183776378632, + "rewards/symbolic_reward_partial_score/mean": 0.9890950918197632, + "rewards/symbolic_reward_partial_score/std": 0.06236884742975235, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0597413778305054, + "sampling/importance_sampling_ratio/min": 0.0002870491589419544, + "sampling/sampling_logp_difference/max": 8.15585708618164, + "sampling/sampling_logp_difference/mean": 0.11984318494796753, + "step": 1701 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.23890526592731476, + "epoch": 4.478947368421053, + "grad_norm": 0.0009888506028801203, + 
"learning_rate": 1e-06, + "loss": 0.0006, + "step": 1702 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.23294271528720856, + "epoch": 4.481578947368421, + "grad_norm": 0.00026438181521371007, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1703 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.23210904002189636, + "epoch": 4.484210526315789, + "grad_norm": 0.0012112339027225971, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 289.240234375, + "completions/mean_terminated_length": 289.240234375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.2412128746509552, + "epoch": 4.4868421052631575, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0025596783962100744, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 322939782.0, + "reward": 0.866014838218689, + "reward_std": 0.018494877964258194, + "rewards/progression_diversity/mean": -7.871522393543273e-05, + "rewards/progression_diversity/std": 0.0012731452006846666, + "rewards/symbolic_reward_accuracy/mean": 0.951171875, + "rewards/symbolic_reward_accuracy/std": 0.2157193273305893, + "rewards/symbolic_reward_partial_score/mean": 0.984375, + "rewards/symbolic_reward_partial_score/std": 0.08390216529369354, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0600802898406982, + "sampling/importance_sampling_ratio/min": 6.673198367934674e-05, + "sampling/sampling_logp_difference/max": 9.614826202392578, + "sampling/sampling_logp_difference/mean": 0.12298185378313065, + "step": 1705 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.23964014649391174, + "epoch": 4.489473684210527, + "grad_norm": 0.012937691994011402, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1706 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.24078621715307236, + "epoch": 4.492105263157895, + "grad_norm": 0.002141481265425682, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1707 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.23956668376922607, + "epoch": 4.494736842105263, + "grad_norm": 0.0020123261492699385, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 285.2578125, + "completions/mean_terminated_length": 285.2578125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.23070378601551056, + "epoch": 4.497368421052632, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.005668024532496929, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 323472618.0, + "reward": 0.869824230670929, + "reward_std": 
0.01963612250983715, + "rewards/progression_diversity/mean": -3.7124846130609512e-06, + "rewards/progression_diversity/std": 8.400394290219992e-05, + "rewards/symbolic_reward_accuracy/mean": 0.958984375, + "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, + "rewards/symbolic_reward_partial_score/mean": 0.9814453125, + "rewards/symbolic_reward_partial_score/std": 0.09198538959026337, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.060324788093567, + "sampling/importance_sampling_ratio/min": 0.00014617157285101712, + "sampling/sampling_logp_difference/max": 8.830729484558105, + "sampling/sampling_logp_difference/mean": 0.12136264890432358, + "step": 1709 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.2371450588107109, + "epoch": 4.5, + "grad_norm": 0.0007223788998089731, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2418120726943016, + "epoch": 4.502631578947368, + "grad_norm": 0.0038238605484366417, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1711 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.23640219867229462, + "epoch": 4.505263157894737, + "grad_norm": 0.0007986408309079707, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 286.01171875, + "completions/mean_terminated_length": 286.01171875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.23386920988559723, + "epoch": 4.507894736842105, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.01059133093804121, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 324021840.0, + "reward": 0.8559077978134155, + "reward_std": 0.03120303340256214, + "rewards/progression_diversity/mean": -4.882874418399297e-05, + "rewards/progression_diversity/std": 0.0011048683663830161, + "rewards/symbolic_reward_accuracy/mean": 0.935546875, + "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, + "rewards/symbolic_reward_partial_score/mean": 0.98193359375, + "rewards/symbolic_reward_partial_score/std": 0.08151435852050781, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0577611923217773, + "sampling/importance_sampling_ratio/min": 3.047924337806762e-06, + "sampling/sampling_logp_difference/max": 12.7010498046875, + "sampling/sampling_logp_difference/mean": 0.12323953211307526, + "step": 1713 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.2332010492682457, + "epoch": 4.510526315789473, + "grad_norm": 0.002193406457081437, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 1714 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.23637745529413223, + "epoch": 4.5131578947368425, + "grad_norm": 0.0024835586082190275, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 1715 + }, + 
{ + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.23958788067102432, + "epoch": 4.515789473684211, + "grad_norm": 0.0006318061496131122, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 283.26953125, + "completions/mean_terminated_length": 283.26953125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.23540818691253662, + "epoch": 4.518421052631579, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.005175166297703981, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 324558490.0, + "reward": 0.8326165676116943, + "reward_std": 0.005581952165812254, + "rewards/progression_diversity/mean": -6.77828211337328e-05, + "rewards/progression_diversity/std": 0.0015337502118200064, + "rewards/symbolic_reward_accuracy/mean": 0.908203125, + "rewards/symbolic_reward_accuracy/std": 0.289021372795105, + "rewards/symbolic_reward_partial_score/mean": 0.958984375, + "rewards/symbolic_reward_partial_score/std": 0.13281294703483582, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0589946508407593, + "sampling/importance_sampling_ratio/min": 5.9129040892003104e-05, + "sampling/sampling_logp_difference/max": 9.735788345336914, + "sampling/sampling_logp_difference/mean": 0.12047428637742996, + "step": 1717 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0078125, + "entropy": 0.23957037180662155, + "epoch": 4.521052631578947, + "grad_norm": 0.0002911387709900737, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1718 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.23324524611234665, + "epoch": 4.523684210526316, + "grad_norm": 0.00015874733799137175, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1719 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.23140724003314972, + "epoch": 4.526315789473684, + "grad_norm": 0.0002466792648192495, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 280.705078125, + "completions/mean_terminated_length": 280.705078125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.23024223744869232, + "epoch": 4.528947368421052, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.00565887289121747, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 325127555.0, + "reward": 0.8357421159744263, + "reward_std": 0.021845251321792603, + "rewards/progression_diversity/mean": -8.315712875628378e-06, + "rewards/progression_diversity/std": 0.0001881630887510255, + "rewards/symbolic_reward_accuracy/mean": 0.908203125, + "rewards/symbolic_reward_accuracy/std": 0.289021372795105, + "rewards/symbolic_reward_partial_score/mean": 0.9694010019302368, + "rewards/symbolic_reward_partial_score/std": 0.10498092323541641, + 
"rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0591754913330078, + "sampling/importance_sampling_ratio/min": 0.0007685017772018909, + "sampling/sampling_logp_difference/max": 7.171067714691162, + "sampling/sampling_logp_difference/mean": 0.1192249283194542, + "step": 1721 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.23211494088172913, + "epoch": 4.531578947368421, + "grad_norm": 0.0007768426439724863, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1722 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.23332732915878296, + "epoch": 4.534210526315789, + "grad_norm": 0.0008748367545194924, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1723 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.2306608259677887, + "epoch": 4.536842105263158, + "grad_norm": 0.003938390873372555, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1296.0, + "completions/mean_length": 314.302734375, + "completions/mean_terminated_length": 282.8551940917969, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.22948867827653885, + "epoch": 4.5394736842105265, + "frac_reward_zero_std": 0.84375, + 
"grad_norm": 0.006544643547385931, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 325692446.0, + "reward": 0.8094171285629272, + "reward_std": 0.03800683468580246, + "rewards/progression_diversity/mean": -0.0006720458623021841, + "rewards/progression_diversity/std": 0.014617693610489368, + "rewards/symbolic_reward_accuracy/mean": 0.8671875, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.96435546875, + "rewards/symbolic_reward_partial_score/std": 0.10590612888336182, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.055977702140808, + "sampling/importance_sampling_ratio/min": 0.0003501154133118689, + "sampling/sampling_logp_difference/max": 7.957247734069824, + "sampling/sampling_logp_difference/mean": 0.11574487388134003, + "step": 1725 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.22993683815002441, + "epoch": 4.542105263157895, + "grad_norm": 0.0012803367571905255, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1726 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.22964587807655334, + "epoch": 4.544736842105263, + "grad_norm": 0.0012463160092011094, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1727 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.22539836168289185, + "epoch": 4.5473684210526315, + "grad_norm": 0.006871597841382027, + "learning_rate": 1e-06, + "loss": 0.0304, + "step": 1728 + 
}, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 280.208984375, + "completions/mean_terminated_length": 280.208984375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.23477299511432648, + "epoch": 4.55, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0005855901399627328, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 326242889.0, + "reward": 0.8384764790534973, + "reward_std": 0.0075668939389288425, + "rewards/progression_diversity/mean": -1.1385917787265498e-05, + "rewards/progression_diversity/std": 0.0002576339466031641, + "rewards/symbolic_reward_accuracy/mean": 0.91015625, + "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, + "rewards/symbolic_reward_partial_score/mean": 0.974609375, + "rewards/symbolic_reward_partial_score/std": 0.0853847861289978, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0593574047088623, + "sampling/importance_sampling_ratio/min": 0.0006416025571525097, + "sampling/sampling_logp_difference/max": 7.351541519165039, + "sampling/sampling_logp_difference/mean": 0.11885949969291687, + "step": 1729 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2322026565670967, + "epoch": 4.552631578947368, + "grad_norm": 0.0004462753713596612, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.03125, + "entropy": 0.229575015604496, + "epoch": 4.5552631578947365, + "grad_norm": 0.0003073792904615402, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1731 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.23423191159963608, + "epoch": 4.557894736842105, + "grad_norm": 0.003369005862623453, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 277.25390625, + "completions/mean_terminated_length": 277.25390625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.2312970906496048, + "epoch": 4.560526315789474, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.00039534809184260666, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 326781131.0, + "reward": 0.85791015625, + "reward_std": 0.005859375931322575, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.935546875, + "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, + "rewards/symbolic_reward_partial_score/mean": 0.9886067509651184, + "rewards/symbolic_reward_partial_score/std": 0.04581141844391823, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0576618909835815, + "sampling/importance_sampling_ratio/min": 4.253246879670769e-05, + "sampling/sampling_logp_difference/max": 10.065242767333984, + "sampling/sampling_logp_difference/mean": 0.11823997646570206, 
+ "step": 1733 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.23137207329273224, + "epoch": 4.563157894736842, + "grad_norm": 0.0002846321149263531, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1734 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2298935204744339, + "epoch": 4.565789473684211, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.22877340018749237, + "epoch": 4.568421052631579, + "grad_norm": 0.0003122953639831394, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 283.275390625, + "completions/mean_terminated_length": 283.275390625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.22834929078817368, + "epoch": 4.571052631578947, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0007801002357155085, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 327328632.0, + "reward": 0.8971680402755737, + "reward_std": 0.01132812537252903, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + 
"rewards/symbolic_reward_partial_score/mean": 0.9983724355697632, + "rewards/symbolic_reward_partial_score/std": 0.026533395051956177, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0584046840667725, + "sampling/importance_sampling_ratio/min": 0.003217845456674695, + "sampling/sampling_logp_difference/max": 5.739043235778809, + "sampling/sampling_logp_difference/mean": 0.11800207197666168, + "step": 1737 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2275266945362091, + "epoch": 4.573684210526316, + "grad_norm": 0.0003757645608857274, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1738 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.23003485798835754, + "epoch": 4.576315789473684, + "grad_norm": 0.0006597706233151257, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1739 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.22901742905378342, + "epoch": 4.578947368421053, + "grad_norm": 0.00029348907992243767, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 314.96875, + "completions/mean_terminated_length": 283.5224914550781, + "completions/min_length": 183.0, + 
"completions/min_terminated_length": 183.0, + "entropy": 0.2328527346253395, + "epoch": 4.581578947368421, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.006896194536238909, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 327900200.0, + "reward": 0.821234941482544, + "reward_std": 0.06384304165840149, + "rewards/progression_diversity/mean": -0.0005336772883310914, + "rewards/progression_diversity/std": 0.012075738981366158, + "rewards/symbolic_reward_accuracy/mean": 0.88671875, + "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, + "rewards/symbolic_reward_partial_score/mean": 0.9646809697151184, + "rewards/symbolic_reward_partial_score/std": 0.10966981947422028, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0578877925872803, + "sampling/importance_sampling_ratio/min": 5.2641491492977366e-05, + "sampling/sampling_logp_difference/max": 9.852005958557129, + "sampling/sampling_logp_difference/mean": 0.11672507226467133, + "step": 1741 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.2317691296339035, + "epoch": 4.58421052631579, + "grad_norm": 0.0015694440808147192, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1742 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.23245307803153992, + "epoch": 4.586842105263158, + "grad_norm": 0.0021039163693785667, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1743 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 
0.22942954301834106, + "epoch": 4.589473684210526, + "grad_norm": 0.010603874921798706, + "learning_rate": 1e-06, + "loss": -0.0294, + "step": 1744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 285.0625, + "completions/mean_terminated_length": 285.0625, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.23405387252569199, + "epoch": 4.592105263157895, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.0022874982096254826, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 328485832.0, + "reward": 0.8634766340255737, + "reward_std": 0.02220808155834675, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.94921875, + "rewards/symbolic_reward_accuracy/std": 0.21976542472839355, + "rewards/symbolic_reward_partial_score/mean": 0.9798176884651184, + "rewards/symbolic_reward_partial_score/std": 0.09398413449525833, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0588750839233398, + "sampling/importance_sampling_ratio/min": 0.00381456664763391, + "sampling/sampling_logp_difference/max": 5.568928241729736, + "sampling/sampling_logp_difference/mean": 0.12032032012939453, + "step": 1745 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2334182858467102, + "epoch": 4.594736842105263, + "grad_norm": 0.0004996811621822417, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1746 + }, + { + "clip_ratio/high_max": 0.75, 
+ "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.23181022703647614, + "epoch": 4.597368421052631, + "grad_norm": 0.0012804733123630285, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1747 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.22890260815620422, + "epoch": 4.6, + "grad_norm": 0.0072340769693255424, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 287.724609375, + "completions/mean_terminated_length": 287.724609375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.23200425505638123, + "epoch": 4.602631578947369, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.001083358540199697, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 329068155.0, + "reward": 0.8512207269668579, + "reward_std": 0.016063103452324867, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9296875, + "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, + "rewards/symbolic_reward_partial_score/mean": 0.97802734375, + "rewards/symbolic_reward_partial_score/std": 0.08205181360244751, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0593247413635254, + "sampling/importance_sampling_ratio/min": 0.0002779820642899722, + "sampling/sampling_logp_difference/max": 
8.18795394897461, + "sampling/sampling_logp_difference/mean": 0.11916124075651169, + "step": 1749 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.23036928474903107, + "epoch": 4.605263157894737, + "grad_norm": 0.0007268539047800004, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.22929440438747406, + "epoch": 4.6078947368421055, + "grad_norm": 0.004047263413667679, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1751 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.22957557439804077, + "epoch": 4.610526315789474, + "grad_norm": 0.007964338175952435, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 288.81640625, + "completions/mean_terminated_length": 288.81640625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.23655959218740463, + "epoch": 4.613157894736842, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.004663840867578983, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 329621021.0, + "reward": 0.8336426019668579, + "reward_std": 0.026225607842206955, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.908203125, + 
"rewards/symbolic_reward_accuracy/std": 0.289021372795105, + "rewards/symbolic_reward_partial_score/mean": 0.96240234375, + "rewards/symbolic_reward_partial_score/std": 0.12385479360818863, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0611376762390137, + "sampling/importance_sampling_ratio/min": 1.428700124961324e-05, + "sampling/sampling_logp_difference/max": 11.156160354614258, + "sampling/sampling_logp_difference/mean": 0.12244156748056412, + "step": 1753 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2377874031662941, + "epoch": 4.61578947368421, + "grad_norm": 0.0013511740835383534, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1754 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.23934999853372574, + "epoch": 4.618421052631579, + "grad_norm": 0.0017129138577729464, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1755 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.23353976756334305, + "epoch": 4.621052631578947, + "grad_norm": 0.0062218764796853065, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 287.197265625, + "completions/mean_terminated_length": 287.197265625, + 
"completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.23446016758680344, + "epoch": 4.623684210526315, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.00431384751573205, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 330159682.0, + "reward": 0.8695312142372131, + "reward_std": 0.04663246124982834, + "rewards/progression_diversity/mean": -8.157388037943747e-06, + "rewards/progression_diversity/std": 0.00018458062550053, + "rewards/symbolic_reward_accuracy/mean": 0.955078125, + "rewards/symbolic_reward_accuracy/std": 0.20733514428138733, + "rewards/symbolic_reward_partial_score/mean": 0.98828125, + "rewards/symbolic_reward_partial_score/std": 0.06211148202419281, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0600439310073853, + "sampling/importance_sampling_ratio/min": 0.00023238833819050342, + "sampling/sampling_logp_difference/max": 8.367100715637207, + "sampling/sampling_logp_difference/mean": 0.12167903035879135, + "step": 1757 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.2346833571791649, + "epoch": 4.626315789473685, + "grad_norm": 0.0014358048792928457, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1758 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.23761341720819473, + "epoch": 4.628947368421053, + "grad_norm": 0.0057918294332921505, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1759 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + 
"entropy": 0.23876164853572845, + "epoch": 4.631578947368421, + "grad_norm": 0.0010682097636163235, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 288.267578125, + "completions/mean_terminated_length": 288.267578125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.236293226480484, + "epoch": 4.63421052631579, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.006198339629918337, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 330706251.0, + "reward": 0.8838867545127869, + "reward_std": 0.029681198298931122, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.978515625, + "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, + "rewards/symbolic_reward_partial_score/mean": 0.9892578125, + "rewards/symbolic_reward_partial_score/std": 0.07256709784269333, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.060239315032959, + "sampling/importance_sampling_ratio/min": 0.003650497877970338, + "sampling/sampling_logp_difference/max": 5.612891674041748, + "sampling/sampling_logp_difference/mean": 0.12036669254302979, + "step": 1761 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.23460889607667923, + "epoch": 4.636842105263158, + "grad_norm": 0.0026386433746665716, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1762 + }, + { + 
"clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2347528636455536, + "epoch": 4.639473684210526, + "grad_norm": 0.0008613457321189344, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1763 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.23094835877418518, + "epoch": 4.6421052631578945, + "grad_norm": 0.002696852432563901, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 285.228515625, + "completions/mean_terminated_length": 285.228515625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.23169712722301483, + "epoch": 4.644736842105263, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.0015485257608816028, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 331248416.0, + "reward": 0.8603997230529785, + "reward_std": 0.025298018008470535, + "rewards/progression_diversity/mean": -7.630509207956493e-05, + "rewards/progression_diversity/std": 0.0012197623727843165, + "rewards/symbolic_reward_accuracy/mean": 0.939453125, + "rewards/symbolic_reward_accuracy/std": 0.2387305200099945, + "rewards/symbolic_reward_partial_score/mean": 0.9890950322151184, + "rewards/symbolic_reward_partial_score/std": 0.04548434168100357, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0597522258758545, + 
"sampling/importance_sampling_ratio/min": 0.0031475864816457033, + "sampling/sampling_logp_difference/max": 5.761119365692139, + "sampling/sampling_logp_difference/mean": 0.12072933465242386, + "step": 1765 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.23586693406105042, + "epoch": 4.647368421052631, + "grad_norm": 0.0018776926444843411, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1766 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.23478037863969803, + "epoch": 4.65, + "grad_norm": 0.0038558384403586388, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1767 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.2335403859615326, + "epoch": 4.652631578947369, + "grad_norm": 0.000620207458268851, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 281.16015625, + "completions/mean_terminated_length": 281.16015625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.232573501765728, + "epoch": 4.655263157894737, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.0054251509718596935, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 331805362.0, + "reward": 0.8727539777755737, + "reward_std": 0.01990697905421257, + "rewards/progression_diversity/mean": 0.0, + 
"rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.962890625, + "rewards/symbolic_reward_accuracy/std": 0.18921469151973724, + "rewards/symbolic_reward_partial_score/mean": 0.9833984375, + "rewards/symbolic_reward_partial_score/std": 0.0878334566950798, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0596094131469727, + "sampling/importance_sampling_ratio/min": 2.395567935309373e-05, + "sampling/sampling_logp_difference/max": 10.639305114746094, + "sampling/sampling_logp_difference/mean": 0.1198565810918808, + "step": 1769 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.23547282069921494, + "epoch": 4.657894736842105, + "grad_norm": 0.0010475910967215896, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.23682619631290436, + "epoch": 4.660526315789474, + "grad_norm": 0.0034293478820472956, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1771 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.23524679243564606, + "epoch": 4.663157894736842, + "grad_norm": 0.0031075715087354183, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 536.0, + 
"completions/mean_length": 319.95703125, + "completions/mean_terminated_length": 288.5205383300781, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.24038264155387878, + "epoch": 4.66578947368421, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.0012618119362741709, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 332397852.0, + "reward": 0.8819647431373596, + "reward_std": 0.015408697538077831, + "rewards/progression_diversity/mean": -0.0017699984600767493, + "rewards/progression_diversity/std": 0.037862591445446014, + "rewards/symbolic_reward_accuracy/mean": 0.974609375, + "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, + "rewards/symbolic_reward_partial_score/mean": 0.9913736581802368, + "rewards/symbolic_reward_partial_score/std": 0.06337195634841919, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0580030679702759, + "sampling/importance_sampling_ratio/min": 0.002201242372393608, + "sampling/sampling_logp_difference/max": 6.1187334060668945, + "sampling/sampling_logp_difference/mean": 0.12074305862188339, + "step": 1773 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.23519500344991684, + "epoch": 4.668421052631579, + "grad_norm": 0.001645031850785017, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1774 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.23526707291603088, + "epoch": 4.671052631578947, + "grad_norm": 0.002648879075422883, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1775 + }, + { + "clip_ratio/high_max": 0.75, + 
"clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.23147708177566528, + "epoch": 4.673684210526316, + "grad_norm": 0.0012472477974370122, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 1776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 285.880859375, + "completions/mean_terminated_length": 285.880859375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.23570366948843002, + "epoch": 4.676315789473684, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.0007892610155977309, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 332941855.0, + "reward": 0.8750487565994263, + "reward_std": 0.01221198309212923, + "rewards/progression_diversity/mean": -1.265850369236432e-05, + "rewards/progression_diversity/std": 0.0002864292182493955, + "rewards/symbolic_reward_accuracy/mean": 0.962890625, + "rewards/symbolic_reward_accuracy/std": 0.18921469151973724, + "rewards/symbolic_reward_partial_score/mean": 0.9910481572151184, + "rewards/symbolic_reward_partial_score/std": 0.045909516513347626, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.059958577156067, + "sampling/importance_sampling_ratio/min": 3.310117608634755e-05, + "sampling/sampling_logp_difference/max": 10.31594181060791, + "sampling/sampling_logp_difference/mean": 0.12255353480577469, + "step": 1777 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 
0.23493484407663345, + "epoch": 4.678947368421053, + "grad_norm": 0.0008538050460629165, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1778 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.23577520996332169, + "epoch": 4.681578947368421, + "grad_norm": 0.004247025586664677, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1779 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.2345782071352005, + "epoch": 4.684210526315789, + "grad_norm": 0.005044107791036367, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 286.583984375, + "completions/mean_terminated_length": 286.583984375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.23718595504760742, + "epoch": 4.686842105263158, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.0074276975356042385, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 333467690.0, + "reward": 0.8631348013877869, + "reward_std": 0.034078460186719894, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.947265625, + "rewards/symbolic_reward_accuracy/std": 0.22372129559516907, + "rewards/symbolic_reward_partial_score/mean": 0.9825845956802368, + "rewards/symbolic_reward_partial_score/std": 0.07894845306873322, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0592341423034668, + "sampling/importance_sampling_ratio/min": 4.83025869471021e-05, + "sampling/sampling_logp_difference/max": 9.93802547454834, + "sampling/sampling_logp_difference/mean": 0.12274422496557236, + "step": 1781 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.23618671298027039, + "epoch": 4.689473684210526, + "grad_norm": 0.00105291698127985, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 1782 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.23698270320892334, + "epoch": 4.692105263157895, + "grad_norm": 0.005746976938098669, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1783 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.2383096069097519, + "epoch": 4.6947368421052635, + "grad_norm": 0.0012328887823969126, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 285.83984375, + "completions/mean_terminated_length": 285.83984375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.23322966694831848, + "epoch": 4.697368421052632, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0064533608965575695, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 334002328.0, 
+ "reward": 0.8294922113418579, + "reward_std": 0.03646354004740715, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.900390625, + "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, + "rewards/symbolic_reward_partial_score/mean": 0.9641926884651184, + "rewards/symbolic_reward_partial_score/std": 0.11747395247220993, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0586230754852295, + "sampling/importance_sampling_ratio/min": 0.000552015146240592, + "sampling/sampling_logp_difference/max": 7.501935005187988, + "sampling/sampling_logp_difference/mean": 0.12089669704437256, + "step": 1785 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2360163927078247, + "epoch": 4.7, + "grad_norm": 0.0023228242062032223, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1786 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.23313098400831223, + "epoch": 4.7026315789473685, + "grad_norm": 0.0005817331839352846, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1787 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.23333580791950226, + "epoch": 4.705263157894737, + "grad_norm": 0.000786386604886502, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 1788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 281.697265625, + "completions/mean_terminated_length": 281.697265625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.23130569607019424, + "epoch": 4.707894736842105, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.0013153174659237266, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 334544189.0, + "reward": 0.8330566883087158, + "reward_std": 0.030620714649558067, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.904296875, + "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, + "rewards/symbolic_reward_partial_score/mean": 0.96826171875, + "rewards/symbolic_reward_partial_score/std": 0.10676281899213791, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0613431930541992, + "sampling/importance_sampling_ratio/min": 0.002472738502547145, + "sampling/sampling_logp_difference/max": 6.002429008483887, + "sampling/sampling_logp_difference/mean": 0.12073177099227905, + "step": 1789 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.23753570765256882, + "epoch": 4.7105263157894735, + "grad_norm": 0.004040809348225594, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1790 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.24012085795402527, + "epoch": 4.713157894736842, + "grad_norm": 0.0009908534120768309, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1791 + 
}, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2364014834165573, + "epoch": 4.715789473684211, + "grad_norm": 0.000720858748536557, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 349.853515625, + "completions/mean_terminated_length": 286.9745178222656, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.2367512583732605, + "epoch": 4.718421052631579, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.004800880327820778, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 335109778.0, + "reward": 0.874405562877655, + "reward_std": 0.013915155082941055, + "rewards/progression_diversity/mean": -0.0008550827042199671, + "rewards/progression_diversity/std": 0.01934831216931343, + "rewards/symbolic_reward_accuracy/mean": 0.96484375, + "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, + "rewards/symbolic_reward_partial_score/mean": 0.986328125, + "rewards/symbolic_reward_partial_score/std": 0.08010873943567276, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0590322017669678, + "sampling/importance_sampling_ratio/min": 0.004423519130796194, + "sampling/sampling_logp_difference/max": 5.4208197593688965, + "sampling/sampling_logp_difference/mean": 0.11889566481113434, + "step": 1793 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.23232899606227875, + "epoch": 4.721052631578948, + "grad_norm": 0.0069125681184232235, + "learning_rate": 1e-06, + "loss": 0.0237, + "step": 1794 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.23827090859413147, + "epoch": 4.723684210526316, + "grad_norm": 0.003237023251131177, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.23686149716377258, + "epoch": 4.726315789473684, + "grad_norm": 0.0009356890805065632, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 313.6484375, + "completions/mean_terminated_length": 282.1996154785156, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.2309466004371643, + "epoch": 4.728947368421053, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0016089669661596417, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 335674174.0, + "reward": 0.8695645332336426, + "reward_std": 0.020846091210842133, + "rewards/progression_diversity/mean": -0.0015609528636559844, + "rewards/progression_diversity/std": 0.03335866704583168, + "rewards/symbolic_reward_accuracy/mean": 0.95703125, + "rewards/symbolic_reward_accuracy/std": 0.2029850035905838, + "rewards/symbolic_reward_partial_score/mean": 0.9851887822151184, + 
"rewards/symbolic_reward_partial_score/std": 0.07982046157121658, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.057593584060669, + "sampling/importance_sampling_ratio/min": 0.0018050133949145675, + "sampling/sampling_logp_difference/max": 6.317187309265137, + "sampling/sampling_logp_difference/mean": 0.11618360877037048, + "step": 1797 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.23156511783599854, + "epoch": 4.731578947368421, + "grad_norm": 0.0030565187335014343, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1798 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.2370528131723404, + "epoch": 4.734210526315789, + "grad_norm": 0.005625669378787279, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 1799 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.22954415529966354, + "epoch": 4.7368421052631575, + "grad_norm": 0.002420470118522644, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 284.154296875, + "completions/mean_terminated_length": 284.154296875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.23575779795646667, + 
"epoch": 4.739473684210527, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.003084713127464056, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 336225133.0, + "reward": 0.8538080453872681, + "reward_std": 0.01827925443649292, + "rewards/progression_diversity/mean": -5.8929272199748084e-05, + "rewards/progression_diversity/std": 0.001333417254500091, + "rewards/symbolic_reward_accuracy/mean": 0.931640625, + "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, + "rewards/symbolic_reward_partial_score/mean": 0.9827474355697632, + "rewards/symbolic_reward_partial_score/std": 0.07354949414730072, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0592098236083984, + "sampling/importance_sampling_ratio/min": 0.00014154307427816093, + "sampling/sampling_logp_difference/max": 8.862906455993652, + "sampling/sampling_logp_difference/mean": 0.12110301852226257, + "step": 1801 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.23429590463638306, + "epoch": 4.742105263157895, + "grad_norm": 0.0009401784045621753, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1802 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.23786381632089615, + "epoch": 4.744736842105263, + "grad_norm": 0.005505091045051813, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1803 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.23975226283073425, + "epoch": 4.747368421052632, + "grad_norm": 0.0008587277843616903, + 
"learning_rate": 1e-06, + "loss": 0.0006, + "step": 1804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 284.486328125, + "completions/mean_terminated_length": 284.486328125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.2338343784213066, + "epoch": 4.75, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.001873121364042163, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 336790118.0, + "reward": 0.867529034614563, + "reward_std": 0.027098482474684715, + "rewards/progression_diversity/mean": -3.302685945527628e-05, + "rewards/progression_diversity/std": 0.0007473125006072223, + "rewards/symbolic_reward_accuracy/mean": 0.953125, + "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, + "rewards/symbolic_reward_partial_score/mean": 0.9855143427848816, + "rewards/symbolic_reward_partial_score/std": 0.06589668989181519, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.060340404510498, + "sampling/importance_sampling_ratio/min": 3.4130298445234075e-05, + "sampling/sampling_logp_difference/max": 10.285325050354004, + "sampling/sampling_logp_difference/mean": 0.12118184566497803, + "step": 1805 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.23535771667957306, + "epoch": 4.752631578947368, + "grad_norm": 0.003209069836884737, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1806 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 
0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.23693060129880905, + "epoch": 4.755263157894737, + "grad_norm": 0.006161042023450136, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1807 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.23479554057121277, + "epoch": 4.757894736842105, + "grad_norm": 0.0019247831078246236, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 285.146484375, + "completions/mean_terminated_length": 285.146484375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.23206809908151627, + "epoch": 4.760526315789473, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.001541185425594449, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 337325905.0, + "reward": 0.8272461295127869, + "reward_std": 0.01054687611758709, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.89453125, + "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, + "rewards/symbolic_reward_partial_score/mean": 0.9684244394302368, + "rewards/symbolic_reward_partial_score/std": 0.10192830860614777, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0599029064178467, + "sampling/importance_sampling_ratio/min": 0.0008538118563592434, + "sampling/sampling_logp_difference/max": 7.065799713134766, + 
"sampling/sampling_logp_difference/mean": 0.12050032615661621, + "step": 1809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2352423518896103, + "epoch": 4.7631578947368425, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1810 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2324458658695221, + "epoch": 4.765789473684211, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.23431549966335297, + "epoch": 4.768421052631579, + "grad_norm": 0.0009463911992497742, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 281.193359375, + "completions/mean_terminated_length": 281.193359375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.23839272558689117, + "epoch": 4.771052631578947, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.0013199823442846537, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 337859668.0, + "reward": 0.8254392743110657, + "reward_std": 0.026775745674967766, + "rewards/progression_diversity/mean": -1.901627729239408e-05, + "rewards/progression_diversity/std": 0.00043028921936638653, + "rewards/symbolic_reward_accuracy/mean": 0.892578125, + 
"rewards/symbolic_reward_accuracy/std": 0.30995169281959534, + "rewards/symbolic_reward_partial_score/mean": 0.96630859375, + "rewards/symbolic_reward_partial_score/std": 0.10718066245317459, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0602418184280396, + "sampling/importance_sampling_ratio/min": 0.0003893849498126656, + "sampling/sampling_logp_difference/max": 7.850942134857178, + "sampling/sampling_logp_difference/mean": 0.12159500271081924, + "step": 1813 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2346992939710617, + "epoch": 4.773684210526316, + "grad_norm": 0.0012670259457081556, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1814 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.23430411517620087, + "epoch": 4.776315789473684, + "grad_norm": 0.001033074571751058, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1815 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.2374269664287567, + "epoch": 4.778947368421052, + "grad_norm": 0.0033665422815829515, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 1816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 289.28515625, + "completions/mean_terminated_length": 289.28515625, + "completions/min_length": 
171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.2364349216222763, + "epoch": 4.781578947368421, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0023707703221589327, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 338414278.0, + "reward": 0.847900390625, + "reward_std": 0.016666706651449203, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.921875, + "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, + "rewards/symbolic_reward_partial_score/mean": 0.9825845956802368, + "rewards/symbolic_reward_partial_score/std": 0.06219496950507164, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0596914291381836, + "sampling/importance_sampling_ratio/min": 0.003912740852683783, + "sampling/sampling_logp_difference/max": 5.543517112731934, + "sampling/sampling_logp_difference/mean": 0.12083780765533447, + "step": 1817 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.23182794451713562, + "epoch": 4.784210526315789, + "grad_norm": 0.006022021174430847, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1818 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.23250708729028702, + "epoch": 4.786842105263158, + "grad_norm": 0.0012751823524013162, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1819 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.23640374094247818, + "epoch": 4.7894736842105265, + 
"grad_norm": 0.0018326534191146493, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 288.1015625, + "completions/mean_terminated_length": 288.1015625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.2266254797577858, + "epoch": 4.792105263157895, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.00011159066343680024, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 338966074.0, + "reward": 0.899999737739563, + "reward_std": 1.0485451866770745e-06, + "rewards/progression_diversity/mean": -2.6222385713481344e-05, + "rewards/progression_diversity/std": 0.0005933448555879295, + "rewards/symbolic_reward_accuracy/mean": 1.0, + "rewards/symbolic_reward_accuracy/std": 0.0, + "rewards/symbolic_reward_partial_score/mean": 1.0, + "rewards/symbolic_reward_partial_score/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0588757991790771, + "sampling/importance_sampling_ratio/min": 1.982930598387611e-06, + "sampling/sampling_logp_difference/max": 13.130934715270996, + "sampling/sampling_logp_difference/mean": 0.11928833276033401, + "step": 1821 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.23060919344425201, + "epoch": 4.794736842105263, + "grad_norm": 7.569075387436897e-05, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1822 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 
0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.22907361388206482, + "epoch": 4.7973684210526315, + "grad_norm": 5.835884076077491e-05, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1823 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.23327982425689697, + "epoch": 4.8, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 289.376953125, + "completions/mean_terminated_length": 289.376953125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.23341242969036102, + "epoch": 4.802631578947368, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.003994180355221033, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 339509019.0, + "reward": 0.797314465045929, + "reward_std": 0.03295782953500748, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.8515625, + "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, + "rewards/symbolic_reward_partial_score/mean": 0.95458984375, + "rewards/symbolic_reward_partial_score/std": 0.12550002336502075, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0598238706588745, + "sampling/importance_sampling_ratio/min": 0.0002354941243538633, + "sampling/sampling_logp_difference/max": 8.353824615478516, + "sampling/sampling_logp_difference/mean": 0.12100344896316528, + 
"step": 1825 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.23315101116895676, + "epoch": 4.8052631578947365, + "grad_norm": 0.003105239477008581, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1826 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2331264168024063, + "epoch": 4.807894736842105, + "grad_norm": 0.004227034747600555, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1827 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.23640157282352448, + "epoch": 4.810526315789474, + "grad_norm": 0.006377407815307379, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 291.056640625, + "completions/mean_terminated_length": 291.056640625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.23511869460344315, + "epoch": 4.813157894736842, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.00462336977943778, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 340066840.0, + "reward": 0.8834961652755737, + "reward_std": 0.022172415629029274, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.974609375, + "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, + 
"rewards/symbolic_reward_partial_score/mean": 0.9957681894302368, + "rewards/symbolic_reward_partial_score/std": 0.02624371461570263, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0585315227508545, + "sampling/importance_sampling_ratio/min": 0.000580458901822567, + "sampling/sampling_logp_difference/max": 7.451691627502441, + "sampling/sampling_logp_difference/mean": 0.11921732127666473, + "step": 1829 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.23502135276794434, + "epoch": 4.815789473684211, + "grad_norm": 0.0011459586676210165, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.2287670001387596, + "epoch": 4.818421052631579, + "grad_norm": 0.0017157213296741247, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1831 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.2291087955236435, + "epoch": 4.821052631578947, + "grad_norm": 0.0019412727560847998, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 319.869140625, + "completions/mean_terminated_length": 288.4324951171875, + "completions/min_length": 175.0, + 
"completions/min_terminated_length": 175.0, + "entropy": 0.23015135526657104, + "epoch": 4.823684210526316, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.002705089980736375, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 340650101.0, + "reward": 0.8444219827651978, + "reward_std": 0.034328803420066833, + "rewards/progression_diversity/mean": -0.0011606995249167085, + "rewards/progression_diversity/std": 0.026263633742928505, + "rewards/symbolic_reward_accuracy/mean": 0.92578125, + "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, + "rewards/symbolic_reward_partial_score/mean": 0.9638671875, + "rewards/symbolic_reward_partial_score/std": 0.13422636687755585, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0576014518737793, + "sampling/importance_sampling_ratio/min": 1.53549427551216e-22, + "sampling/sampling_logp_difference/max": 50.22801971435547, + "sampling/sampling_logp_difference/mean": 0.11635151505470276, + "step": 1833 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.22804827243089676, + "epoch": 4.826315789473684, + "grad_norm": 0.004144140984863043, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 1834 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.2338591292500496, + "epoch": 4.828947368421053, + "grad_norm": 0.001100580906495452, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1835 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 
0.23625478148460388, + "epoch": 4.831578947368421, + "grad_norm": 0.011743053793907166, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 1836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 289.177734375, + "completions/mean_terminated_length": 289.177734375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.23688583821058273, + "epoch": 4.83421052631579, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0028815693221986294, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 341199984.0, + "reward": 0.8502440452575684, + "reward_std": 0.013955610804259777, + "rewards/progression_diversity/mean": -7.151393219828606e-06, + "rewards/progression_diversity/std": 0.00012617984612006694, + "rewards/symbolic_reward_accuracy/mean": 0.931640625, + "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, + "rewards/symbolic_reward_partial_score/mean": 0.9708659052848816, + "rewards/symbolic_reward_partial_score/std": 0.11147594451904297, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0599719285964966, + "sampling/importance_sampling_ratio/min": 0.0025296704843640327, + "sampling/sampling_logp_difference/max": 5.979666233062744, + "sampling/sampling_logp_difference/mean": 0.12142408639192581, + "step": 1837 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.24060922861099243, + "epoch": 4.836842105263158, + "grad_norm": 0.0021192163694649935, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 
1838 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.23550759255886078, + "epoch": 4.839473684210526, + "grad_norm": 0.00041981041431427, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1839 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.23710917681455612, + "epoch": 4.842105263157895, + "grad_norm": 0.0005093511426821351, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 286.017578125, + "completions/mean_terminated_length": 286.017578125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.23979313671588898, + "epoch": 4.844736842105263, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.0012512730900198221, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 341719033.0, + "reward": 0.893261730670929, + "reward_std": 0.01963597722351551, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.990234375, + "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, + "rewards/symbolic_reward_partial_score/mean": 0.9970703125, + "rewards/symbolic_reward_partial_score/std": 0.031142795458436012, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.059399127960205, + "sampling/importance_sampling_ratio/min": 
6.281732930801809e-05, + "sampling/sampling_logp_difference/max": 9.67527961730957, + "sampling/sampling_logp_difference/mean": 0.12148109823465347, + "step": 1841 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.2339087724685669, + "epoch": 4.847368421052631, + "grad_norm": 0.0032734405249357224, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1842 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.23710087686777115, + "epoch": 4.85, + "grad_norm": 0.004084085114300251, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1843 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.23539859056472778, + "epoch": 4.852631578947369, + "grad_norm": 0.007630764041095972, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 291.00390625, + "completions/mean_terminated_length": 291.00390625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.23795322328805923, + "epoch": 4.855263157894737, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.003123840782791376, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 342274683.0, + "reward": 0.8268060088157654, + "reward_std": 0.04585159197449684, + "rewards/progression_diversity/mean": -6.512173422379419e-05, + 
"rewards/progression_diversity/std": 0.0014735364820808172, + "rewards/symbolic_reward_accuracy/mean": 0.89453125, + "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, + "rewards/symbolic_reward_partial_score/mean": 0.9669595956802368, + "rewards/symbolic_reward_partial_score/std": 0.1075100302696228, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0592467784881592, + "sampling/importance_sampling_ratio/min": 0.0003768360475078225, + "sampling/sampling_logp_difference/max": 7.883700370788574, + "sampling/sampling_logp_difference/mean": 0.12160350382328033, + "step": 1845 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.2377663478255272, + "epoch": 4.8578947368421055, + "grad_norm": 0.010971352458000183, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1846 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.23767564445734024, + "epoch": 4.860526315789474, + "grad_norm": 0.001147806760855019, + "learning_rate": 1e-06, + "loss": -0.0016, + "step": 1847 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.2376321703195572, + "epoch": 4.863157894736842, + "grad_norm": 0.006188137922435999, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 1848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 713.0, + "completions/mean_length": 360.208984375, + "completions/mean_terminated_length": 297.37060546875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.22861691564321518, + "epoch": 4.86578947368421, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.005671604536473751, + "learning_rate": 1e-06, + "loss": 0.0146, + "num_tokens": 342854054.0, + "reward": 0.8605214953422546, + "reward_std": 0.029463060200214386, + "rewards/progression_diversity/mean": -0.002539029810577631, + "rewards/progression_diversity/std": 0.04076690971851349, + "rewards/symbolic_reward_accuracy/mean": 0.94921875, + "rewards/symbolic_reward_accuracy/std": 0.21976542472839355, + "rewards/symbolic_reward_partial_score/mean": 0.9713541865348816, + "rewards/symbolic_reward_partial_score/std": 0.13124485313892365, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0566385984420776, + "sampling/importance_sampling_ratio/min": 0.0017628967761993408, + "sampling/sampling_logp_difference/max": 6.340796947479248, + "sampling/sampling_logp_difference/mean": 0.11643043160438538, + "step": 1849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.23994342982769012, + "epoch": 4.868421052631579, + "grad_norm": 0.0025020637549459934, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1850 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2400030419230461, + "epoch": 4.871052631578947, + "grad_norm": 0.009745350107550621, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 1851 + }, + { + 
"clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.24525520205497742, + "epoch": 4.873684210526315, + "grad_norm": 0.0018413093639537692, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 292.111328125, + "completions/mean_terminated_length": 292.111328125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.2327280268073082, + "epoch": 4.876315789473685, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.0019410037202760577, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 343422623.0, + "reward": 0.8218262195587158, + "reward_std": 0.03689917176961899, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.890625, + "rewards/symbolic_reward_accuracy/std": 0.31241437792778015, + "rewards/symbolic_reward_partial_score/mean": 0.9581705331802368, + "rewards/symbolic_reward_partial_score/std": 0.13043980300426483, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.05967378616333, + "sampling/importance_sampling_ratio/min": 4.0230930608231574e-05, + "sampling/sampling_logp_difference/max": 10.120874404907227, + "sampling/sampling_logp_difference/mean": 0.12192998826503754, + "step": 1853 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 
0.22967851161956787, + "epoch": 4.878947368421053, + "grad_norm": 0.002355337841436267, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1854 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.23766426742076874, + "epoch": 4.881578947368421, + "grad_norm": 0.001805786625482142, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1855 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.23744701594114304, + "epoch": 4.88421052631579, + "grad_norm": 0.0023394892923533916, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 333.83203125, + "completions/mean_terminated_length": 302.4226989746094, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.2477203831076622, + "epoch": 4.886842105263158, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0036808003205806017, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 344016041.0, + "reward": 0.8699126243591309, + "reward_std": 0.03976935148239136, + "rewards/progression_diversity/mean": -0.0009265051339752972, + "rewards/progression_diversity/std": 0.02096441760659218, + "rewards/symbolic_reward_accuracy/mean": 0.9609375, + "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, + "rewards/symbolic_reward_partial_score/mean": 0.978515625, + "rewards/symbolic_reward_partial_score/std": 0.1107143834233284, + "rewards/tag_count_reward/mean": -0.001953125, + 
"rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0602765083312988, + "sampling/importance_sampling_ratio/min": 9.47190681443999e-09, + "sampling/sampling_logp_difference/max": 18.47493553161621, + "sampling/sampling_logp_difference/mean": 0.12124471366405487, + "step": 1857 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.23858879506587982, + "epoch": 4.889473684210526, + "grad_norm": 0.011941269040107727, + "learning_rate": 1e-06, + "loss": 0.0219, + "step": 1858 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.24422086030244827, + "epoch": 4.8921052631578945, + "grad_norm": 0.002855083905160427, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1859 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2405536025762558, + "epoch": 4.894736842105263, + "grad_norm": 0.0015807250747457147, + "learning_rate": 1e-06, + "loss": -0.0015, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 292.560546875, + "completions/mean_terminated_length": 292.560546875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.23888926953077316, + "epoch": 4.897368421052631, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.001256766146980226, + "learning_rate": 1e-06, + 
"loss": -0.0003, + "num_tokens": 344533768.0, + "reward": 0.8810058832168579, + "reward_std": 0.020134469494223595, + "rewards/progression_diversity/mean": -2.70133432422881e-06, + "rewards/progression_diversity/std": 6.112422124715522e-05, + "rewards/symbolic_reward_accuracy/mean": 0.97265625, + "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, + "rewards/symbolic_reward_partial_score/mean": 0.9913737177848816, + "rewards/symbolic_reward_partial_score/std": 0.052070844918489456, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0612703561782837, + "sampling/importance_sampling_ratio/min": 1.173806595033966e-05, + "sampling/sampling_logp_difference/max": 11.352673530578613, + "sampling/sampling_logp_difference/mean": 0.12240561842918396, + "step": 1861 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.23871064186096191, + "epoch": 4.9, + "grad_norm": 0.0026184869930148125, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1862 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.23794341087341309, + "epoch": 4.902631578947369, + "grad_norm": 0.00047851918498054147, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1863 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.23701441287994385, + "epoch": 4.905263157894737, + "grad_norm": 0.0012964674970135093, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, 
+ "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 295.33984375, + "completions/mean_terminated_length": 295.33984375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.2406173199415207, + "epoch": 4.907894736842105, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.002589939162135124, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 345085782.0, + "reward": 0.8155273795127869, + "reward_std": 0.022172417491674423, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.880859375, + "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, + "rewards/symbolic_reward_partial_score/mean": 0.9567056894302368, + "rewards/symbolic_reward_partial_score/std": 0.1282230019569397, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.06076979637146, + "sampling/importance_sampling_ratio/min": 1.2657117167691467e-06, + "sampling/sampling_logp_difference/max": 13.579875946044922, + "sampling/sampling_logp_difference/mean": 0.12502209842205048, + "step": 1865 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.24070727825164795, + "epoch": 4.910526315789474, + "grad_norm": 0.0006123611237853765, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 1866 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.23713107407093048, + "epoch": 4.913157894736842, + "grad_norm": 0.0053363642655313015, + 
"learning_rate": 1e-06, + "loss": 0.0008, + "step": 1867 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.23777218908071518, + "epoch": 4.91578947368421, + "grad_norm": 0.0004397969751153141, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 875.0, + "completions/max_terminated_length": 875.0, + "completions/mean_length": 301.1484375, + "completions/mean_terminated_length": 301.1484375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "entropy": 0.24252811819314957, + "epoch": 4.918421052631579, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.005357232876121998, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 345673154.0, + "reward": 0.8431152701377869, + "reward_std": 0.05576281249523163, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.91796875, + "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, + "rewards/symbolic_reward_partial_score/mean": 0.9744465947151184, + "rewards/symbolic_reward_partial_score/std": 0.09273875504732132, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0615004301071167, + "sampling/importance_sampling_ratio/min": 4.7737885324750096e-05, + "sampling/sampling_logp_difference/max": 9.949785232543945, + "sampling/sampling_logp_difference/mean": 0.12491629272699356, + "step": 1869 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0703125, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.2446291595697403, + "epoch": 4.921052631578947, + "grad_norm": 0.0016193005722016096, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1870 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.24189703166484833, + "epoch": 4.923684210526316, + "grad_norm": 0.003561503952369094, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1871 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.24043302237987518, + "epoch": 4.926315789473684, + "grad_norm": 0.008988670073449612, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 1872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 299.7890625, + "completions/mean_terminated_length": 299.7890625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.23490381240844727, + "epoch": 4.928947368421053, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0076865507289767265, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 346205430.0, + "reward": 0.8478027582168579, + "reward_std": 0.03393760323524475, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.92578125, + "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, + "rewards/symbolic_reward_partial_score/mean": 0.9744466543197632, + "rewards/symbolic_reward_partial_score/std": 0.095624640583992, + 
"rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.059721827507019, + "sampling/importance_sampling_ratio/min": 0.0001972316240426153, + "sampling/sampling_logp_difference/max": 8.531131744384766, + "sampling/sampling_logp_difference/mean": 0.1227768063545227, + "step": 1873 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.23886170983314514, + "epoch": 4.931578947368421, + "grad_norm": 0.002951346104964614, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1874 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.23480097204446793, + "epoch": 4.934210526315789, + "grad_norm": 0.0027505126781761646, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1875 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.23670527338981628, + "epoch": 4.936842105263158, + "grad_norm": 0.004540387075394392, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 297.521484375, + "completions/mean_terminated_length": 297.521484375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.23834621906280518, + "epoch": 4.939473684210526, + "frac_reward_zero_std": 0.90625, + "grad_norm": 
0.0006869042990729213, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 346775585.0, + "reward": 0.876171350479126, + "reward_std": 0.010939901694655418, + "rewards/progression_diversity/mean": -6.002806549076922e-05, + "rewards/progression_diversity/std": 0.0013582800747826695, + "rewards/symbolic_reward_accuracy/mean": 0.96484375, + "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, + "rewards/symbolic_reward_partial_score/mean": 0.9908853769302368, + "rewards/symbolic_reward_partial_score/std": 0.04916610196232796, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0606482028961182, + "sampling/importance_sampling_ratio/min": 0.0016635659849271178, + "sampling/sampling_logp_difference/max": 6.398791790008545, + "sampling/sampling_logp_difference/mean": 0.12316389381885529, + "step": 1877 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.23614563047885895, + "epoch": 4.942105263157895, + "grad_norm": 0.0005515136872418225, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1878 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.23562929034233093, + "epoch": 4.9447368421052635, + "grad_norm": 0.00030168332159519196, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1879 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.2394574210047722, + "epoch": 4.947368421052632, + "grad_norm": 0.0006382480496540666, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1880 + }, + { + "clip_ratio/high_max": 
0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 297.638671875, + "completions/mean_terminated_length": 297.638671875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.23923736810684204, + "epoch": 4.95, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.002834535436704755, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 347340808.0, + "reward": 0.8722656965255737, + "reward_std": 0.04588241130113602, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.958984375, + "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, + "rewards/symbolic_reward_partial_score/mean": 0.9895833730697632, + "rewards/symbolic_reward_partial_score/std": 0.059213150292634964, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0609382390975952, + "sampling/importance_sampling_ratio/min": 0.006767674349248409, + "sampling/sampling_logp_difference/max": 4.995597839355469, + "sampling/sampling_logp_difference/mean": 0.12166203558444977, + "step": 1881 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.23699773102998734, + "epoch": 4.9526315789473685, + "grad_norm": 0.0013624252751469612, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1882 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.23954419791698456, + "epoch": 
4.955263157894737, + "grad_norm": 0.0018693705787882209, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1883 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.23460357636213303, + "epoch": 4.957894736842105, + "grad_norm": 0.007947554811835289, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 300.1875, + "completions/mean_terminated_length": 300.1875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.23470059037208557, + "epoch": 4.9605263157894735, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.006391116417944431, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 347900136.0, + "reward": 0.8125, + "reward_std": 0.05671097710728645, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.87890625, + "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, + "rewards/symbolic_reward_partial_score/mean": 0.9505208730697632, + "rewards/symbolic_reward_partial_score/std": 0.13652312755584717, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0614619255065918, + "sampling/importance_sampling_ratio/min": 4.8486199375474826e-06, + "sampling/sampling_logp_difference/max": 12.23681640625, + "sampling/sampling_logp_difference/mean": 0.12415052205324173, + "step": 1885 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + 
"clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.23707986623048782, + "epoch": 4.963157894736842, + "grad_norm": 0.0033761621452867985, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 1886 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.23415328562259674, + "epoch": 4.965789473684211, + "grad_norm": 0.006086938548833132, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1887 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.24153150618076324, + "epoch": 4.968421052631579, + "grad_norm": 0.0028136041946709156, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 356.966796875, + "completions/mean_terminated_length": 294.1156921386719, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.22930483520030975, + "epoch": 4.971052631578948, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.0019960456993430853, + "learning_rate": 1e-06, + "loss": 0.0095, + "num_tokens": 348517751.0, + "reward": 0.8305528163909912, + "reward_std": 0.04029066115617752, + "rewards/progression_diversity/mean": -0.0013597611105069518, + "rewards/progression_diversity/std": 0.030767880380153656, + "rewards/symbolic_reward_accuracy/mean": 0.904296875, + "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, + "rewards/symbolic_reward_partial_score/mean": 
0.9612630009651184, + "rewards/symbolic_reward_partial_score/std": 0.1397629827260971, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0573625564575195, + "sampling/importance_sampling_ratio/min": 1.306584863414173e-06, + "sampling/sampling_logp_difference/max": 13.548093795776367, + "sampling/sampling_logp_difference/mean": 0.11435255408287048, + "step": 1889 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.22930220514535904, + "epoch": 4.973684210526316, + "grad_norm": 0.002270034048706293, + "learning_rate": 1e-06, + "loss": 0.0295, + "step": 1890 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.22606796026229858, + "epoch": 4.976315789473684, + "grad_norm": 0.0009876276599243283, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1891 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.23382815718650818, + "epoch": 4.978947368421053, + "grad_norm": 0.0011854438344016671, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 349.64453125, + "completions/mean_terminated_length": 286.76470947265625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + 
"entropy": 0.2325260266661644, + "epoch": 4.981578947368421, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.0010174971539527178, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 349086241.0, + "reward": 0.8748369216918945, + "reward_std": 0.014585242606699467, + "rewards/progression_diversity/mean": -0.001665753312408924, + "rewards/progression_diversity/std": 0.037691693753004074, + "rewards/symbolic_reward_accuracy/mean": 0.96484375, + "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, + "rewards/symbolic_reward_partial_score/mean": 0.98779296875, + "rewards/symbolic_reward_partial_score/std": 0.07317520678043365, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0565659999847412, + "sampling/importance_sampling_ratio/min": 8.482191333314404e-06, + "sampling/sampling_logp_difference/max": 11.677541732788086, + "sampling/sampling_logp_difference/mean": 0.11560506373643875, + "step": 1893 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.22664856910705566, + "epoch": 4.984210526315789, + "grad_norm": 0.007991527207195759, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 1894 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.23320794105529785, + "epoch": 4.9868421052631575, + "grad_norm": 0.00029770872788503766, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1895 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.22329071164131165, + "epoch": 4.989473684210527, + "grad_norm": 
0.0010386735666543245, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 1896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 323.32421875, + "completions/mean_terminated_length": 291.8943176269531, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.23654846101999283, + "epoch": 4.992105263157895, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.003904322162270546, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 349636743.0, + "reward": 0.8332948684692383, + "reward_std": 0.03343512490391731, + "rewards/progression_diversity/mean": -0.0005941174458712339, + "rewards/progression_diversity/std": 0.013443343341350555, + "rewards/symbolic_reward_accuracy/mean": 0.90234375, + "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, + "rewards/symbolic_reward_partial_score/mean": 0.9736328125, + "rewards/symbolic_reward_partial_score/std": 0.09050531685352325, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.058250904083252, + "sampling/importance_sampling_ratio/min": 8.018063090275973e-05, + "sampling/sampling_logp_difference/max": 9.431228637695312, + "sampling/sampling_logp_difference/mean": 0.11942292749881744, + "step": 1897 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.23303639888763428, + "epoch": 4.994736842105263, + "grad_norm": 0.0021931680385023355, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1898 + }, + { + 
"clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.23574919998645782, + "epoch": 4.997368421052632, + "grad_norm": 0.0010034493170678616, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1899 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.23388534039258957, + "epoch": 5.0, + "grad_norm": 0.00732083385810256, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 491.96875, + "eval_completions/max_terminated_length": 491.96875, + "eval_completions/mean_length": 286.003662109375, + "eval_completions/mean_terminated_length": 286.003662109375, + "eval_completions/min_length": 170.9375, + "eval_completions/min_terminated_length": 170.9375, + "eval_entropy": 0.23818373354151845, + "eval_frac_reward_zero_std": 0.875, + "eval_loss": 2.37619078689022e-05, + "eval_num_tokens": 349636743.0, + "eval_reward": 0.8882439639419317, + "eval_reward_std": 0.02008125601182087, + "eval_rewards/progression_diversity/mean": -7.02718230058963e-05, + "eval_rewards/progression_diversity/std": 0.000760064329369925, + "eval_rewards/symbolic_reward_accuracy/mean": 0.9833984375, + "eval_rewards/symbolic_reward_accuracy/std": 0.07969532138668001, + "eval_rewards/symbolic_reward_partial_score/mean": 0.9947509746998549, + "eval_rewards/symbolic_reward_partial_score/std": 0.03326635103439912, + "eval_rewards/tag_count_reward/mean": -0.002197265625, + "eval_rewards/tag_count_reward/std": 0.016002451302483678, + "eval_runtime": 94.9953, + 
"eval_samples_per_second": 2.632, + "eval_sampling/importance_sampling_ratio/max": 2.0, + "eval_sampling/importance_sampling_ratio/mean": 1.0600753016769886, + "eval_sampling/importance_sampling_ratio/min": 0.004165468698646399, + "eval_sampling/sampling_logp_difference/max": 13.449735283851624, + "eval_sampling/sampling_logp_difference/mean": 0.12632932304404676, + "eval_steps_per_second": 0.021, + "step": 1900 + }, + { + "epoch": 5.0, + "step": 1900, + "total_flos": 0.0, + "train_loss": 0.013345611529686698, + "train_runtime": 16054.0136, + "train_samples_per_second": 0.951, + "train_steps_per_second": 0.118 + } + ], + "logging_steps": 1, + "max_steps": 1900, + "num_input_tokens_seen": 349636743, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..962c096 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e5c0eb24ce25b866eb756ba2cfdbb5fcea4aef29550622de9bac93352cb0e93 +size 11345