From 8f951141f0ce1cfa8f4ffcd322b6cacad21dfb3c Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Sat, 13 Jun 2026 10:54:17 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: reaperdoesntknow/Dualmind-Qwen-1.7B-Thinking Source: Original Platform --- .gitattributes | 36 + README.md | 233 + chat_template.jinja | 89 + config.json | 63 + ...ut.tfevents.1774855351.0e755ff15ec0.1023.2 | 3 + ...t.tfevents.1774858526.0e755ff15ec0.15561.0 | 3 + generation_config.json | 12 + model.safetensors | 3 + tokenizer.json | 3 + tokenizer_config.json | 29 + trainer_state .json | 5198 +++++++++++++++++ 11 files changed, 5672 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 events.out.tfevents.1774855351.0e755ff15ec0.1023.2 create mode 100644 events.out.tfevents.1774858526.0e755ff15ec0.15561.0 create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 trainer_state .json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text 
+*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..624dacb --- /dev/null +++ b/README.md @@ -0,0 +1,233 @@ +--- +license: apache-2.0 +library_name: transformers +pipeline_tag: text-generation +tags: +- qwen3 +- sft +- trl +- dualmind +- knowledge-distillation +- thinking +- opus +- self-critique +- convergent-intelligence +- convergentintel +- edge +- distillation +base_model: +- reaperdoesntknow/DualMinded-Qwen3-1.7B +datasets: +- nohurry/Opus-4.6-Reasoning-3000x-filtered +- zai-org/LongWriter-6k +language: +- en +--- + +# Dualmind-Qwen-1.7B-Thinking + +**Claude Opus 4.6 Reasoning Traces → 1.7B via DualMind SFT** + +*Convergent Intelligence LLC: Research Division* + +--- + +## What This Is + +A 1.7B model trained on **2.5M+ tokens of Claude Opus 4.6 reasoning traces** using the DualMind SFT methodology. 
The training data comes from [Opus-4.6-Reasoning-3000x-filtered](https://huggingface.co/datasets/nohurry/Opus-4.6-Reasoning-3000x-filtered) — a curated dataset of extended reasoning chains from Anthropic's most capable model, with refusals removed. + +This is the **Opus variant** of the DualMind family. Where the base [DualMind](https://huggingface.co/reaperdoesntknow/DualMind) model was trained on LogicInference data, this model absorbs the reasoning patterns of Claude Opus 4.6 — longer chains, more nuanced self-correction, and richer deliberative structure. The Opus teacher produces qualitatively different reasoning than synthetic logic datasets: it backtracks, hedges, reconsiders, and synthesizes in ways that reflect genuine uncertainty navigation rather than pattern completion. + +The base model is [Disctil-Qwen3-1.7B](https://huggingface.co/reaperdoesntknow/Disctil-Qwen3-1.7B) — already DISC-refined and sitting in the middle of the DistilQwen distillation chain — giving it a strong structural foundation before the Opus reasoning signal is applied. 
+ +## Architecture + +| Parameter | Value | +|-----------|-------| +| Architecture | Qwen3ForCausalLM | +| Parameters | ~2.03B (1.7B effective) | +| Hidden Size | 2048 | +| Layers | 28 | +| Attention Heads | 16 (Q) / 8 (KV) — GQA | +| Intermediate | 6144 | +| Head Dimension | 128 | +| Context Length | 40,960 tokens (max position) | +| Vocabulary | 151,936 | +| Precision | BF16 | +| Activation | SiLU | + +## Training + +| Parameter | Value | +|-----------|-------| +| Base Model | [Disctil-Qwen3-1.7B](https://huggingface.co/reaperdoesntknow/Disctil-Qwen3-1.7B) | +| Dataset | [Opus-4.6-Reasoning-3000x-filtered](https://huggingface.co/datasets/nohurry/Opus-4.6-Reasoning-3000x-filtered) | +| Additional Tokens | ~2.5M | +| Max Sequence Length | 4,096 | +| Total Steps | 512 | +| Epochs | ~7.4 | +| Method | SFT (TRL SFTTrainer) | +| Precision | BF16 | +| Hardware | NVIDIA H100 | + +### Training Dynamics + +| Metric | Start | End | +|--------|-------|-----| +| Training Loss | 1.744 | 1.455 | +| Eval Loss | — | 1.406 | +| Token Accuracy | 61.0% | 67.8% | + +The loss curve shows clean convergence across 7.4 epochs with no signs of overfitting — eval loss (1.406) remains below final training loss (1.455). The 6.8 percentage point gain in token accuracy reflects genuine absorption of the Opus reasoning structure, not memorization. + +### Why Opus Traces + +The Opus-4.6-Reasoning dataset captures something that synthetic datasets don't: the way a frontier model navigates genuine uncertainty. Opus doesn't just solve problems — it reasons about its own confidence, backtracks when a line of thought weakens, and synthesizes across multiple attempted approaches. When you distill from these traces, the student doesn't just learn to produce correct answers. It learns the **shape of deliberation**. + +This is the DualMind thesis in practice: the cognitive loop (explore → examine → respond) isn't an architectural trick. It's a training signal. 
When the teacher naturally exhibits multi-phase reasoning, the student absorbs that structure through standard SFT. + +## Usage + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer + +model = AutoModelForCausalLM.from_pretrained( + "reaperdoesntknow/Dualmind-Qwen-1.7B-Thinking", + torch_dtype="auto", + device_map="auto" +) +tokenizer = AutoTokenizer.from_pretrained( + "reaperdoesntknow/Dualmind-Qwen-1.7B-Thinking" +) + +messages = [ + {"role": "user", "content": "What happens to information that falls into a black hole? Walk me through the paradox."} +] + +text = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True +) +inputs = tokenizer(text, return_tensors="pt").to(model.device) + +output = model.generate( + **inputs, + max_new_tokens=2048, + do_sample=True, + top_p=0.9, + temperature=0.7, + repetition_penalty=1.15 +) + +print(tokenizer.decode(output[0], skip_special_tokens=True)) +``` + +### Generation Tips + +- **Temperature 0.6–0.8** — the Opus reasoning traces have natural variance in them. Don't flatten it with low temperature. +- **Repetition penalty 1.1–1.2** — prevents looping during extended reasoning chains. +- **Max tokens 1024–2048** — trained at 4096 max seq, so it can go long. The Opus signal rewards longer generation windows. +- The model may produce multi-phase reasoning naturally (exploring, then reconsidering, then concluding). This is the intended behavior — the DualMind cognitive loop emerging from the training signal. 
+ +## Model Lineage + +``` +Qwen3-1.7B (base) + → DiStil-Qwen3-1.7B-uncensored (uncensored SFT) + → Disctil-Qwen3-1.7B (DISC refinement) + → Dualmind-Qwen-1.7B-Thinking ← you are here + ↑ + Opus 4.6 reasoning traces (2.5M tokens, DualMind SFT) +``` + +### DualMind Family Comparison + +| Model | Training Signal | Character | +|-------|----------------|-----------| +| [DualMind](https://huggingface.co/reaperdoesntknow/DualMind) | LogicInference | Structured logical deduction | +| **Dualmind-Qwen-1.7B-Thinking** | **Opus 4.6 Reasoning** | **Extended deliberation, self-correction** | +| [TopologicalQwen](https://huggingface.co/reaperdoesntknow/TopologicalQwen) | 30B-Thinking (TKD) | Topology-aware physics CoT | + +Same methodology, different teachers, different capabilities. The LogicInference variant is more mechanical. The Opus variant is more deliberative. TopologicalQwen is the full TKD pipeline with BV decomposition. They're complementary — different facets of the same cognitive architecture. + +## DualMind Collection + +| Model | Description | +|-------|-------------| +| [DualMind](https://huggingface.co/reaperdoesntknow/DualMind) | LogicInference-trained. Explore→Examine→Response cognitive loop. | +| [DualMind_Methodology](https://huggingface.co/reaperdoesntknow/DualMind_Methodolgy) | Paper: Three Teachers to Dual Cognition (DOI: 10.57967/hf/8184) | +| **[Dualmind-Qwen-1.7B-Thinking](https://huggingface.co/reaperdoesntknow/Dualmind-Qwen-1.7B-Thinking)** | **← this model. Opus 4.6 reasoning variant.** | +| [DualMind-GGUF](https://huggingface.co/reaperdoesntknow/DualMind-GGUF) | LogicInference variant quantized for edge deployment. | + +Full collection: [DualMind on HuggingFace](https://huggingface.co/collections/reaperdoesntknow/dualmind-69c93f888c6e79ecc69cf41e) + +## Papers + +- **[Structure Over Scale: Proof-Weighted Knowledge Distillation](https://doi.org/10.57967/hf/8165)** — DOI: 10.57967/hf/8165. The DistilQwen methodology paper. 
+- **[Three Teachers to Dual Cognition](https://doi.org/10.57967/hf/8184)** — DOI: 10.57967/hf/8184. The DualMind extension: ghost imprinting and multi-teacher convergence. + +## License + +Apache 2.0 + + +## Mathematical Foundations: Discrepancy Calculus (DISC) + +This model's training pipeline is grounded in Discrepancy Calculus — a measure-theoretic framework that treats singularities as primary structure rather than pathology. Full theory: *"On the Formal Analysis of Discrepancy Calculus"* (Colca, 2026; Convergent Intelligence LLC: Research Division). + +**The Core Operator:** + +$$Df(x) = \lim_{\varepsilon \downarrow 0} \frac{1}{\varepsilon} \int_x^{x+\varepsilon} \frac{|f(t) - f(x)|}{|t - x|}\, dt$$ + +For smooth $f$: $Df(x) = |f'(x)|$. For rough $f$: $D$ localizes irregularity to null sets while preserving integral structure. + +**The Mesh Fundamental Identity** — every BV function $f$ on an interval $I = [a, b]$, with jump set $J_f \subset I$, decomposes as: + +$$f(b) - f(a) = \underbrace{\int_a^b f'(x)\,dx}_{\text{smooth (AC)}} + \underbrace{\sum_{x \in J_f} \Delta f(x)}_{\text{jumps}} + \underbrace{D^c f(I)}_{\text{Cantor drift}}$$ + +Standard knowledge distillation captures only the first (smooth) term. Topological Knowledge Distillation (TKD) preserves all three by treating the teacher's output distribution as a BV function and computing discrepancy energy, jump sets, and gap energy density before training begins. 
+ +## Citation + +```bibtex +@misc{colca2026dualmind, + title={Three Teachers to Dual Cognition: From Knowledge Distillation to Emergent Reasoning}, + author={Colca, Roy}, + year={2026}, + doi={10.57967/hf/8184}, + publisher={Convergent Intelligence LLC: Research Division} +} +``` + +--- + +*Convergent Intelligence LLC: Research Division — 49 models, 22,598+ downloads across the portfolio.* +*[Full portfolio](https://huggingface.co/reaperdoesntknow) | [DualMind Collection](https://huggingface.co/collections/reaperdoesntknow/dualmind-69c93f888c6e79ecc69cf41e) | [DistilQwen Collection](https://huggingface.co/collections/reaperdoesntknow/distilqwen-69bf40ec669117e3f069ef1c)* + +--- + +## Convergent Intelligence Portfolio + +*Part of the [DualMind Series](https://huggingface.co/collections/reaperdoesntknow/dualmind-69c93f888c6e79ecc69cf41e) by [Convergent Intelligence LLC: Research Division](https://huggingface.co/reaperdoesntknow)* + +### DualMind Family + +| Model | Format | Description | +|-------|--------|-------------| +| [DualMind](https://huggingface.co/reaperdoesntknow/DualMind) | BF16 | LogicInference-trained. Explore→Examine→Response loop. | +| [DualMinded-Qwen3-1.7B](https://huggingface.co/reaperdoesntknow/DualMinded-Qwen3-1.7B) | BF16 | Opus 4.6 reasoning traces. Higher quality splits. | +| [Dualmind-Qwen-1.7B-Thinking](https://huggingface.co/reaperdoesntknow/Dualmind-Qwen-1.7B-Thinking) | BF16 | Thinking-teacher variant with extended deliberation. | +| [DualMind-GGUF](https://huggingface.co/reaperdoesntknow/DualMind-GGUF) | GGUF | Quantized LogicInference variant. CPU/6GB GPU. | +| [DualMinded-Qwen3-1.7B-GGUF](https://huggingface.co/reaperdoesntknow/DualMinded-Qwen3-1.7B-GGUF) | GGUF | Quantized Opus variant. Ollama ready. 
| + +### Papers + +| Paper | DOI | +|-------|-----| +| [Structure Over Scale](https://huggingface.co/reaperdoesntknow/Structure-Over-Scale) | 10.57967/hf/8165 | +| [Three Teachers to Dual Cognition](https://huggingface.co/reaperdoesntknow/DualMind_Methodolgy) | 10.57967/hf/8184 | +| [Discrepancy Calculus](https://huggingface.co/reaperdoesntknow/Discrepancy_Calculus) | 10.57967/hf/8194 | + +--- + +*Last updated: 2026-03-31 by Convergent Intelligence LLC: Research Division* + diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..01be9b3 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,89 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == 
"system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '</think>' in content %} + {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %} + {%- set content = content.split('</think>')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '<tool_call>\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n</tool_call>' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n<tool_response>\n' }} + {{- content }} + {{- '\n</tool_response>' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '<think>\n\n</think>\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff 
--git a/config.json b/config.json new file mode 100644 index 0000000..1782cca --- /dev/null +++ b/config.json @@ -0,0 +1,63 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 6144, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.0.0", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/events.out.tfevents.1774855351.0e755ff15ec0.1023.2 b/events.out.tfevents.1774855351.0e755ff15ec0.1023.2 new file mode 100644 index 0000000..29c87b6 --- /dev/null +++ b/events.out.tfevents.1774855351.0e755ff15ec0.1023.2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:754fa573c8076f901c055a875d3ec38572c33c6c1bb1341cae32f40b32310436 +size 202356 diff --git a/events.out.tfevents.1774858526.0e755ff15ec0.15561.0 b/events.out.tfevents.1774858526.0e755ff15ec0.15561.0 new file mode 
100644 index 0000000..e8319ca --- /dev/null +++ b/events.out.tfevents.1774858526.0e755ff15ec0.15561.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0230efb71b3943a2b6a6f1ca78e937d3ccf451e65eb4e8e079dc482ecc730d7 +size 54371 diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..c33fb76 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "5.0.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..a47c948 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef9c37a37d926124140a8a543c3aa52b9e2da03a3d00e17e50425fa20a20c4ed +size 4063515640 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..c7afbed --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..9fd0fb4 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": true, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/trainer_state .json 
b/trainer_state .json new file mode 100644 index 0000000..14205e7 --- /dev/null +++ b/trainer_state .json @@ -0,0 +1,5198 @@ +{ + "best_global_step": 512, + "best_metric": 1.4060174226760864, + "best_model_checkpoint": "Dually/checkpoint-512", + "epoch": 7.426078971533517, + "eval_steps": 128, + "global_step": 512, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2646127492189407, + "epoch": 0.014692378328741965, + "grad_norm": 9.75, + "learning_rate": 0.0, + "loss": 1.744248867034912, + "mean_token_accuracy": 0.6102629192173481, + "num_tokens": 7917.0, + "step": 1 + }, + { + "entropy": 1.378300040960312, + "epoch": 0.02938475665748393, + "grad_norm": 10.9375, + "learning_rate": 1.5625e-07, + "loss": 1.9481500387191772, + "mean_token_accuracy": 0.5874052550643682, + "num_tokens": 14736.0, + "step": 2 + }, + { + "entropy": 1.3521380089223385, + "epoch": 0.0440771349862259, + "grad_norm": 11.625, + "learning_rate": 3.125e-07, + "loss": 1.9126718044281006, + "mean_token_accuracy": 0.6104181408882141, + "num_tokens": 20834.0, + "step": 3 + }, + { + "entropy": 1.0458027385175228, + "epoch": 0.05876951331496786, + "grad_norm": 13.75, + "learning_rate": 4.6875000000000006e-07, + "loss": 1.5388509035110474, + "mean_token_accuracy": 0.6652283705770969, + "num_tokens": 26971.0, + "step": 4 + }, + { + "entropy": 1.4032921269536018, + "epoch": 0.07346189164370982, + "grad_norm": 11.6875, + "learning_rate": 6.25e-07, + "loss": 2.099245071411133, + "mean_token_accuracy": 0.5665754359215498, + "num_tokens": 34700.0, + "step": 5 + }, + { + "entropy": 1.3070856295526028, + "epoch": 0.0881542699724518, + "grad_norm": 12.625, + "learning_rate": 7.8125e-07, + "loss": 1.9474432468414307, + "mean_token_accuracy": 0.5986234582960606, + "num_tokens": 41284.0, + "step": 6 + }, + { + "entropy": 1.246600879356265, + "epoch": 0.10284664830119375, + "grad_norm": 11.5625, + "learning_rate": 
9.375000000000001e-07, + "loss": 1.7984871864318848, + "mean_token_accuracy": 0.6245174538344145, + "num_tokens": 47262.0, + "step": 7 + }, + { + "entropy": 1.1514122374355793, + "epoch": 0.11753902662993572, + "grad_norm": 10.8125, + "learning_rate": 1.0937500000000001e-06, + "loss": 1.658499002456665, + "mean_token_accuracy": 0.6269348040223122, + "num_tokens": 54501.0, + "step": 8 + }, + { + "entropy": 1.2789062187075615, + "epoch": 0.1322314049586777, + "grad_norm": 12.9375, + "learning_rate": 1.25e-06, + "loss": 1.8666932582855225, + "mean_token_accuracy": 0.6204132493585348, + "num_tokens": 59298.0, + "step": 9 + }, + { + "entropy": 1.4759281110018492, + "epoch": 0.14692378328741965, + "grad_norm": 11.25, + "learning_rate": 1.40625e-06, + "loss": 2.1906216144561768, + "mean_token_accuracy": 0.5752640906721354, + "num_tokens": 65096.0, + "step": 10 + }, + { + "entropy": 1.2384399138391018, + "epoch": 0.16161616161616163, + "grad_norm": 11.625, + "learning_rate": 1.5625e-06, + "loss": 1.7301772832870483, + "mean_token_accuracy": 0.6402938701212406, + "num_tokens": 70811.0, + "step": 11 + }, + { + "entropy": 1.2877945825457573, + "epoch": 0.1763085399449036, + "grad_norm": 9.9375, + "learning_rate": 1.71875e-06, + "loss": 1.7703232765197754, + "mean_token_accuracy": 0.6134329959750175, + "num_tokens": 78037.0, + "step": 12 + }, + { + "entropy": 1.1743841245770454, + "epoch": 0.19100091827364554, + "grad_norm": 12.6875, + "learning_rate": 1.8750000000000003e-06, + "loss": 1.7103441953659058, + "mean_token_accuracy": 0.6294799540191889, + "num_tokens": 84881.0, + "step": 13 + }, + { + "entropy": 1.1487970873713493, + "epoch": 0.2056932966023875, + "grad_norm": 11.625, + "learning_rate": 2.0312500000000002e-06, + "loss": 1.724797010421753, + "mean_token_accuracy": 0.637124864384532, + "num_tokens": 91378.0, + "step": 14 + }, + { + "entropy": 1.2112453859299421, + "epoch": 0.22038567493112948, + "grad_norm": 10.6875, + "learning_rate": 2.1875000000000002e-06, + 
"loss": 1.6525938510894775, + "mean_token_accuracy": 0.6341305579990149, + "num_tokens": 97461.0, + "step": 15 + }, + { + "entropy": 1.2508288510143757, + "epoch": 0.23507805325987144, + "grad_norm": 10.0625, + "learning_rate": 2.3437500000000002e-06, + "loss": 1.7525631189346313, + "mean_token_accuracy": 0.6133801508694887, + "num_tokens": 103802.0, + "step": 16 + }, + { + "entropy": 1.1752750612795353, + "epoch": 0.2497704315886134, + "grad_norm": 10.8125, + "learning_rate": 2.5e-06, + "loss": 1.7630412578582764, + "mean_token_accuracy": 0.6321723479777575, + "num_tokens": 111377.0, + "step": 17 + }, + { + "entropy": 1.3553732447326183, + "epoch": 0.2644628099173554, + "grad_norm": 11.4375, + "learning_rate": 2.65625e-06, + "loss": 1.9398893117904663, + "mean_token_accuracy": 0.5853390581905842, + "num_tokens": 118146.0, + "step": 18 + }, + { + "entropy": 1.1929924674332142, + "epoch": 0.27915518824609736, + "grad_norm": 8.6875, + "learning_rate": 2.8125e-06, + "loss": 1.6914823055267334, + "mean_token_accuracy": 0.6384953130036592, + "num_tokens": 126437.0, + "step": 19 + }, + { + "entropy": 1.2564715202897787, + "epoch": 0.2938475665748393, + "grad_norm": 9.125, + "learning_rate": 2.96875e-06, + "loss": 1.7073726654052734, + "mean_token_accuracy": 0.6299608834087849, + "num_tokens": 133562.0, + "step": 20 + }, + { + "entropy": 1.1569741740822792, + "epoch": 0.3085399449035813, + "grad_norm": 9.0, + "learning_rate": 3.125e-06, + "loss": 1.7045152187347412, + "mean_token_accuracy": 0.6431939471513033, + "num_tokens": 141389.0, + "step": 21 + }, + { + "entropy": 1.2686172276735306, + "epoch": 0.32323232323232326, + "grad_norm": 11.4375, + "learning_rate": 3.28125e-06, + "loss": 1.9201271533966064, + "mean_token_accuracy": 0.6149793621152639, + "num_tokens": 149659.0, + "step": 22 + }, + { + "entropy": 1.3068278022110462, + "epoch": 0.3379247015610652, + "grad_norm": 9.75, + "learning_rate": 3.4375e-06, + "loss": 1.757298231124878, + "mean_token_accuracy": 
0.6182622388005257, + "num_tokens": 156962.0, + "step": 23 + }, + { + "entropy": 1.1734640449285507, + "epoch": 0.3526170798898072, + "grad_norm": 9.625, + "learning_rate": 3.59375e-06, + "loss": 1.7031861543655396, + "mean_token_accuracy": 0.6426031272858381, + "num_tokens": 164117.0, + "step": 24 + }, + { + "entropy": 1.161806859076023, + "epoch": 0.3673094582185491, + "grad_norm": 8.25, + "learning_rate": 3.7500000000000005e-06, + "loss": 1.7877576351165771, + "mean_token_accuracy": 0.6392325963824987, + "num_tokens": 171972.0, + "step": 25 + }, + { + "entropy": 1.1784982681274414, + "epoch": 0.3820018365472911, + "grad_norm": 8.9375, + "learning_rate": 3.90625e-06, + "loss": 1.5683488845825195, + "mean_token_accuracy": 0.6583592146635056, + "num_tokens": 178684.0, + "step": 26 + }, + { + "entropy": 1.2632846124470234, + "epoch": 0.39669421487603307, + "grad_norm": 8.8125, + "learning_rate": 4.0625000000000005e-06, + "loss": 1.7520173788070679, + "mean_token_accuracy": 0.633663909509778, + "num_tokens": 186596.0, + "step": 27 + }, + { + "entropy": 1.3619473539292812, + "epoch": 0.411386593204775, + "grad_norm": 8.8125, + "learning_rate": 4.21875e-06, + "loss": 1.6584488153457642, + "mean_token_accuracy": 0.6089041493833065, + "num_tokens": 193256.0, + "step": 28 + }, + { + "entropy": 1.5768938176333904, + "epoch": 0.426078971533517, + "grad_norm": 10.0, + "learning_rate": 4.3750000000000005e-06, + "loss": 2.0062549114227295, + "mean_token_accuracy": 0.5824067778885365, + "num_tokens": 199459.0, + "step": 29 + }, + { + "entropy": 1.1113787479698658, + "epoch": 0.44077134986225897, + "grad_norm": 7.46875, + "learning_rate": 4.53125e-06, + "loss": 1.6244635581970215, + "mean_token_accuracy": 0.6660626344382763, + "num_tokens": 208347.0, + "step": 30 + }, + { + "entropy": 1.3061846159398556, + "epoch": 0.4554637281910009, + "grad_norm": 8.75, + "learning_rate": 4.6875000000000004e-06, + "loss": 1.8602163791656494, + "mean_token_accuracy": 0.6074783802032471, + 
"num_tokens": 215643.0, + "step": 31 + }, + { + "entropy": 1.3427062667906284, + "epoch": 0.4701561065197429, + "grad_norm": 9.25, + "learning_rate": 4.84375e-06, + "loss": 1.8641481399536133, + "mean_token_accuracy": 0.6179640628397465, + "num_tokens": 221856.0, + "step": 32 + }, + { + "entropy": 1.5843740738928318, + "epoch": 0.48484848484848486, + "grad_norm": 9.3125, + "learning_rate": 5e-06, + "loss": 1.9042216539382935, + "mean_token_accuracy": 0.5840670578181744, + "num_tokens": 228502.0, + "step": 33 + }, + { + "entropy": 1.2740740850567818, + "epoch": 0.4995408631772268, + "grad_norm": 8.6875, + "learning_rate": 4.999946454160323e-06, + "loss": 1.657892107963562, + "mean_token_accuracy": 0.650751706212759, + "num_tokens": 235856.0, + "step": 34 + }, + { + "entropy": 1.2723774798214436, + "epoch": 0.5142332415059688, + "grad_norm": 6.6875, + "learning_rate": 4.999785818935018e-06, + "loss": 1.7274625301361084, + "mean_token_accuracy": 0.6495076902210712, + "num_tokens": 243386.0, + "step": 35 + }, + { + "entropy": 1.2608134560286999, + "epoch": 0.5289256198347108, + "grad_norm": 6.96875, + "learning_rate": 4.999518101205162e-06, + "loss": 1.5035489797592163, + "mean_token_accuracy": 0.6534168235957623, + "num_tokens": 250842.0, + "step": 36 + }, + { + "entropy": 1.3372750878334045, + "epoch": 0.5436179981634527, + "grad_norm": 7.1875, + "learning_rate": 4.999143312438893e-06, + "loss": 1.5423352718353271, + "mean_token_accuracy": 0.635912710800767, + "num_tokens": 257542.0, + "step": 37 + }, + { + "entropy": 1.3917848393321037, + "epoch": 0.5583103764921947, + "grad_norm": 6.53125, + "learning_rate": 4.998661468690914e-06, + "loss": 1.7300214767456055, + "mean_token_accuracy": 0.6232790667563677, + "num_tokens": 265642.0, + "step": 38 + }, + { + "entropy": 1.391078781336546, + "epoch": 0.5730027548209367, + "grad_norm": 7.6875, + "learning_rate": 4.998072590601808e-06, + "loss": 1.776171326637268, + "mean_token_accuracy": 0.6446701735258102, + "num_tokens": 
271325.0, + "step": 39 + }, + { + "entropy": 1.419153816998005, + "epoch": 0.5876951331496786, + "grad_norm": 9.375, + "learning_rate": 4.997376703397151e-06, + "loss": 1.8279757499694824, + "mean_token_accuracy": 0.622003948315978, + "num_tokens": 276627.0, + "step": 40 + }, + { + "entropy": 1.32320836186409, + "epoch": 0.6023875114784206, + "grad_norm": 6.53125, + "learning_rate": 4.9965738368864345e-06, + "loss": 1.5522156953811646, + "mean_token_accuracy": 0.647944763302803, + "num_tokens": 283056.0, + "step": 41 + }, + { + "entropy": 1.4761433601379395, + "epoch": 0.6170798898071626, + "grad_norm": 6.09375, + "learning_rate": 4.99566402546179e-06, + "loss": 1.8013895750045776, + "mean_token_accuracy": 0.625993836671114, + "num_tokens": 290032.0, + "step": 42 + }, + { + "entropy": 1.309795543551445, + "epoch": 0.6317722681359045, + "grad_norm": 5.84375, + "learning_rate": 4.994647308096509e-06, + "loss": 1.5540151596069336, + "mean_token_accuracy": 0.6547523811459541, + "num_tokens": 297226.0, + "step": 43 + }, + { + "entropy": 1.356013897806406, + "epoch": 0.6464646464646465, + "grad_norm": 7.375, + "learning_rate": 4.99352372834338e-06, + "loss": 1.6234813928604126, + "mean_token_accuracy": 0.643750274553895, + "num_tokens": 302919.0, + "step": 44 + }, + { + "entropy": 1.3376823589205742, + "epoch": 0.6611570247933884, + "grad_norm": 5.71875, + "learning_rate": 4.992293334332821e-06, + "loss": 1.8485695123672485, + "mean_token_accuracy": 0.6466748863458633, + "num_tokens": 312466.0, + "step": 45 + }, + { + "entropy": 1.4561820216476917, + "epoch": 0.6758494031221304, + "grad_norm": 5.71875, + "learning_rate": 4.990956178770814e-06, + "loss": 1.6671806573867798, + "mean_token_accuracy": 0.6320093534886837, + "num_tokens": 319919.0, + "step": 46 + }, + { + "entropy": 1.3384652398526669, + "epoch": 0.6905417814508723, + "grad_norm": 5.25, + "learning_rate": 4.989512318936654e-06, + "loss": 1.5594499111175537, + "mean_token_accuracy": 0.6558941937983036, + 
"num_tokens": 327790.0, + "step": 47 + }, + { + "entropy": 1.6121552139520645, + "epoch": 0.7052341597796143, + "grad_norm": 6.4375, + "learning_rate": 4.987961816680493e-06, + "loss": 1.823395848274231, + "mean_token_accuracy": 0.5975735988467932, + "num_tokens": 334440.0, + "step": 48 + }, + { + "entropy": 1.473178207874298, + "epoch": 0.7199265381083563, + "grad_norm": 6.4375, + "learning_rate": 4.986304738420684e-06, + "loss": 1.7415186166763306, + "mean_token_accuracy": 0.6342501733452082, + "num_tokens": 340524.0, + "step": 49 + }, + { + "entropy": 1.472764991223812, + "epoch": 0.7346189164370982, + "grad_norm": 5.59375, + "learning_rate": 4.984541155140945e-06, + "loss": 1.6061500310897827, + "mean_token_accuracy": 0.6294202730059624, + "num_tokens": 347745.0, + "step": 50 + }, + { + "entropy": 1.453477706760168, + "epoch": 0.7493112947658402, + "grad_norm": 6.0, + "learning_rate": 4.982671142387316e-06, + "loss": 1.656001091003418, + "mean_token_accuracy": 0.6334934048354626, + "num_tokens": 354277.0, + "step": 51 + }, + { + "entropy": 1.5710610300302505, + "epoch": 0.7640036730945822, + "grad_norm": 7.8125, + "learning_rate": 4.980694780264918e-06, + "loss": 1.879157543182373, + "mean_token_accuracy": 0.6145644318312407, + "num_tokens": 361212.0, + "step": 52 + }, + { + "entropy": 1.5348852053284645, + "epoch": 0.7786960514233241, + "grad_norm": 5.46875, + "learning_rate": 4.978612153434527e-06, + "loss": 1.7147362232208252, + "mean_token_accuracy": 0.6237640716135502, + "num_tokens": 368602.0, + "step": 53 + }, + { + "entropy": 1.553167935460806, + "epoch": 0.7933884297520661, + "grad_norm": 5.40625, + "learning_rate": 4.976423351108943e-06, + "loss": 1.5548394918441772, + "mean_token_accuracy": 0.6278982330113649, + "num_tokens": 375209.0, + "step": 54 + }, + { + "entropy": 1.4203950092196465, + "epoch": 0.8080808080808081, + "grad_norm": 6.0, + "learning_rate": 4.974128467049177e-06, + "loss": 1.5801376104354858, + "mean_token_accuracy": 
0.6398332640528679, + "num_tokens": 381525.0, + "step": 55 + }, + { + "entropy": 1.5313997939229012, + "epoch": 0.82277318640955, + "grad_norm": 6.0, + "learning_rate": 4.971727599560418e-06, + "loss": 1.455580472946167, + "mean_token_accuracy": 0.6429965775460005, + "num_tokens": 386924.0, + "step": 56 + }, + { + "entropy": 1.2961117215454578, + "epoch": 0.837465564738292, + "grad_norm": 5.125, + "learning_rate": 4.9692208514878445e-06, + "loss": 1.2825771570205688, + "mean_token_accuracy": 0.6828609891235828, + "num_tokens": 393737.0, + "step": 57 + }, + { + "entropy": 1.4601662941277027, + "epoch": 0.852157943067034, + "grad_norm": 5.34375, + "learning_rate": 4.966608330212198e-06, + "loss": 1.4792706966400146, + "mean_token_accuracy": 0.6504263170063496, + "num_tokens": 400614.0, + "step": 58 + }, + { + "entropy": 1.5047795996069908, + "epoch": 0.8668503213957759, + "grad_norm": 5.3125, + "learning_rate": 4.963890147645195e-06, + "loss": 1.6770135164260864, + "mean_token_accuracy": 0.6391430255025625, + "num_tokens": 408341.0, + "step": 59 + }, + { + "entropy": 1.6163291186094284, + "epoch": 0.8815426997245179, + "grad_norm": 6.53125, + "learning_rate": 4.961066420224729e-06, + "loss": 1.6832696199417114, + "mean_token_accuracy": 0.6109438072890043, + "num_tokens": 413972.0, + "step": 60 + }, + { + "entropy": 1.3349718637764454, + "epoch": 0.8962350780532599, + "grad_norm": 5.1875, + "learning_rate": 4.958137268909887e-06, + "loss": 1.410157322883606, + "mean_token_accuracy": 0.6598032023757696, + "num_tokens": 420932.0, + "step": 61 + }, + { + "entropy": 1.623923797160387, + "epoch": 0.9109274563820018, + "grad_norm": 6.6875, + "learning_rate": 4.95510281917576e-06, + "loss": 1.634577751159668, + "mean_token_accuracy": 0.6326909828931093, + "num_tokens": 425816.0, + "step": 62 + }, + { + "entropy": 1.7040501534938812, + "epoch": 0.9256198347107438, + "grad_norm": 5.5, + "learning_rate": 4.9519632010080765e-06, + "loss": 1.8456324338912964, + 
"mean_token_accuracy": 0.6114997789263725, + "num_tokens": 432371.0, + "step": 63 + }, + { + "entropy": 1.3237117007374763, + "epoch": 0.9403122130394858, + "grad_norm": 5.03125, + "learning_rate": 4.9487185488976284e-06, + "loss": 1.4221746921539307, + "mean_token_accuracy": 0.6745892986655235, + "num_tokens": 439631.0, + "step": 64 + }, + { + "entropy": 1.5274745747447014, + "epoch": 0.9550045913682277, + "grad_norm": 4.78125, + "learning_rate": 4.9453690018345144e-06, + "loss": 1.7518843412399292, + "mean_token_accuracy": 0.6415467616170645, + "num_tokens": 447294.0, + "step": 65 + }, + { + "entropy": 1.6217340305447578, + "epoch": 0.9696969696969697, + "grad_norm": 5.90625, + "learning_rate": 4.941914703302181e-06, + "loss": 1.9085001945495605, + "mean_token_accuracy": 0.602933943271637, + "num_tokens": 453188.0, + "step": 66 + }, + { + "entropy": 1.620997928082943, + "epoch": 0.9843893480257117, + "grad_norm": 6.1875, + "learning_rate": 4.938355801271282e-06, + "loss": 1.7774909734725952, + "mean_token_accuracy": 0.6129048503935337, + "num_tokens": 458849.0, + "step": 67 + }, + { + "entropy": 1.656703669577837, + "epoch": 0.9990817263544536, + "grad_norm": 4.78125, + "learning_rate": 4.9346924481933345e-06, + "loss": 1.8143261671066284, + "mean_token_accuracy": 0.6097159385681152, + "num_tokens": 467166.0, + "step": 68 + }, + { + "entropy": 1.6068611145019531, + "epoch": 1.0, + "grad_norm": 12.75, + "learning_rate": 4.930924800994192e-06, + "loss": 1.6701020002365112, + "mean_token_accuracy": 0.673130214214325, + "num_tokens": 467528.0, + "step": 69 + }, + { + "entropy": 1.3061356097459793, + "epoch": 1.014692378328742, + "grad_norm": 4.6875, + "learning_rate": 4.927053021067321e-06, + "loss": 1.3863725662231445, + "mean_token_accuracy": 0.6834770254790783, + "num_tokens": 474488.0, + "step": 70 + }, + { + "entropy": 1.3934976756572723, + "epoch": 1.0293847566574839, + "grad_norm": 5.1875, + "learning_rate": 4.923077274266886e-06, + "loss": 1.356377124786377, 
+ "mean_token_accuracy": 0.6749065890908241, + "num_tokens": 480487.0, + "step": 71 + }, + { + "entropy": 1.2166254296898842, + "epoch": 1.044077134986226, + "grad_norm": 5.125, + "learning_rate": 4.91899773090065e-06, + "loss": 1.2729101181030273, + "mean_token_accuracy": 0.7036809120327234, + "num_tokens": 486565.0, + "step": 72 + }, + { + "entropy": 1.3788570798933506, + "epoch": 1.058769513314968, + "grad_norm": 4.46875, + "learning_rate": 4.914814565722671e-06, + "loss": 1.3389383554458618, + "mean_token_accuracy": 0.66689308360219, + "num_tokens": 494464.0, + "step": 73 + }, + { + "entropy": 1.4210151471197605, + "epoch": 1.0734618916437098, + "grad_norm": 5.21875, + "learning_rate": 4.9105279579258234e-06, + "loss": 1.5810019969940186, + "mean_token_accuracy": 0.6520100049674511, + "num_tokens": 502023.0, + "step": 74 + }, + { + "entropy": 1.3476647343486547, + "epoch": 1.0881542699724518, + "grad_norm": 5.125, + "learning_rate": 4.906138091134118e-06, + "loss": 1.4704662561416626, + "mean_token_accuracy": 0.6666250489652157, + "num_tokens": 508991.0, + "step": 75 + }, + { + "entropy": 1.715055987238884, + "epoch": 1.1028466483011938, + "grad_norm": 5.53125, + "learning_rate": 4.901645153394838e-06, + "loss": 1.7850326299667358, + "mean_token_accuracy": 0.6200868934392929, + "num_tokens": 515213.0, + "step": 76 + }, + { + "entropy": 1.33231370896101, + "epoch": 1.1175390266299357, + "grad_norm": 5.03125, + "learning_rate": 4.897049337170483e-06, + "loss": 1.4512275457382202, + "mean_token_accuracy": 0.6738582514226437, + "num_tokens": 522712.0, + "step": 77 + }, + { + "entropy": 1.430236928164959, + "epoch": 1.1322314049586777, + "grad_norm": 5.0625, + "learning_rate": 4.8923508393305224e-06, + "loss": 1.4064946174621582, + "mean_token_accuracy": 0.668969176709652, + "num_tokens": 529191.0, + "step": 78 + }, + { + "entropy": 1.3905731923878193, + "epoch": 1.1469237832874197, + "grad_norm": 4.59375, + "learning_rate": 4.887549861142967e-06, + "loss": 
1.507306694984436, + "mean_token_accuracy": 0.6702582351863384, + "num_tokens": 538221.0, + "step": 79 + }, + { + "entropy": 1.6136963441967964, + "epoch": 1.1616161616161615, + "grad_norm": 5.875, + "learning_rate": 4.882646608265743e-06, + "loss": 1.8179521560668945, + "mean_token_accuracy": 0.6243273708969355, + "num_tokens": 544126.0, + "step": 80 + }, + { + "entropy": 1.3634940460324287, + "epoch": 1.1763085399449036, + "grad_norm": 5.4375, + "learning_rate": 4.8776412907378845e-06, + "loss": 1.4865293502807617, + "mean_token_accuracy": 0.6671657040715218, + "num_tokens": 550605.0, + "step": 81 + }, + { + "entropy": 1.4086507372558117, + "epoch": 1.1910009182736456, + "grad_norm": 5.59375, + "learning_rate": 4.872534122970536e-06, + "loss": 1.4881434440612793, + "mean_token_accuracy": 0.6586938947439194, + "num_tokens": 557623.0, + "step": 82 + }, + { + "entropy": 1.5293696448206902, + "epoch": 1.2056932966023874, + "grad_norm": 4.875, + "learning_rate": 4.867325323737765e-06, + "loss": 1.5523103475570679, + "mean_token_accuracy": 0.6492738723754883, + "num_tokens": 564872.0, + "step": 83 + }, + { + "entropy": 1.5812500715255737, + "epoch": 1.2203856749311295, + "grad_norm": 5.78125, + "learning_rate": 4.862015116167195e-06, + "loss": 1.5752160549163818, + "mean_token_accuracy": 0.6451857574284077, + "num_tokens": 570930.0, + "step": 84 + }, + { + "entropy": 1.5821347422897816, + "epoch": 1.2350780532598715, + "grad_norm": 5.78125, + "learning_rate": 4.856603727730446e-06, + "loss": 1.615751028060913, + "mean_token_accuracy": 0.6259971559047699, + "num_tokens": 577356.0, + "step": 85 + }, + { + "entropy": 1.2754014991223812, + "epoch": 1.2497704315886133, + "grad_norm": 5.59375, + "learning_rate": 4.8510913902333876e-06, + "loss": 1.2676167488098145, + "mean_token_accuracy": 0.6968142054975033, + "num_tokens": 584015.0, + "step": 86 + }, + { + "entropy": 1.4495088569819927, + "epoch": 1.2644628099173554, + "grad_norm": 6.78125, + "learning_rate": 
4.845478339806211e-06, + "loss": 1.69454026222229, + "mean_token_accuracy": 0.6445063762366772, + "num_tokens": 590581.0, + "step": 87 + }, + { + "entropy": 1.3460834063589573, + "epoch": 1.2791551882460974, + "grad_norm": 5.4375, + "learning_rate": 4.839764816893315e-06, + "loss": 1.3651117086410522, + "mean_token_accuracy": 0.6775182671844959, + "num_tokens": 596816.0, + "step": 88 + }, + { + "entropy": 1.2347406335175037, + "epoch": 1.2938475665748392, + "grad_norm": 4.84375, + "learning_rate": 4.833951066243004e-06, + "loss": 1.2977527379989624, + "mean_token_accuracy": 0.6960192620754242, + "num_tokens": 604695.0, + "step": 89 + }, + { + "entropy": 1.3695692755281925, + "epoch": 1.3085399449035813, + "grad_norm": 5.75, + "learning_rate": 4.828037336897009e-06, + "loss": 1.640378475189209, + "mean_token_accuracy": 0.6644081249833107, + "num_tokens": 611293.0, + "step": 90 + }, + { + "entropy": 1.4080765470862389, + "epoch": 1.3232323232323233, + "grad_norm": 5.53125, + "learning_rate": 4.822023882179811e-06, + "loss": 1.437829852104187, + "mean_token_accuracy": 0.6620082408189774, + "num_tokens": 618241.0, + "step": 91 + }, + { + "entropy": 1.4261119589209557, + "epoch": 1.3379247015610651, + "grad_norm": 6.75, + "learning_rate": 4.815910959687795e-06, + "loss": 1.401389479637146, + "mean_token_accuracy": 0.6730383820831776, + "num_tokens": 623399.0, + "step": 92 + }, + { + "entropy": 1.444845873862505, + "epoch": 1.3526170798898072, + "grad_norm": 5.5, + "learning_rate": 4.809698831278217e-06, + "loss": 1.4253414869308472, + "mean_token_accuracy": 0.6722631379961967, + "num_tokens": 630189.0, + "step": 93 + }, + { + "entropy": 1.5948089621961117, + "epoch": 1.367309458218549, + "grad_norm": 5.53125, + "learning_rate": 4.803387763057981e-06, + "loss": 1.7228381633758545, + "mean_token_accuracy": 0.624824620783329, + "num_tokens": 637677.0, + "step": 94 + }, + { + "entropy": 1.4382336772978306, + "epoch": 1.382001836547291, + "grad_norm": 5.65625, + 
"learning_rate": 4.796978025372247e-06, + "loss": 1.484128475189209, + "mean_token_accuracy": 0.6511365957558155, + "num_tokens": 644602.0, + "step": 95 + }, + { + "entropy": 1.4599979110062122, + "epoch": 1.396694214876033, + "grad_norm": 5.65625, + "learning_rate": 4.79046989279284e-06, + "loss": 1.4486807584762573, + "mean_token_accuracy": 0.6637902185320854, + "num_tokens": 651044.0, + "step": 96 + }, + { + "entropy": 1.4183036126196384, + "epoch": 1.4113865932047749, + "grad_norm": 5.75, + "learning_rate": 4.783863644106502e-06, + "loss": 1.427507996559143, + "mean_token_accuracy": 0.6631320789456367, + "num_tokens": 657609.0, + "step": 97 + }, + { + "entropy": 1.36999873816967, + "epoch": 1.426078971533517, + "grad_norm": 5.21875, + "learning_rate": 4.77715956230294e-06, + "loss": 1.3143718242645264, + "mean_token_accuracy": 0.6901484504342079, + "num_tokens": 664907.0, + "step": 98 + }, + { + "entropy": 1.4660401456058025, + "epoch": 1.440771349862259, + "grad_norm": 5.0625, + "learning_rate": 4.770357934562704e-06, + "loss": 1.449824333190918, + "mean_token_accuracy": 0.6540784798562527, + "num_tokens": 672931.0, + "step": 99 + }, + { + "entropy": 1.4816335625946522, + "epoch": 1.4554637281910008, + "grad_norm": 5.6875, + "learning_rate": 4.7634590522448886e-06, + "loss": 1.5329768657684326, + "mean_token_accuracy": 0.6551914289593697, + "num_tokens": 679477.0, + "step": 100 + }, + { + "entropy": 1.7057891301810741, + "epoch": 1.4701561065197428, + "grad_norm": 5.28125, + "learning_rate": 4.7564632108746524e-06, + "loss": 1.9816747903823853, + "mean_token_accuracy": 0.5978529918938875, + "num_tokens": 687941.0, + "step": 101 + }, + { + "entropy": 1.5502509884536266, + "epoch": 1.4848484848484849, + "grad_norm": 4.78125, + "learning_rate": 4.7493707101305545e-06, + "loss": 1.7086496353149414, + "mean_token_accuracy": 0.6390546467155218, + "num_tokens": 696935.0, + "step": 102 + }, + { + "entropy": 1.5186870731413364, + "epoch": 1.4995408631772267, + 
"grad_norm": 5.5, + "learning_rate": 4.742181853831721e-06, + "loss": 1.626766324043274, + "mean_token_accuracy": 0.6448409371078014, + "num_tokens": 703847.0, + "step": 103 + }, + { + "entropy": 1.652857031673193, + "epoch": 1.514233241505969, + "grad_norm": 5.9375, + "learning_rate": 4.734896949924831e-06, + "loss": 1.8684097528457642, + "mean_token_accuracy": 0.6228403430432081, + "num_tokens": 709915.0, + "step": 104 + }, + { + "entropy": 1.3462006263434887, + "epoch": 1.5289256198347108, + "grad_norm": 6.5, + "learning_rate": 4.72751631047092e-06, + "loss": 1.3426733016967773, + "mean_token_accuracy": 0.696257971227169, + "num_tokens": 714957.0, + "step": 105 + }, + { + "entropy": 1.5189800672233105, + "epoch": 1.5436179981634526, + "grad_norm": 6.03125, + "learning_rate": 4.720040251632019e-06, + "loss": 1.5830191373825073, + "mean_token_accuracy": 0.6607260629534721, + "num_tokens": 720412.0, + "step": 106 + }, + { + "entropy": 1.3767579533159733, + "epoch": 1.5583103764921948, + "grad_norm": 4.65625, + "learning_rate": 4.712469093657605e-06, + "loss": 1.5013158321380615, + "mean_token_accuracy": 0.6755428463220596, + "num_tokens": 728047.0, + "step": 107 + }, + { + "entropy": 1.2227111794054508, + "epoch": 1.5730027548209367, + "grad_norm": 4.71875, + "learning_rate": 4.704803160870888e-06, + "loss": 1.4295673370361328, + "mean_token_accuracy": 0.6972721088677645, + "num_tokens": 735231.0, + "step": 108 + }, + { + "entropy": 1.4339848309755325, + "epoch": 1.5876951331496785, + "grad_norm": 4.84375, + "learning_rate": 4.697042781654913e-06, + "loss": 1.5147660970687866, + "mean_token_accuracy": 0.6610862240195274, + "num_tokens": 742143.0, + "step": 109 + }, + { + "entropy": 1.7120601199567318, + "epoch": 1.6023875114784207, + "grad_norm": 4.875, + "learning_rate": 4.6891882884384994e-06, + "loss": 1.8835252523422241, + "mean_token_accuracy": 0.6148366816341877, + "num_tokens": 748878.0, + "step": 110 + }, + { + "entropy": 1.366846838966012, + "epoch": 
1.6170798898071626, + "grad_norm": 4.75, + "learning_rate": 4.681240017681994e-06, + "loss": 1.314517617225647, + "mean_token_accuracy": 0.6879813298583031, + "num_tokens": 755068.0, + "step": 111 + }, + { + "entropy": 1.5042930357158184, + "epoch": 1.6317722681359044, + "grad_norm": 4.375, + "learning_rate": 4.67319830986286e-06, + "loss": 1.4018523693084717, + "mean_token_accuracy": 0.6578907147049904, + "num_tokens": 762233.0, + "step": 112 + }, + { + "entropy": 1.2661687284708023, + "epoch": 1.6464646464646466, + "grad_norm": 4.65625, + "learning_rate": 4.665063509461098e-06, + "loss": 1.2542436122894287, + "mean_token_accuracy": 0.698942206799984, + "num_tokens": 768426.0, + "step": 113 + }, + { + "entropy": 1.402594517916441, + "epoch": 1.6611570247933884, + "grad_norm": 4.71875, + "learning_rate": 4.65683596494448e-06, + "loss": 1.4597134590148926, + "mean_token_accuracy": 0.6789544485509396, + "num_tokens": 775863.0, + "step": 114 + }, + { + "entropy": 1.3753325566649437, + "epoch": 1.6758494031221303, + "grad_norm": 4.4375, + "learning_rate": 4.648516028753632e-06, + "loss": 1.4206632375717163, + "mean_token_accuracy": 0.6716446243226528, + "num_tokens": 782179.0, + "step": 115 + }, + { + "entropy": 1.3194672428071499, + "epoch": 1.6905417814508723, + "grad_norm": 3.765625, + "learning_rate": 4.6401040572869295e-06, + "loss": 1.3911067247390747, + "mean_token_accuracy": 0.6902661826461554, + "num_tokens": 789999.0, + "step": 116 + }, + { + "entropy": 1.1185118220746517, + "epoch": 1.7052341597796143, + "grad_norm": 3.375, + "learning_rate": 4.631600410885231e-06, + "loss": 1.1200969219207764, + "mean_token_accuracy": 0.7202032878994942, + "num_tokens": 797816.0, + "step": 117 + }, + { + "entropy": 1.6263831928372383, + "epoch": 1.7199265381083562, + "grad_norm": 4.6875, + "learning_rate": 4.623005453816447e-06, + "loss": 1.7719974517822266, + "mean_token_accuracy": 0.6271817404776812, + "num_tokens": 803655.0, + "step": 118 + }, + { + "entropy": 
1.5183000564575195, + "epoch": 1.7346189164370982, + "grad_norm": 4.875, + "learning_rate": 4.614319554259934e-06, + "loss": 1.5544443130493164, + "mean_token_accuracy": 0.657878614962101, + "num_tokens": 809724.0, + "step": 119 + }, + { + "entropy": 1.6561345160007477, + "epoch": 1.7493112947658402, + "grad_norm": 4.09375, + "learning_rate": 4.605543084290716e-06, + "loss": 1.8985499143600464, + "mean_token_accuracy": 0.6286845244467258, + "num_tokens": 815942.0, + "step": 120 + }, + { + "entropy": 1.3539353236556053, + "epoch": 1.764003673094582, + "grad_norm": 3.703125, + "learning_rate": 4.596676419863561e-06, + "loss": 1.453987956047058, + "mean_token_accuracy": 0.6819523572921753, + "num_tokens": 823392.0, + "step": 121 + }, + { + "entropy": 1.341381138190627, + "epoch": 1.778696051423324, + "grad_norm": 3.609375, + "learning_rate": 4.587719940796858e-06, + "loss": 1.3191108703613281, + "mean_token_accuracy": 0.6868677549064159, + "num_tokens": 830069.0, + "step": 122 + }, + { + "entropy": 1.5383460223674774, + "epoch": 1.7933884297520661, + "grad_norm": 4.1875, + "learning_rate": 4.578674030756364e-06, + "loss": 1.5670809745788574, + "mean_token_accuracy": 0.6535362396389246, + "num_tokens": 836019.0, + "step": 123 + }, + { + "entropy": 1.517351869493723, + "epoch": 1.808080808080808, + "grad_norm": 4.0625, + "learning_rate": 4.569539077238756e-06, + "loss": 1.5154145956039429, + "mean_token_accuracy": 0.6488186921924353, + "num_tokens": 842399.0, + "step": 124 + }, + { + "entropy": 1.4582207016646862, + "epoch": 1.82277318640955, + "grad_norm": 4.28125, + "learning_rate": 4.560315471555039e-06, + "loss": 1.5142159461975098, + "mean_token_accuracy": 0.6643350049853325, + "num_tokens": 848509.0, + "step": 125 + }, + { + "entropy": 1.8213122673332691, + "epoch": 1.837465564738292, + "grad_norm": 4.375, + "learning_rate": 4.551003608813784e-06, + "loss": 1.9438310861587524, + "mean_token_accuracy": 0.6077302508056164, + "num_tokens": 854624.0, + "step": 126 + 
}, + { + "entropy": 1.2630420215427876, + "epoch": 1.8521579430670339, + "grad_norm": 3.484375, + "learning_rate": 4.541603887904198e-06, + "loss": 1.3397103548049927, + "mean_token_accuracy": 0.6940340362489223, + "num_tokens": 862970.0, + "step": 127 + }, + { + "entropy": 1.521993912756443, + "epoch": 1.866850321395776, + "grad_norm": 3.734375, + "learning_rate": 4.532116711479039e-06, + "loss": 1.649178147315979, + "mean_token_accuracy": 0.6439413316547871, + "num_tokens": 870524.0, + "step": 128 + }, + { + "epoch": 1.866850321395776, + "eval_entropy": 1.3699656426906586, + "eval_loss": 1.4730713367462158, + "eval_mean_token_accuracy": 0.6691670566797256, + "eval_num_tokens": 870524.0, + "eval_runtime": 1.6753, + "eval_samples_per_second": 34.621, + "eval_steps_per_second": 4.775, + "step": 128 + }, + { + "entropy": 1.1558082550764084, + "epoch": 1.881542699724518, + "grad_norm": 3.921875, + "learning_rate": 4.522542485937369e-06, + "loss": 1.233929991722107, + "mean_token_accuracy": 0.7151609733700752, + "num_tokens": 878419.0, + "step": 129 + }, + { + "entropy": 1.4429849721491337, + "epoch": 1.8962350780532597, + "grad_norm": 3.625, + "learning_rate": 4.512881621407146e-06, + "loss": 1.3959054946899414, + "mean_token_accuracy": 0.6707526985555887, + "num_tokens": 885787.0, + "step": 130 + }, + { + "entropy": 1.315425992012024, + "epoch": 1.9109274563820018, + "grad_norm": 3.546875, + "learning_rate": 4.503134531727652e-06, + "loss": 1.2897546291351318, + "mean_token_accuracy": 0.6948176473379135, + "num_tokens": 891849.0, + "step": 131 + }, + { + "entropy": 1.3664759285748005, + "epoch": 1.9256198347107438, + "grad_norm": 4.03125, + "learning_rate": 4.493301634431768e-06, + "loss": 1.2926387786865234, + "mean_token_accuracy": 0.6821446903049946, + "num_tokens": 898855.0, + "step": 132 + }, + { + "entropy": 1.4670868627727032, + "epoch": 1.9403122130394856, + "grad_norm": 3.578125, + "learning_rate": 4.4833833507280884e-06, + "loss": 1.7085148096084595, + 
"mean_token_accuracy": 0.6683255881071091, + "num_tokens": 906434.0, + "step": 133 + }, + { + "entropy": 1.323768761008978, + "epoch": 1.9550045913682277, + "grad_norm": 4.0, + "learning_rate": 4.473380105482875e-06, + "loss": 1.2290809154510498, + "mean_token_accuracy": 0.695992011576891, + "num_tokens": 911789.0, + "step": 134 + }, + { + "entropy": 1.228843528777361, + "epoch": 1.9696969696969697, + "grad_norm": 3.6875, + "learning_rate": 4.463292327201862e-06, + "loss": 1.2656970024108887, + "mean_token_accuracy": 0.7151234671473503, + "num_tokens": 918845.0, + "step": 135 + }, + { + "entropy": 1.5310376584529877, + "epoch": 1.9843893480257115, + "grad_norm": 3.703125, + "learning_rate": 4.453120448011897e-06, + "loss": 1.666202425956726, + "mean_token_accuracy": 0.6378280259668827, + "num_tokens": 926751.0, + "step": 136 + }, + { + "entropy": 1.4586737379431725, + "epoch": 1.9990817263544536, + "grad_norm": 3.4375, + "learning_rate": 4.442864903642428e-06, + "loss": 1.6007022857666016, + "mean_token_accuracy": 0.6548678297549486, + "num_tokens": 934942.0, + "step": 137 + }, + { + "entropy": 1.197638750076294, + "epoch": 2.0, + "grad_norm": 24.625, + "learning_rate": 4.432526133406843e-06, + "loss": 1.1470692157745361, + "mean_token_accuracy": 0.7256637215614319, + "num_tokens": 935056.0, + "step": 138 + }, + { + "entropy": 1.3461281508207321, + "epoch": 2.014692378328742, + "grad_norm": 3.875, + "learning_rate": 4.422104580183649e-06, + "loss": 1.4025999307632446, + "mean_token_accuracy": 0.6926173456013203, + "num_tokens": 941229.0, + "step": 139 + }, + { + "entropy": 1.6541437543928623, + "epoch": 2.029384756657484, + "grad_norm": 4.09375, + "learning_rate": 4.4116006903975015e-06, + "loss": 1.6707371473312378, + "mean_token_accuracy": 0.6277699284255505, + "num_tokens": 948566.0, + "step": 140 + }, + { + "entropy": 1.2840190175920725, + "epoch": 2.044077134986226, + "grad_norm": 3.46875, + "learning_rate": 4.401014914000078e-06, + "loss": 1.2021766901016235, 
+ "mean_token_accuracy": 0.6986711807549, + "num_tokens": 954773.0, + "step": 141 + }, + { + "entropy": 1.4697693847119808, + "epoch": 2.0587695133149677, + "grad_norm": 3.8125, + "learning_rate": 4.3903477044508066e-06, + "loss": 1.5293481349945068, + "mean_token_accuracy": 0.6505694799125195, + "num_tokens": 961834.0, + "step": 142 + }, + { + "entropy": 1.5127912238240242, + "epoch": 2.07346189164371, + "grad_norm": 3.8125, + "learning_rate": 4.379599518697444e-06, + "loss": 1.735708236694336, + "mean_token_accuracy": 0.6535333096981049, + "num_tokens": 968542.0, + "step": 143 + }, + { + "entropy": 1.3300835229456425, + "epoch": 2.088154269972452, + "grad_norm": 3.40625, + "learning_rate": 4.368770817156493e-06, + "loss": 1.3301336765289307, + "mean_token_accuracy": 0.6858880035579205, + "num_tokens": 976389.0, + "step": 144 + }, + { + "entropy": 1.2104117833077908, + "epoch": 2.1028466483011936, + "grad_norm": 3.296875, + "learning_rate": 4.357862063693486e-06, + "loss": 1.1409953832626343, + "mean_token_accuracy": 0.7150565646588802, + "num_tokens": 983299.0, + "step": 145 + }, + { + "entropy": 1.3603767342865467, + "epoch": 2.117539026629936, + "grad_norm": 3.5, + "learning_rate": 4.3468737256031155e-06, + "loss": 1.3795615434646606, + "mean_token_accuracy": 0.6860510632395744, + "num_tokens": 989539.0, + "step": 146 + }, + { + "entropy": 1.4871582835912704, + "epoch": 2.1322314049586777, + "grad_norm": 3.359375, + "learning_rate": 4.335806273589214e-06, + "loss": 1.573998212814331, + "mean_token_accuracy": 0.651427399367094, + "num_tokens": 997548.0, + "step": 147 + }, + { + "entropy": 1.4681177996098995, + "epoch": 2.1469237832874195, + "grad_norm": 4.125, + "learning_rate": 4.324660181744589e-06, + "loss": 1.5126701593399048, + "mean_token_accuracy": 0.6612453535199165, + "num_tokens": 1002910.0, + "step": 148 + }, + { + "entropy": 1.3668258637189865, + "epoch": 2.1616161616161618, + "grad_norm": 3.9375, + "learning_rate": 4.313435927530719e-06, + "loss": 
1.3531981706619263, + "mean_token_accuracy": 0.6994708813726902, + "num_tokens": 1008967.0, + "step": 149 + }, + { + "entropy": 1.478852679952979, + "epoch": 2.1763085399449036, + "grad_norm": 3.59375, + "learning_rate": 4.3021339917572975e-06, + "loss": 1.4089787006378174, + "mean_token_accuracy": 0.6642785873264074, + "num_tokens": 1015107.0, + "step": 150 + }, + { + "entropy": 1.364537600427866, + "epoch": 2.1910009182736454, + "grad_norm": 3.734375, + "learning_rate": 4.290754858561636e-06, + "loss": 1.4817034006118774, + "mean_token_accuracy": 0.6840209178626537, + "num_tokens": 1022757.0, + "step": 151 + }, + { + "entropy": 1.5241572260856628, + "epoch": 2.2056932966023877, + "grad_norm": 3.9375, + "learning_rate": 4.2792990153879286e-06, + "loss": 1.6508989334106445, + "mean_token_accuracy": 0.6462781187146902, + "num_tokens": 1030034.0, + "step": 152 + }, + { + "entropy": 1.4099011793732643, + "epoch": 2.2203856749311295, + "grad_norm": 3.484375, + "learning_rate": 4.267766952966369e-06, + "loss": 1.361659288406372, + "mean_token_accuracy": 0.6699612885713577, + "num_tokens": 1036957.0, + "step": 153 + }, + { + "entropy": 1.5446589030325413, + "epoch": 2.2350780532598713, + "grad_norm": 3.671875, + "learning_rate": 4.25615916529213e-06, + "loss": 1.724553108215332, + "mean_token_accuracy": 0.6526541039347649, + "num_tokens": 1043955.0, + "step": 154 + }, + { + "entropy": 1.2551699057221413, + "epoch": 2.2497704315886136, + "grad_norm": 3.71875, + "learning_rate": 4.244476149604201e-06, + "loss": 1.2087091207504272, + "mean_token_accuracy": 0.6985193602740765, + "num_tokens": 1050087.0, + "step": 155 + }, + { + "entropy": 1.414844986051321, + "epoch": 2.2644628099173554, + "grad_norm": 3.890625, + "learning_rate": 4.2327184063640905e-06, + "loss": 1.4239919185638428, + "mean_token_accuracy": 0.6687978152185678, + "num_tokens": 1056578.0, + "step": 156 + }, + { + "entropy": 1.3670090530067682, + "epoch": 2.279155188246097, + "grad_norm": 3.34375, + 
"learning_rate": 4.220886439234385e-06, + "loss": 1.4656591415405273, + "mean_token_accuracy": 0.6766840294003487, + "num_tokens": 1063625.0, + "step": 157 + }, + { + "entropy": 1.3662168271839619, + "epoch": 2.2938475665748395, + "grad_norm": 3.625, + "learning_rate": 4.2089807550571786e-06, + "loss": 1.3282774686813354, + "mean_token_accuracy": 0.6898504607379436, + "num_tokens": 1070304.0, + "step": 158 + }, + { + "entropy": 1.39240912348032, + "epoch": 2.3085399449035813, + "grad_norm": 3.6875, + "learning_rate": 4.197001863832355e-06, + "loss": 1.341300368309021, + "mean_token_accuracy": 0.6747097820043564, + "num_tokens": 1076203.0, + "step": 159 + }, + { + "entropy": 1.2145243491977453, + "epoch": 2.323232323232323, + "grad_norm": 3.03125, + "learning_rate": 4.184950278695745e-06, + "loss": 1.2274876832962036, + "mean_token_accuracy": 0.7132319957017899, + "num_tokens": 1084492.0, + "step": 160 + }, + { + "entropy": 1.4162307903170586, + "epoch": 2.3379247015610654, + "grad_norm": 3.90625, + "learning_rate": 4.172826515897146e-06, + "loss": 1.4108020067214966, + "mean_token_accuracy": 0.6850063428282738, + "num_tokens": 1090620.0, + "step": 161 + }, + { + "entropy": 1.5210106298327446, + "epoch": 2.352617079889807, + "grad_norm": 3.84375, + "learning_rate": 4.160631094778205e-06, + "loss": 1.6292668581008911, + "mean_token_accuracy": 0.6409319471567869, + "num_tokens": 1096968.0, + "step": 162 + }, + { + "entropy": 1.3625940009951591, + "epoch": 2.367309458218549, + "grad_norm": 3.484375, + "learning_rate": 4.1483645377501726e-06, + "loss": 1.4572113752365112, + "mean_token_accuracy": 0.6829789131879807, + "num_tokens": 1104565.0, + "step": 163 + }, + { + "entropy": 1.3159149996936321, + "epoch": 2.3820018365472913, + "grad_norm": 3.578125, + "learning_rate": 4.136027370271526e-06, + "loss": 1.4974924325942993, + "mean_token_accuracy": 0.688617680221796, + "num_tokens": 1111651.0, + "step": 164 + }, + { + "entropy": 1.2680894508957863, + "epoch": 
2.396694214876033, + "grad_norm": 3.265625, + "learning_rate": 4.123620120825459e-06, + "loss": 1.19207763671875, + "mean_token_accuracy": 0.7037702575325966, + "num_tokens": 1118713.0, + "step": 165 + }, + { + "entropy": 1.471781674772501, + "epoch": 2.411386593204775, + "grad_norm": 4.84375, + "learning_rate": 4.111143320897244e-06, + "loss": 1.400710105895996, + "mean_token_accuracy": 0.6655568517744541, + "num_tokens": 1123776.0, + "step": 166 + }, + { + "entropy": 1.3692159168422222, + "epoch": 2.426078971533517, + "grad_norm": 3.578125, + "learning_rate": 4.098597504951462e-06, + "loss": 1.358292579650879, + "mean_token_accuracy": 0.6901015266776085, + "num_tokens": 1130383.0, + "step": 167 + }, + { + "entropy": 1.4371467269957066, + "epoch": 2.440771349862259, + "grad_norm": 4.40625, + "learning_rate": 4.085983210409114e-06, + "loss": 1.5146667957305908, + "mean_token_accuracy": 0.6645361706614494, + "num_tokens": 1137520.0, + "step": 168 + }, + { + "entropy": 1.2199561521410942, + "epoch": 2.455463728191001, + "grad_norm": 3.515625, + "learning_rate": 4.073300977624594e-06, + "loss": 1.229698657989502, + "mean_token_accuracy": 0.701645290479064, + "num_tokens": 1144374.0, + "step": 169 + }, + { + "entropy": 1.3611398451030254, + "epoch": 2.470156106519743, + "grad_norm": 3.578125, + "learning_rate": 4.060551349862545e-06, + "loss": 1.5089250802993774, + "mean_token_accuracy": 0.6758165024220943, + "num_tokens": 1152178.0, + "step": 170 + }, + { + "entropy": 1.3143669590353966, + "epoch": 2.484848484848485, + "grad_norm": 3.875, + "learning_rate": 4.047734873274586e-06, + "loss": 1.3569583892822266, + "mean_token_accuracy": 0.6815139781683683, + "num_tokens": 1158215.0, + "step": 171 + }, + { + "entropy": 1.4629556462168694, + "epoch": 2.4995408631772267, + "grad_norm": 3.03125, + "learning_rate": 4.034852096875917e-06, + "loss": 1.4647691249847412, + "mean_token_accuracy": 0.6731207054108381, + "num_tokens": 1165732.0, + "step": 172 + }, + { + "entropy": 
1.2865781262516975, + "epoch": 2.514233241505969, + "grad_norm": 3.140625, + "learning_rate": 4.021903572521802e-06, + "loss": 1.2602851390838623, + "mean_token_accuracy": 0.6870401427149773, + "num_tokens": 1173656.0, + "step": 173 + }, + { + "entropy": 1.2481586299836636, + "epoch": 2.5289256198347108, + "grad_norm": 4.0625, + "learning_rate": 4.0088898548839285e-06, + "loss": 1.4004302024841309, + "mean_token_accuracy": 0.7000849805772305, + "num_tokens": 1180446.0, + "step": 174 + }, + { + "entropy": 1.4398693591356277, + "epoch": 2.5436179981634526, + "grad_norm": 3.953125, + "learning_rate": 3.995811501426648e-06, + "loss": 1.4066277742385864, + "mean_token_accuracy": 0.6720283292233944, + "num_tokens": 1186431.0, + "step": 175 + }, + { + "entropy": 1.5063546746969223, + "epoch": 2.558310376492195, + "grad_norm": 4.15625, + "learning_rate": 3.982669072383093e-06, + "loss": 1.59934663772583, + "mean_token_accuracy": 0.6534310914576054, + "num_tokens": 1193242.0, + "step": 176 + }, + { + "entropy": 1.3044538162648678, + "epoch": 2.5730027548209367, + "grad_norm": 3.4375, + "learning_rate": 3.969463130731183e-06, + "loss": 1.319208025932312, + "mean_token_accuracy": 0.6941639743745327, + "num_tokens": 1200529.0, + "step": 177 + }, + { + "entropy": 1.3389928713440895, + "epoch": 2.5876951331496785, + "grad_norm": 3.53125, + "learning_rate": 3.956194242169506e-06, + "loss": 1.21915864944458, + "mean_token_accuracy": 0.696018248796463, + "num_tokens": 1206836.0, + "step": 178 + }, + { + "entropy": 1.424593310803175, + "epoch": 2.6023875114784207, + "grad_norm": 3.796875, + "learning_rate": 3.942862975093085e-06, + "loss": 1.4455430507659912, + "mean_token_accuracy": 0.6746221072971821, + "num_tokens": 1212825.0, + "step": 179 + }, + { + "entropy": 1.3970806077122688, + "epoch": 2.6170798898071626, + "grad_norm": 3.8125, + "learning_rate": 3.929469900569031e-06, + "loss": 1.4156700372695923, + "mean_token_accuracy": 0.6885317526757717, + "num_tokens": 1218891.0, + 
"step": 180 + }, + { + "entropy": 1.4365531019866467, + "epoch": 2.6317722681359044, + "grad_norm": 3.65625, + "learning_rate": 3.916015592312083e-06, + "loss": 1.4302277565002441, + "mean_token_accuracy": 0.677625959739089, + "num_tokens": 1225720.0, + "step": 181 + }, + { + "entropy": 1.4176970534026623, + "epoch": 2.6464646464646466, + "grad_norm": 3.75, + "learning_rate": 3.902500626660025e-06, + "loss": 1.4448661804199219, + "mean_token_accuracy": 0.6666108258068562, + "num_tokens": 1232834.0, + "step": 182 + }, + { + "entropy": 1.481460090726614, + "epoch": 2.6611570247933884, + "grad_norm": 3.390625, + "learning_rate": 3.888925582549006e-06, + "loss": 1.395293951034546, + "mean_token_accuracy": 0.6644096374511719, + "num_tokens": 1240900.0, + "step": 183 + }, + { + "entropy": 1.2771461643278599, + "epoch": 2.6758494031221303, + "grad_norm": 3.765625, + "learning_rate": 3.875291041488734e-06, + "loss": 1.2739803791046143, + "mean_token_accuracy": 0.6979184821248055, + "num_tokens": 1247930.0, + "step": 184 + }, + { + "entropy": 1.484820794314146, + "epoch": 2.6905417814508725, + "grad_norm": 3.921875, + "learning_rate": 3.861597587537568e-06, + "loss": 1.5094577074050903, + "mean_token_accuracy": 0.6664229705929756, + "num_tokens": 1254535.0, + "step": 185 + }, + { + "entropy": 1.3980318270623684, + "epoch": 2.7052341597796143, + "grad_norm": 3.21875, + "learning_rate": 3.847845807277501e-06, + "loss": 1.3485972881317139, + "mean_token_accuracy": 0.6803864054381847, + "num_tokens": 1261710.0, + "step": 186 + }, + { + "entropy": 1.429235778748989, + "epoch": 2.719926538108356, + "grad_norm": 3.625, + "learning_rate": 3.83403628978903e-06, + "loss": 1.463432788848877, + "mean_token_accuracy": 0.6720171179622412, + "num_tokens": 1268676.0, + "step": 187 + }, + { + "entropy": 1.2592237815260887, + "epoch": 2.734618916437098, + "grad_norm": 3.1875, + "learning_rate": 3.82016962662592e-06, + "loss": 1.3745115995407104, + "mean_token_accuracy": 0.7008189521729946, + 
"num_tokens": 1276737.0, + "step": 188 + }, + { + "entropy": 1.5628399066627026, + "epoch": 2.7493112947658402, + "grad_norm": 3.71875, + "learning_rate": 3.806246411789872e-06, + "loss": 1.67648446559906, + "mean_token_accuracy": 0.6497368421405554, + "num_tokens": 1283918.0, + "step": 189 + }, + { + "entropy": 1.4043805077672005, + "epoch": 2.764003673094582, + "grad_norm": 3.453125, + "learning_rate": 3.7922672417050687e-06, + "loss": 1.4935755729675293, + "mean_token_accuracy": 0.6647081095725298, + "num_tokens": 1291464.0, + "step": 190 + }, + { + "entropy": 1.4052232280373573, + "epoch": 2.7786960514233243, + "grad_norm": 3.671875, + "learning_rate": 3.77823271519263e-06, + "loss": 1.4137238264083862, + "mean_token_accuracy": 0.6810254417359829, + "num_tokens": 1298358.0, + "step": 191 + }, + { + "entropy": 1.2509154602885246, + "epoch": 2.793388429752066, + "grad_norm": 3.125, + "learning_rate": 3.764143433444962e-06, + "loss": 1.3093594312667847, + "mean_token_accuracy": 0.7068472765386105, + "num_tokens": 1306536.0, + "step": 192 + }, + { + "entropy": 1.424871776252985, + "epoch": 2.808080808080808, + "grad_norm": 4.21875, + "learning_rate": 3.7500000000000005e-06, + "loss": 1.388112187385559, + "mean_token_accuracy": 0.675894346088171, + "num_tokens": 1312053.0, + "step": 193 + }, + { + "entropy": 1.4783972389996052, + "epoch": 2.8227731864095498, + "grad_norm": 3.953125, + "learning_rate": 3.735803020715362e-06, + "loss": 1.4788545370101929, + "mean_token_accuracy": 0.6643631141632795, + "num_tokens": 1317701.0, + "step": 194 + }, + { + "entropy": 1.5499090813100338, + "epoch": 2.837465564738292, + "grad_norm": 4.1875, + "learning_rate": 3.721553103742388e-06, + "loss": 1.5269482135772705, + "mean_token_accuracy": 0.6374738682061434, + "num_tokens": 1324429.0, + "step": 195 + }, + { + "entropy": 1.4945064820349216, + "epoch": 2.852157943067034, + "grad_norm": 4.0, + "learning_rate": 3.7072508595000935e-06, + "loss": 1.5595450401306152, + 
"mean_token_accuracy": 0.6467197947204113, + "num_tokens": 1330791.0, + "step": 196 + }, + { + "entropy": 1.3186984993517399, + "epoch": 2.866850321395776, + "grad_norm": 3.375, + "learning_rate": 3.6928969006490212e-06, + "loss": 1.2567209005355835, + "mean_token_accuracy": 0.6920228451490402, + "num_tokens": 1338599.0, + "step": 197 + }, + { + "entropy": 1.6448290199041367, + "epoch": 2.881542699724518, + "grad_norm": 3.65625, + "learning_rate": 3.6784918420649952e-06, + "loss": 1.6368999481201172, + "mean_token_accuracy": 0.6448632068932056, + "num_tokens": 1345432.0, + "step": 198 + }, + { + "entropy": 1.4667476452887058, + "epoch": 2.8962350780532597, + "grad_norm": 3.796875, + "learning_rate": 3.664036300812779e-06, + "loss": 1.43418288230896, + "mean_token_accuracy": 0.6738253943622112, + "num_tokens": 1351852.0, + "step": 199 + }, + { + "entropy": 1.4334681946784258, + "epoch": 2.9109274563820016, + "grad_norm": 3.390625, + "learning_rate": 3.64953089611965e-06, + "loss": 1.4027811288833618, + "mean_token_accuracy": 0.6735595650970936, + "num_tokens": 1358988.0, + "step": 200 + }, + { + "entropy": 1.4100973345339298, + "epoch": 2.925619834710744, + "grad_norm": 3.484375, + "learning_rate": 3.634976249348867e-06, + "loss": 1.6405863761901855, + "mean_token_accuracy": 0.6570570301264524, + "num_tokens": 1367775.0, + "step": 201 + }, + { + "entropy": 1.3591697476804256, + "epoch": 2.9403122130394856, + "grad_norm": 3.375, + "learning_rate": 3.6203729839730567e-06, + "loss": 1.395234227180481, + "mean_token_accuracy": 0.6774147320538759, + "num_tokens": 1374916.0, + "step": 202 + }, + { + "entropy": 1.3890802599489689, + "epoch": 2.955004591368228, + "grad_norm": 3.484375, + "learning_rate": 3.6057217255475034e-06, + "loss": 1.4430360794067383, + "mean_token_accuracy": 0.6852261833846569, + "num_tokens": 1381503.0, + "step": 203 + }, + { + "entropy": 1.1807276085019112, + "epoch": 2.9696969696969697, + "grad_norm": 3.890625, + "learning_rate": 
3.591023101683355e-06, + "loss": 1.140221118927002, + "mean_token_accuracy": 0.7134524993598461, + "num_tokens": 1387530.0, + "step": 204 + }, + { + "entropy": 1.5116482749581337, + "epoch": 2.9843893480257115, + "grad_norm": 3.578125, + "learning_rate": 3.5762777420207382e-06, + "loss": 1.4948053359985352, + "mean_token_accuracy": 0.6597633976489305, + "num_tokens": 1394436.0, + "step": 205 + }, + { + "entropy": 1.4017915055155754, + "epoch": 2.9990817263544534, + "grad_norm": 3.53125, + "learning_rate": 3.5614862782017833e-06, + "loss": 1.3626996278762817, + "mean_token_accuracy": 0.6668695509433746, + "num_tokens": 1402172.0, + "step": 206 + }, + { + "entropy": 2.1258840560913086, + "epoch": 3.0, + "grad_norm": 14.5, + "learning_rate": 3.5466493438435707e-06, + "loss": 2.277696132659912, + "mean_token_accuracy": 0.540145993232727, + "num_tokens": 1402584.0, + "step": 207 + }, + { + "entropy": 1.4028206542134285, + "epoch": 3.014692378328742, + "grad_norm": 3.609375, + "learning_rate": 3.531767574510987e-06, + "loss": 1.37511146068573, + "mean_token_accuracy": 0.6790164671838284, + "num_tokens": 1409137.0, + "step": 208 + }, + { + "entropy": 1.1828167587518692, + "epoch": 3.029384756657484, + "grad_norm": 3.421875, + "learning_rate": 3.516841607689501e-06, + "loss": 1.20148766040802, + "mean_token_accuracy": 0.700589882209897, + "num_tokens": 1416171.0, + "step": 209 + }, + { + "entropy": 1.3335931710898876, + "epoch": 3.044077134986226, + "grad_norm": 3.34375, + "learning_rate": 3.5018720827578523e-06, + "loss": 1.328529715538025, + "mean_token_accuracy": 0.6999878343194723, + "num_tokens": 1423155.0, + "step": 210 + }, + { + "entropy": 1.6100288890302181, + "epoch": 3.0587695133149677, + "grad_norm": 3.84375, + "learning_rate": 3.486859640960668e-06, + "loss": 1.5763328075408936, + "mean_token_accuracy": 0.6437154449522495, + "num_tokens": 1429836.0, + "step": 211 + }, + { + "entropy": 1.2661379724740982, + "epoch": 3.07346189164371, + "grad_norm": 3.40625, + 
"learning_rate": 3.4718049253809894e-06, + "loss": 1.2684996128082275, + "mean_token_accuracy": 0.7078960034996271, + "num_tokens": 1436023.0, + "step": 212 + }, + { + "entropy": 1.5840991362929344, + "epoch": 3.088154269972452, + "grad_norm": 3.65625, + "learning_rate": 3.4567085809127247e-06, + "loss": 1.6588152647018433, + "mean_token_accuracy": 0.6349230799823999, + "num_tokens": 1444044.0, + "step": 213 + }, + { + "entropy": 1.4915673546493053, + "epoch": 3.1028466483011936, + "grad_norm": 4.21875, + "learning_rate": 3.441571254233027e-06, + "loss": 1.5922610759735107, + "mean_token_accuracy": 0.6678136102855206, + "num_tokens": 1450155.0, + "step": 214 + }, + { + "entropy": 1.3006605990231037, + "epoch": 3.117539026629936, + "grad_norm": 4.28125, + "learning_rate": 3.426393593774591e-06, + "loss": 1.3100212812423706, + "mean_token_accuracy": 0.6970670148730278, + "num_tokens": 1455544.0, + "step": 215 + }, + { + "entropy": 1.5377335771918297, + "epoch": 3.1322314049586777, + "grad_norm": 3.859375, + "learning_rate": 3.4111762496978753e-06, + "loss": 1.5166088342666626, + "mean_token_accuracy": 0.6497631967067719, + "num_tokens": 1462550.0, + "step": 216 + }, + { + "entropy": 1.4202686659991741, + "epoch": 3.1469237832874195, + "grad_norm": 3.484375, + "learning_rate": 3.39591987386325e-06, + "loss": 1.4353243112564087, + "mean_token_accuracy": 0.6745712738484144, + "num_tokens": 1469996.0, + "step": 217 + }, + { + "entropy": 1.298032023012638, + "epoch": 3.1616161616161618, + "grad_norm": 3.71875, + "learning_rate": 3.3806251198030843e-06, + "loss": 1.3633654117584229, + "mean_token_accuracy": 0.7036402598023415, + "num_tokens": 1476262.0, + "step": 218 + }, + { + "entropy": 1.3053624369204044, + "epoch": 3.1763085399449036, + "grad_norm": 3.4375, + "learning_rate": 3.3652926426937327e-06, + "loss": 1.2723848819732666, + "mean_token_accuracy": 0.6963967196643353, + "num_tokens": 1482676.0, + "step": 219 + }, + { + "entropy": 1.1979273930191994, + "epoch": 
3.1910009182736454, + "grad_norm": 3.25, + "learning_rate": 3.3499230993274857e-06, + "loss": 1.3043560981750488, + "mean_token_accuracy": 0.707458607852459, + "num_tokens": 1490483.0, + "step": 220 + }, + { + "entropy": 1.27983458340168, + "epoch": 3.2056932966023877, + "grad_norm": 3.375, + "learning_rate": 3.3345171480844275e-06, + "loss": 1.2262349128723145, + "mean_token_accuracy": 0.7017333172261715, + "num_tokens": 1497544.0, + "step": 221 + }, + { + "entropy": 1.4334750287234783, + "epoch": 3.2203856749311295, + "grad_norm": 3.234375, + "learning_rate": 3.3190754489042343e-06, + "loss": 1.401545524597168, + "mean_token_accuracy": 0.676033478230238, + "num_tokens": 1505305.0, + "step": 222 + }, + { + "entropy": 1.3747035190463066, + "epoch": 3.2350780532598713, + "grad_norm": 3.53125, + "learning_rate": 3.303598663257904e-06, + "loss": 1.3610343933105469, + "mean_token_accuracy": 0.689595028758049, + "num_tokens": 1511573.0, + "step": 223 + }, + { + "entropy": 1.3864431343972683, + "epoch": 3.2497704315886136, + "grad_norm": 4.25, + "learning_rate": 3.288087454119425e-06, + "loss": 1.275547981262207, + "mean_token_accuracy": 0.6855261363089085, + "num_tokens": 1517044.0, + "step": 224 + }, + { + "entropy": 1.2548606544733047, + "epoch": 3.2644628099173554, + "grad_norm": 3.375, + "learning_rate": 3.272542485937369e-06, + "loss": 1.2298287153244019, + "mean_token_accuracy": 0.7007413618266582, + "num_tokens": 1523433.0, + "step": 225 + }, + { + "entropy": 1.4298927076160908, + "epoch": 3.279155188246097, + "grad_norm": 4.09375, + "learning_rate": 3.256964424606437e-06, + "loss": 1.5319254398345947, + "mean_token_accuracy": 0.6646219603717327, + "num_tokens": 1529915.0, + "step": 226 + }, + { + "entropy": 1.2698330879211426, + "epoch": 3.2938475665748395, + "grad_norm": 3.4375, + "learning_rate": 3.2413539374389275e-06, + "loss": 1.1953558921813965, + "mean_token_accuracy": 0.7178251221776009, + "num_tokens": 1536954.0, + "step": 227 + }, + { + "entropy": 
1.2876740600913763, + "epoch": 3.3085399449035813, + "grad_norm": 3.5625, + "learning_rate": 3.225711693136156e-06, + "loss": 1.283000111579895, + "mean_token_accuracy": 0.7038575522601604, + "num_tokens": 1543584.0, + "step": 228 + }, + { + "entropy": 1.370558850467205, + "epoch": 3.323232323232323, + "grad_norm": 3.78125, + "learning_rate": 3.2100383617598075e-06, + "loss": 1.3564451932907104, + "mean_token_accuracy": 0.6855217441916466, + "num_tokens": 1550381.0, + "step": 229 + }, + { + "entropy": 1.4030266143381596, + "epoch": 3.3379247015610654, + "grad_norm": 3.9375, + "learning_rate": 3.194334614703231e-06, + "loss": 1.4133769273757935, + "mean_token_accuracy": 0.6617723945528269, + "num_tokens": 1558228.0, + "step": 230 + }, + { + "entropy": 1.3796805329620838, + "epoch": 3.352617079889807, + "grad_norm": 4.03125, + "learning_rate": 3.1786011246626858e-06, + "loss": 1.3643141984939575, + "mean_token_accuracy": 0.6847334876656532, + "num_tokens": 1564241.0, + "step": 231 + }, + { + "entropy": 1.366750942543149, + "epoch": 3.367309458218549, + "grad_norm": 4.0625, + "learning_rate": 3.1628385656085204e-06, + "loss": 1.2604291439056396, + "mean_token_accuracy": 0.6975630149245262, + "num_tokens": 1569807.0, + "step": 232 + }, + { + "entropy": 1.5372787863016129, + "epoch": 3.3820018365472913, + "grad_norm": 3.625, + "learning_rate": 3.147047612756302e-06, + "loss": 1.6237632036209106, + "mean_token_accuracy": 0.648850180208683, + "num_tokens": 1576906.0, + "step": 233 + }, + { + "entropy": 1.3469244949519634, + "epoch": 3.396694214876033, + "grad_norm": 3.59375, + "learning_rate": 3.131228942537895e-06, + "loss": 1.4804552793502808, + "mean_token_accuracy": 0.6760262958705425, + "num_tokens": 1584543.0, + "step": 234 + }, + { + "entropy": 1.327893067151308, + "epoch": 3.411386593204775, + "grad_norm": 3.84375, + "learning_rate": 3.115383232572483e-06, + "loss": 1.3390090465545654, + "mean_token_accuracy": 0.6867328248918056, + "num_tokens": 1590633.0, + 
"step": 235 + }, + { + "entropy": 1.5125273801386356, + "epoch": 3.426078971533517, + "grad_norm": 3.703125, + "learning_rate": 3.0995111616375417e-06, + "loss": 1.5971745252609253, + "mean_token_accuracy": 0.6557733975350857, + "num_tokens": 1598163.0, + "step": 236 + }, + { + "entropy": 1.2399644292891026, + "epoch": 3.440771349862259, + "grad_norm": 3.359375, + "learning_rate": 3.0836134096397642e-06, + "loss": 1.1880427598953247, + "mean_token_accuracy": 0.7046547196805477, + "num_tokens": 1604955.0, + "step": 237 + }, + { + "entropy": 1.4055788703262806, + "epoch": 3.455463728191001, + "grad_norm": 3.640625, + "learning_rate": 3.0676906575859335e-06, + "loss": 1.4325823783874512, + "mean_token_accuracy": 0.6649295091629028, + "num_tokens": 1612441.0, + "step": 238 + }, + { + "entropy": 1.3476755023002625, + "epoch": 3.470156106519743, + "grad_norm": 3.921875, + "learning_rate": 3.051743587553754e-06, + "loss": 1.304101586341858, + "mean_token_accuracy": 0.6877517551183701, + "num_tokens": 1618692.0, + "step": 239 + }, + { + "entropy": 1.470371063798666, + "epoch": 3.484848484848485, + "grad_norm": 4.15625, + "learning_rate": 3.035772882662627e-06, + "loss": 1.3992159366607666, + "mean_token_accuracy": 0.6673696786165237, + "num_tokens": 1624467.0, + "step": 240 + }, + { + "entropy": 1.363200306892395, + "epoch": 3.4995408631772267, + "grad_norm": 3.859375, + "learning_rate": 3.019779227044398e-06, + "loss": 1.4320815801620483, + "mean_token_accuracy": 0.693215861916542, + "num_tokens": 1630855.0, + "step": 241 + }, + { + "entropy": 1.499861165881157, + "epoch": 3.514233241505969, + "grad_norm": 3.734375, + "learning_rate": 3.0037633058140433e-06, + "loss": 1.635284662246704, + "mean_token_accuracy": 0.6594692952930927, + "num_tokens": 1638233.0, + "step": 242 + }, + { + "entropy": 1.4187380149960518, + "epoch": 3.5289256198347108, + "grad_norm": 3.71875, + "learning_rate": 2.9877258050403214e-06, + "loss": 1.4408353567123413, + "mean_token_accuracy": 
0.6815416235476732, + "num_tokens": 1644307.0, + "step": 243 + }, + { + "entropy": 1.2909758538007736, + "epoch": 3.5436179981634526, + "grad_norm": 3.65625, + "learning_rate": 2.9716674117163886e-06, + "loss": 1.1600507497787476, + "mean_token_accuracy": 0.7066163346171379, + "num_tokens": 1650324.0, + "step": 244 + }, + { + "entropy": 1.1622727885842323, + "epoch": 3.558310376492195, + "grad_norm": 3.890625, + "learning_rate": 2.9555888137303695e-06, + "loss": 1.1322875022888184, + "mean_token_accuracy": 0.7267099879682064, + "num_tokens": 1655841.0, + "step": 245 + }, + { + "entropy": 1.499529305845499, + "epoch": 3.5730027548209367, + "grad_norm": 4.03125, + "learning_rate": 2.939490699835887e-06, + "loss": 1.5371626615524292, + "mean_token_accuracy": 0.6663133464753628, + "num_tokens": 1661462.0, + "step": 246 + }, + { + "entropy": 1.4221386834979057, + "epoch": 3.5876951331496785, + "grad_norm": 3.859375, + "learning_rate": 2.9233737596225616e-06, + "loss": 1.4404072761535645, + "mean_token_accuracy": 0.6666534543037415, + "num_tokens": 1668939.0, + "step": 247 + }, + { + "entropy": 1.2013383097946644, + "epoch": 3.6023875114784207, + "grad_norm": 3.40625, + "learning_rate": 2.9072386834864723e-06, + "loss": 1.2697336673736572, + "mean_token_accuracy": 0.7121818475425243, + "num_tokens": 1677284.0, + "step": 248 + }, + { + "entropy": 1.370607167482376, + "epoch": 3.6170798898071626, + "grad_norm": 4.0, + "learning_rate": 2.8910861626005774e-06, + "loss": 1.4362889528274536, + "mean_token_accuracy": 0.6732826940715313, + "num_tokens": 1684406.0, + "step": 249 + }, + { + "entropy": 1.5012065656483173, + "epoch": 3.6317722681359044, + "grad_norm": 3.390625, + "learning_rate": 2.8749168888851126e-06, + "loss": 1.5818341970443726, + "mean_token_accuracy": 0.6568798068910837, + "num_tokens": 1692633.0, + "step": 250 + }, + { + "entropy": 1.1994779370725155, + "epoch": 3.6464646464646466, + "grad_norm": 3.515625, + "learning_rate": 2.858731554977948e-06, + "loss": 
1.1643537282943726, + "mean_token_accuracy": 0.716903805732727, + "num_tokens": 1699505.0, + "step": 251 + }, + { + "entropy": 1.4587336257100105, + "epoch": 3.6611570247933884, + "grad_norm": 3.71875, + "learning_rate": 2.8425308542049208e-06, + "loss": 1.6454309225082397, + "mean_token_accuracy": 0.6632649935781956, + "num_tokens": 1705828.0, + "step": 252 + }, + { + "entropy": 1.20838226005435, + "epoch": 3.6758494031221303, + "grad_norm": 2.921875, + "learning_rate": 2.82631548055013e-06, + "loss": 1.144212007522583, + "mean_token_accuracy": 0.7113795578479767, + "num_tokens": 1713697.0, + "step": 253 + }, + { + "entropy": 1.4037099555134773, + "epoch": 3.6905417814508725, + "grad_norm": 4.03125, + "learning_rate": 2.8100861286262137e-06, + "loss": 1.4409031867980957, + "mean_token_accuracy": 0.6661250051110983, + "num_tokens": 1721124.0, + "step": 254 + }, + { + "entropy": 1.344285275787115, + "epoch": 3.7052341597796143, + "grad_norm": 3.703125, + "learning_rate": 2.7938434936445946e-06, + "loss": 1.3361552953720093, + "mean_token_accuracy": 0.6808187067508698, + "num_tokens": 1727536.0, + "step": 255 + }, + { + "entropy": 1.53640878200531, + "epoch": 3.719926538108356, + "grad_norm": 4.25, + "learning_rate": 2.7775882713856946e-06, + "loss": 1.504386067390442, + "mean_token_accuracy": 0.6495453417301178, + "num_tokens": 1733126.0, + "step": 256 + }, + { + "epoch": 3.719926538108356, + "eval_entropy": 1.3394816517829895, + "eval_loss": 1.415405035018921, + "eval_mean_token_accuracy": 0.676714651286602, + "eval_num_tokens": 1733126.0, + "eval_runtime": 1.677, + "eval_samples_per_second": 34.585, + "eval_steps_per_second": 4.77, + "step": 256 + }, + { + "entropy": 1.3805672824382782, + "epoch": 3.734618916437098, + "grad_norm": 3.265625, + "learning_rate": 2.761321158169134e-06, + "loss": 1.276710033416748, + "mean_token_accuracy": 0.6919525004923344, + "num_tokens": 1740583.0, + "step": 257 + }, + { + "entropy": 1.4145719185471535, + "epoch": 
3.7493112947658402, + "grad_norm": 3.25, + "learning_rate": 2.7450428508239024e-06, + "loss": 1.492904543876648, + "mean_token_accuracy": 0.6696864552795887, + "num_tokens": 1749615.0, + "step": 258 + }, + { + "entropy": 1.3122917041182518, + "epoch": 3.764003673094582, + "grad_norm": 4.5, + "learning_rate": 2.7287540466585067e-06, + "loss": 1.3468831777572632, + "mean_token_accuracy": 0.6847816966474056, + "num_tokens": 1755286.0, + "step": 259 + }, + { + "entropy": 1.2696636728942394, + "epoch": 3.7786960514233243, + "grad_norm": 3.328125, + "learning_rate": 2.7124554434311047e-06, + "loss": 1.251458764076233, + "mean_token_accuracy": 0.695728961378336, + "num_tokens": 1763474.0, + "step": 260 + }, + { + "entropy": 1.477790392935276, + "epoch": 3.793388429752066, + "grad_norm": 3.859375, + "learning_rate": 2.696147739319613e-06, + "loss": 1.4353973865509033, + "mean_token_accuracy": 0.6688569448888302, + "num_tokens": 1770417.0, + "step": 261 + }, + { + "entropy": 1.2040468826889992, + "epoch": 3.808080808080808, + "grad_norm": 3.796875, + "learning_rate": 2.6798316328917988e-06, + "loss": 1.1171551942825317, + "mean_token_accuracy": 0.7124636992812157, + "num_tokens": 1776268.0, + "step": 262 + }, + { + "entropy": 1.334158930927515, + "epoch": 3.8227731864095498, + "grad_norm": 3.671875, + "learning_rate": 2.663507823075358e-06, + "loss": 1.442462682723999, + "mean_token_accuracy": 0.6851038224995136, + "num_tokens": 1784097.0, + "step": 263 + }, + { + "entropy": 1.2884401306509972, + "epoch": 3.837465564738292, + "grad_norm": 3.703125, + "learning_rate": 2.6471770091279725e-06, + "loss": 1.2929211854934692, + "mean_token_accuracy": 0.6950427368283272, + "num_tokens": 1790700.0, + "step": 264 + }, + { + "entropy": 1.455569889396429, + "epoch": 3.852157943067034, + "grad_norm": 3.703125, + "learning_rate": 2.6308398906073603e-06, + "loss": 1.5066760778427124, + "mean_token_accuracy": 0.6729711573570967, + "num_tokens": 1797581.0, + "step": 265 + }, + { + 
"entropy": 1.3149556033313274, + "epoch": 3.866850321395776, + "grad_norm": 3.578125, + "learning_rate": 2.6144971673413023e-06, + "loss": 1.345811128616333, + "mean_token_accuracy": 0.6892800442874432, + "num_tokens": 1804984.0, + "step": 266 + }, + { + "entropy": 1.5259748809039593, + "epoch": 3.881542699724518, + "grad_norm": 4.09375, + "learning_rate": 2.5981495393976718e-06, + "loss": 1.560139775276184, + "mean_token_accuracy": 0.6488614473491907, + "num_tokens": 1810652.0, + "step": 267 + }, + { + "entropy": 1.6216607764363289, + "epoch": 3.8962350780532597, + "grad_norm": 4.09375, + "learning_rate": 2.5817977070544408e-06, + "loss": 1.6535224914550781, + "mean_token_accuracy": 0.6346077732741833, + "num_tokens": 1817391.0, + "step": 268 + }, + { + "entropy": 1.465709399431944, + "epoch": 3.9109274563820016, + "grad_norm": 3.578125, + "learning_rate": 2.5654423707696834e-06, + "loss": 1.5264501571655273, + "mean_token_accuracy": 0.6553380750119686, + "num_tokens": 1825473.0, + "step": 269 + }, + { + "entropy": 1.3719651140272617, + "epoch": 3.925619834710744, + "grad_norm": 4.0625, + "learning_rate": 2.5490842311515706e-06, + "loss": 1.405755639076233, + "mean_token_accuracy": 0.6818547490984201, + "num_tokens": 1830992.0, + "step": 270 + }, + { + "entropy": 1.30729578435421, + "epoch": 3.9403122130394856, + "grad_norm": 3.125, + "learning_rate": 2.5327239889283613e-06, + "loss": 1.2894923686981201, + "mean_token_accuracy": 0.6817844286561012, + "num_tokens": 1838915.0, + "step": 271 + }, + { + "entropy": 1.6357484422624111, + "epoch": 3.955004591368228, + "grad_norm": 3.515625, + "learning_rate": 2.5163623449183797e-06, + "loss": 1.7700730562210083, + "mean_token_accuracy": 0.6349711399525404, + "num_tokens": 1847192.0, + "step": 272 + }, + { + "entropy": 1.42452397570014, + "epoch": 3.9696969696969697, + "grad_norm": 3.578125, + "learning_rate": 2.5e-06, + "loss": 1.4052257537841797, + "mean_token_accuracy": 0.670428641140461, + "num_tokens": 1853853.0, + 
"step": 273 + }, + { + "entropy": 1.2537009306252003, + "epoch": 3.9843893480257115, + "grad_norm": 3.390625, + "learning_rate": 2.4836376550816207e-06, + "loss": 1.320694088935852, + "mean_token_accuracy": 0.708381250500679, + "num_tokens": 1860810.0, + "step": 274 + }, + { + "entropy": 1.3919994123280048, + "epoch": 3.9990817263544534, + "grad_norm": 3.265625, + "learning_rate": 2.4672760110716395e-06, + "loss": 1.4379889965057373, + "mean_token_accuracy": 0.6738540474325418, + "num_tokens": 1869708.0, + "step": 275 + }, + { + "entropy": 1.3144437074661255, + "epoch": 4.0, + "grad_norm": 17.25, + "learning_rate": 2.45091576884843e-06, + "loss": 1.3155449628829956, + "mean_token_accuracy": 0.6774193644523621, + "num_tokens": 1870112.0, + "step": 276 + }, + { + "entropy": 1.3445695489645004, + "epoch": 4.014692378328742, + "grad_norm": 3.515625, + "learning_rate": 2.434557629230318e-06, + "loss": 1.3874691724777222, + "mean_token_accuracy": 0.689882904291153, + "num_tokens": 1877614.0, + "step": 277 + }, + { + "entropy": 1.5155527740716934, + "epoch": 4.029384756657484, + "grad_norm": 4.75, + "learning_rate": 2.41820229294556e-06, + "loss": 1.4299671649932861, + "mean_token_accuracy": 0.6567830629646778, + "num_tokens": 1882254.0, + "step": 278 + }, + { + "entropy": 1.2426442056894302, + "epoch": 4.044077134986226, + "grad_norm": 3.875, + "learning_rate": 2.4018504606023295e-06, + "loss": 1.2982856035232544, + "mean_token_accuracy": 0.6901493407785892, + "num_tokens": 1888736.0, + "step": 279 + }, + { + "entropy": 1.4402744472026825, + "epoch": 4.058769513314968, + "grad_norm": 3.59375, + "learning_rate": 2.385502832658699e-06, + "loss": 1.526076078414917, + "mean_token_accuracy": 0.6596730649471283, + "num_tokens": 1896071.0, + "step": 280 + }, + { + "entropy": 1.3624507710337639, + "epoch": 4.07346189164371, + "grad_norm": 3.46875, + "learning_rate": 2.3691601093926406e-06, + "loss": 1.5298748016357422, + "mean_token_accuracy": 0.672961063683033, + "num_tokens": 
1904222.0, + "step": 281 + }, + { + "entropy": 1.518877875059843, + "epoch": 4.088154269972452, + "grad_norm": 3.5625, + "learning_rate": 2.3528229908720275e-06, + "loss": 1.5621310472488403, + "mean_token_accuracy": 0.6666549891233444, + "num_tokens": 1911418.0, + "step": 282 + }, + { + "entropy": 1.408459410071373, + "epoch": 4.102846648301194, + "grad_norm": 3.796875, + "learning_rate": 2.3364921769246423e-06, + "loss": 1.5553926229476929, + "mean_token_accuracy": 0.6725407186895609, + "num_tokens": 1919869.0, + "step": 283 + }, + { + "entropy": 1.5305853299796581, + "epoch": 4.117539026629935, + "grad_norm": 4.28125, + "learning_rate": 2.3201683671082016e-06, + "loss": 1.5349313020706177, + "mean_token_accuracy": 0.6633393950760365, + "num_tokens": 1924769.0, + "step": 284 + }, + { + "entropy": 1.3491091206669807, + "epoch": 4.132231404958677, + "grad_norm": 3.703125, + "learning_rate": 2.3038522606803882e-06, + "loss": 1.3523074388504028, + "mean_token_accuracy": 0.6839871145784855, + "num_tokens": 1931557.0, + "step": 285 + }, + { + "entropy": 1.4780425243079662, + "epoch": 4.14692378328742, + "grad_norm": 4.1875, + "learning_rate": 2.287544556568896e-06, + "loss": 1.565530776977539, + "mean_token_accuracy": 0.6532882675528526, + "num_tokens": 1938144.0, + "step": 286 + }, + { + "entropy": 1.440908256918192, + "epoch": 4.161616161616162, + "grad_norm": 3.453125, + "learning_rate": 2.271245953341494e-06, + "loss": 1.4530143737792969, + "mean_token_accuracy": 0.6627631783485413, + "num_tokens": 1945396.0, + "step": 287 + }, + { + "entropy": 1.297002412378788, + "epoch": 4.176308539944904, + "grad_norm": 3.765625, + "learning_rate": 2.2549571491760985e-06, + "loss": 1.3622161149978638, + "mean_token_accuracy": 0.6911368295550346, + "num_tokens": 1952257.0, + "step": 288 + }, + { + "entropy": 1.1849941164255142, + "epoch": 4.191000918273645, + "grad_norm": 3.203125, + "learning_rate": 2.238678841830867e-06, + "loss": 1.1791870594024658, + "mean_token_accuracy": 
0.7209084387868643, + "num_tokens": 1960488.0, + "step": 289 + }, + { + "entropy": 1.609047919511795, + "epoch": 4.205693296602387, + "grad_norm": 4.4375, + "learning_rate": 2.2224117286143063e-06, + "loss": 1.7078322172164917, + "mean_token_accuracy": 0.639599371701479, + "num_tokens": 1966355.0, + "step": 290 + }, + { + "entropy": 1.2323498874902725, + "epoch": 4.22038567493113, + "grad_norm": 3.734375, + "learning_rate": 2.2061565063554063e-06, + "loss": 1.1928443908691406, + "mean_token_accuracy": 0.7171883173286915, + "num_tokens": 1973373.0, + "step": 291 + }, + { + "entropy": 1.3962544910609722, + "epoch": 4.235078053259872, + "grad_norm": 3.78125, + "learning_rate": 2.1899138713737876e-06, + "loss": 1.3652687072753906, + "mean_token_accuracy": 0.673550434410572, + "num_tokens": 1980499.0, + "step": 292 + }, + { + "entropy": 1.2281092554330826, + "epoch": 4.249770431588614, + "grad_norm": 3.359375, + "learning_rate": 2.173684519449872e-06, + "loss": 1.2023086547851562, + "mean_token_accuracy": 0.7110824286937714, + "num_tokens": 1987656.0, + "step": 293 + }, + { + "entropy": 1.4712859019637108, + "epoch": 4.264462809917355, + "grad_norm": 4.1875, + "learning_rate": 2.1574691457950805e-06, + "loss": 1.5222008228302002, + "mean_token_accuracy": 0.6687300186604261, + "num_tokens": 1993905.0, + "step": 294 + }, + { + "entropy": 1.006372582167387, + "epoch": 4.279155188246097, + "grad_norm": 2.84375, + "learning_rate": 2.1412684450220524e-06, + "loss": 0.9414258003234863, + "mean_token_accuracy": 0.7512795105576515, + "num_tokens": 2002852.0, + "step": 295 + }, + { + "entropy": 1.3876865170896053, + "epoch": 4.293847566574839, + "grad_norm": 3.828125, + "learning_rate": 2.1250831111148873e-06, + "loss": 1.3618916273117065, + "mean_token_accuracy": 0.68668382614851, + "num_tokens": 2009321.0, + "step": 296 + }, + { + "entropy": 1.117498192936182, + "epoch": 4.308539944903581, + "grad_norm": 3.625, + "learning_rate": 2.1089138373994226e-06, + "loss": 
1.0988891124725342, + "mean_token_accuracy": 0.7218944355845451, + "num_tokens": 2015800.0, + "step": 297 + }, + { + "entropy": 1.300987858325243, + "epoch": 4.3232323232323235, + "grad_norm": 3.734375, + "learning_rate": 2.0927613165135285e-06, + "loss": 1.2009758949279785, + "mean_token_accuracy": 0.692545972764492, + "num_tokens": 2022915.0, + "step": 298 + }, + { + "entropy": 1.3463373892009258, + "epoch": 4.337924701561065, + "grad_norm": 3.8125, + "learning_rate": 2.0766262403774388e-06, + "loss": 1.3616676330566406, + "mean_token_accuracy": 0.6792471520602703, + "num_tokens": 2029521.0, + "step": 299 + }, + { + "entropy": 1.3613272085785866, + "epoch": 4.352617079889807, + "grad_norm": 3.34375, + "learning_rate": 2.0605093001641138e-06, + "loss": 1.2829806804656982, + "mean_token_accuracy": 0.682598739862442, + "num_tokens": 2037788.0, + "step": 300 + }, + { + "entropy": 1.3312906958162785, + "epoch": 4.367309458218549, + "grad_norm": 4.15625, + "learning_rate": 2.0444111862696313e-06, + "loss": 1.2278118133544922, + "mean_token_accuracy": 0.6921750158071518, + "num_tokens": 2044247.0, + "step": 301 + }, + { + "entropy": 1.3146127797663212, + "epoch": 4.382001836547291, + "grad_norm": 3.609375, + "learning_rate": 2.0283325882836126e-06, + "loss": 1.3366804122924805, + "mean_token_accuracy": 0.7049176283180714, + "num_tokens": 2051380.0, + "step": 302 + }, + { + "entropy": 1.7008662782609463, + "epoch": 4.3966942148760335, + "grad_norm": 4.46875, + "learning_rate": 2.01227419495968e-06, + "loss": 1.6468967199325562, + "mean_token_accuracy": 0.6412950064986944, + "num_tokens": 2056909.0, + "step": 303 + }, + { + "entropy": 1.3109636045992374, + "epoch": 4.411386593204775, + "grad_norm": 4.0, + "learning_rate": 1.996236694185957e-06, + "loss": 1.3502867221832275, + "mean_token_accuracy": 0.6835893988609314, + "num_tokens": 2063308.0, + "step": 304 + }, + { + "entropy": 1.3513622879981995, + "epoch": 4.426078971533517, + "grad_norm": 3.40625, + "learning_rate": 
1.9802207729556023e-06, + "loss": 1.4691942930221558, + "mean_token_accuracy": 0.688535138964653, + "num_tokens": 2071132.0, + "step": 305 + }, + { + "entropy": 1.1963273957371712, + "epoch": 4.440771349862259, + "grad_norm": 2.9375, + "learning_rate": 1.964227117337374e-06, + "loss": 1.2708735466003418, + "mean_token_accuracy": 0.707404674962163, + "num_tokens": 2080159.0, + "step": 306 + }, + { + "entropy": 1.4165263511240482, + "epoch": 4.455463728191001, + "grad_norm": 4.5, + "learning_rate": 1.9482564124462478e-06, + "loss": 1.335919976234436, + "mean_token_accuracy": 0.6746666543185711, + "num_tokens": 2085158.0, + "step": 307 + }, + { + "entropy": 1.2119375206530094, + "epoch": 4.470156106519743, + "grad_norm": 3.40625, + "learning_rate": 1.9323093424140673e-06, + "loss": 1.1746174097061157, + "mean_token_accuracy": 0.717445420101285, + "num_tokens": 2091050.0, + "step": 308 + }, + { + "entropy": 1.4248721897602081, + "epoch": 4.484848484848484, + "grad_norm": 3.40625, + "learning_rate": 1.9163865903602374e-06, + "loss": 1.3420960903167725, + "mean_token_accuracy": 0.6748971827328205, + "num_tokens": 2098572.0, + "step": 309 + }, + { + "entropy": 1.3115894440561533, + "epoch": 4.499540863177227, + "grad_norm": 4.3125, + "learning_rate": 1.9004888383624596e-06, + "loss": 1.261577844619751, + "mean_token_accuracy": 0.6892781518399715, + "num_tokens": 2104436.0, + "step": 310 + }, + { + "entropy": 1.2737204805016518, + "epoch": 4.514233241505969, + "grad_norm": 3.25, + "learning_rate": 1.8846167674275175e-06, + "loss": 1.2144405841827393, + "mean_token_accuracy": 0.7029564455151558, + "num_tokens": 2111721.0, + "step": 311 + }, + { + "entropy": 1.451767385005951, + "epoch": 4.528925619834711, + "grad_norm": 3.625, + "learning_rate": 1.8687710574621051e-06, + "loss": 1.5465962886810303, + "mean_token_accuracy": 0.6663695089519024, + "num_tokens": 2118973.0, + "step": 312 + }, + { + "entropy": 1.4293602593243122, + "epoch": 4.543617998163453, + "grad_norm": 
3.6875, + "learning_rate": 1.852952387243698e-06, + "loss": 1.3797839879989624, + "mean_token_accuracy": 0.6705848015844822, + "num_tokens": 2125146.0, + "step": 313 + }, + { + "entropy": 1.237271387130022, + "epoch": 4.558310376492194, + "grad_norm": 4.0, + "learning_rate": 1.8371614343914798e-06, + "loss": 1.2865500450134277, + "mean_token_accuracy": 0.7063010260462761, + "num_tokens": 2131187.0, + "step": 314 + }, + { + "entropy": 1.3471387848258018, + "epoch": 4.573002754820937, + "grad_norm": 4.03125, + "learning_rate": 1.8213988753373147e-06, + "loss": 1.395220398902893, + "mean_token_accuracy": 0.6758437845855951, + "num_tokens": 2137515.0, + "step": 315 + }, + { + "entropy": 1.3820691257715225, + "epoch": 4.587695133149679, + "grad_norm": 3.640625, + "learning_rate": 1.8056653852967699e-06, + "loss": 1.4266245365142822, + "mean_token_accuracy": 0.6850110292434692, + "num_tokens": 2144669.0, + "step": 316 + }, + { + "entropy": 1.318991456180811, + "epoch": 4.602387511478421, + "grad_norm": 3.8125, + "learning_rate": 1.7899616382401935e-06, + "loss": 1.3007704019546509, + "mean_token_accuracy": 0.6927743554115295, + "num_tokens": 2150840.0, + "step": 317 + }, + { + "entropy": 1.2886290848255157, + "epoch": 4.6170798898071626, + "grad_norm": 3.328125, + "learning_rate": 1.7742883068638447e-06, + "loss": 1.2548828125, + "mean_token_accuracy": 0.708765309303999, + "num_tokens": 2158229.0, + "step": 318 + }, + { + "entropy": 1.4580038003623486, + "epoch": 4.631772268135904, + "grad_norm": 4.15625, + "learning_rate": 1.758646062561073e-06, + "loss": 1.4200549125671387, + "mean_token_accuracy": 0.6693456768989563, + "num_tokens": 2164006.0, + "step": 319 + }, + { + "entropy": 1.2778888307511806, + "epoch": 4.646464646464646, + "grad_norm": 3.859375, + "learning_rate": 1.743035575393564e-06, + "loss": 1.2976222038269043, + "mean_token_accuracy": 0.7035369873046875, + "num_tokens": 2170612.0, + "step": 320 + }, + { + "entropy": 1.2509517259895802, + "epoch": 
4.661157024793388, + "grad_norm": 3.671875, + "learning_rate": 1.7274575140626318e-06, + "loss": 1.2400519847869873, + "mean_token_accuracy": 0.7016804441809654, + "num_tokens": 2176823.0, + "step": 321 + }, + { + "entropy": 1.322471171617508, + "epoch": 4.675849403122131, + "grad_norm": 3.609375, + "learning_rate": 1.7119125458805767e-06, + "loss": 1.3159892559051514, + "mean_token_accuracy": 0.689349815249443, + "num_tokens": 2183067.0, + "step": 322 + }, + { + "entropy": 1.4905264489352703, + "epoch": 4.6905417814508725, + "grad_norm": 3.296875, + "learning_rate": 1.6964013367420967e-06, + "loss": 1.5832772254943848, + "mean_token_accuracy": 0.6548038627952337, + "num_tokens": 2190961.0, + "step": 323 + }, + { + "entropy": 1.3516268730163574, + "epoch": 4.705234159779614, + "grad_norm": 3.640625, + "learning_rate": 1.6809245510957667e-06, + "loss": 1.2710171937942505, + "mean_token_accuracy": 0.6953945681452751, + "num_tokens": 2197269.0, + "step": 324 + }, + { + "entropy": 1.3766049407422543, + "epoch": 4.719926538108356, + "grad_norm": 3.890625, + "learning_rate": 1.665482851915573e-06, + "loss": 1.3786622285842896, + "mean_token_accuracy": 0.685791440308094, + "num_tokens": 2203978.0, + "step": 325 + }, + { + "entropy": 1.3528023660182953, + "epoch": 4.734618916437098, + "grad_norm": 3.34375, + "learning_rate": 1.6500769006725142e-06, + "loss": 1.392067313194275, + "mean_token_accuracy": 0.6816291362047195, + "num_tokens": 2210550.0, + "step": 326 + }, + { + "entropy": 1.547594841569662, + "epoch": 4.749311294765841, + "grad_norm": 4.59375, + "learning_rate": 1.634707357306267e-06, + "loss": 1.6562820672988892, + "mean_token_accuracy": 0.6610909849405289, + "num_tokens": 2216651.0, + "step": 327 + }, + { + "entropy": 1.4075745232403278, + "epoch": 4.7640036730945825, + "grad_norm": 3.625, + "learning_rate": 1.6193748801969164e-06, + "loss": 1.3409862518310547, + "mean_token_accuracy": 0.6761545985937119, + "num_tokens": 2223859.0, + "step": 328 + }, + { + 
"entropy": 1.4365106411278248, + "epoch": 4.778696051423324, + "grad_norm": 3.828125, + "learning_rate": 1.6040801261367494e-06, + "loss": 1.4949394464492798, + "mean_token_accuracy": 0.6674479302018881, + "num_tokens": 2230698.0, + "step": 329 + }, + { + "entropy": 1.39594067633152, + "epoch": 4.793388429752066, + "grad_norm": 3.546875, + "learning_rate": 1.588823750302126e-06, + "loss": 1.4408518075942993, + "mean_token_accuracy": 0.6747472062706947, + "num_tokens": 2237816.0, + "step": 330 + }, + { + "entropy": 1.5729894042015076, + "epoch": 4.808080808080808, + "grad_norm": 3.421875, + "learning_rate": 1.5736064062254094e-06, + "loss": 1.6204023361206055, + "mean_token_accuracy": 0.6384808495640755, + "num_tokens": 2246004.0, + "step": 331 + }, + { + "entropy": 1.3464174792170525, + "epoch": 4.82277318640955, + "grad_norm": 3.78125, + "learning_rate": 1.5584287457669733e-06, + "loss": 1.3943066596984863, + "mean_token_accuracy": 0.6859343573451042, + "num_tokens": 2252413.0, + "step": 332 + }, + { + "entropy": 1.6637616902589798, + "epoch": 4.837465564738292, + "grad_norm": 3.96875, + "learning_rate": 1.5432914190872757e-06, + "loss": 1.8307788372039795, + "mean_token_accuracy": 0.6110464110970497, + "num_tokens": 2259455.0, + "step": 333 + }, + { + "entropy": 1.4676074236631393, + "epoch": 4.852157943067034, + "grad_norm": 3.875, + "learning_rate": 1.528195074619011e-06, + "loss": 1.4894895553588867, + "mean_token_accuracy": 0.6655828766524792, + "num_tokens": 2266752.0, + "step": 334 + }, + { + "entropy": 1.3864082805812359, + "epoch": 4.866850321395776, + "grad_norm": 3.515625, + "learning_rate": 1.5131403590393323e-06, + "loss": 1.432395577430725, + "mean_token_accuracy": 0.6768216788768768, + "num_tokens": 2274152.0, + "step": 335 + }, + { + "entropy": 1.3860022686421871, + "epoch": 4.881542699724518, + "grad_norm": 3.65625, + "learning_rate": 1.4981279172421481e-06, + "loss": 1.3834329843521118, + "mean_token_accuracy": 0.677151620388031, + "num_tokens": 
2281472.0, + "step": 336 + }, + { + "entropy": 1.437876883894205, + "epoch": 4.89623507805326, + "grad_norm": 3.703125, + "learning_rate": 1.4831583923105e-06, + "loss": 1.584201455116272, + "mean_token_accuracy": 0.6708448305726051, + "num_tokens": 2289551.0, + "step": 337 + }, + { + "entropy": 1.330020684748888, + "epoch": 4.910927456382002, + "grad_norm": 3.59375, + "learning_rate": 1.4682324254890135e-06, + "loss": 1.3487207889556885, + "mean_token_accuracy": 0.6990271396934986, + "num_tokens": 2296413.0, + "step": 338 + }, + { + "entropy": 1.408544309437275, + "epoch": 4.925619834710744, + "grad_norm": 3.65625, + "learning_rate": 1.4533506561564305e-06, + "loss": 1.437635898590088, + "mean_token_accuracy": 0.673307441174984, + "num_tokens": 2303769.0, + "step": 339 + }, + { + "entropy": 1.2854021713137627, + "epoch": 4.940312213039486, + "grad_norm": 3.875, + "learning_rate": 1.4385137217982178e-06, + "loss": 1.3022122383117676, + "mean_token_accuracy": 0.7127466425299644, + "num_tokens": 2310004.0, + "step": 340 + }, + { + "entropy": 1.3864372000098228, + "epoch": 4.955004591368228, + "grad_norm": 3.203125, + "learning_rate": 1.4237222579792618e-06, + "loss": 1.4041612148284912, + "mean_token_accuracy": 0.6763649247586727, + "num_tokens": 2318788.0, + "step": 341 + }, + { + "entropy": 1.2509639486670494, + "epoch": 4.96969696969697, + "grad_norm": 3.90625, + "learning_rate": 1.4089768983166445e-06, + "loss": 1.2501071691513062, + "mean_token_accuracy": 0.7078825943171978, + "num_tokens": 2325084.0, + "step": 342 + }, + { + "entropy": 1.454335866495967, + "epoch": 4.9843893480257115, + "grad_norm": 4.28125, + "learning_rate": 1.3942782744524974e-06, + "loss": 1.449426293373108, + "mean_token_accuracy": 0.6640381850302219, + "num_tokens": 2331645.0, + "step": 343 + }, + { + "entropy": 1.2466024421155453, + "epoch": 4.999081726354453, + "grad_norm": 4.0, + "learning_rate": 1.379627016026944e-06, + "loss": 1.1815191507339478, + "mean_token_accuracy": 
0.7297266945242882, + "num_tokens": 2337128.0, + "step": 344 + }, + { + "entropy": 1.5639240741729736, + "epoch": 5.0, + "grad_norm": 14.75, + "learning_rate": 1.3650237506511333e-06, + "loss": 1.6862883567810059, + "mean_token_accuracy": 0.6340509057044983, + "num_tokens": 2337640.0, + "step": 345 + }, + { + "entropy": 1.3247104361653328, + "epoch": 5.014692378328742, + "grad_norm": 3.875, + "learning_rate": 1.3504691038803504e-06, + "loss": 1.336502194404602, + "mean_token_accuracy": 0.6947694420814514, + "num_tokens": 2343507.0, + "step": 346 + }, + { + "entropy": 1.248255368322134, + "epoch": 5.029384756657484, + "grad_norm": 3.640625, + "learning_rate": 1.3359636991872215e-06, + "loss": 1.2202980518341064, + "mean_token_accuracy": 0.7036343440413475, + "num_tokens": 2349761.0, + "step": 347 + }, + { + "entropy": 1.4622275196015835, + "epoch": 5.044077134986226, + "grad_norm": 3.984375, + "learning_rate": 1.3215081579350058e-06, + "loss": 1.478945016860962, + "mean_token_accuracy": 0.6600709166377783, + "num_tokens": 2356336.0, + "step": 348 + }, + { + "entropy": 1.344462625682354, + "epoch": 5.058769513314968, + "grad_norm": 3.546875, + "learning_rate": 1.307103099350979e-06, + "loss": 1.4435844421386719, + "mean_token_accuracy": 0.6860466375946999, + "num_tokens": 2363847.0, + "step": 349 + }, + { + "entropy": 1.5077877081930637, + "epoch": 5.07346189164371, + "grad_norm": 4.25, + "learning_rate": 1.2927491404999077e-06, + "loss": 1.546884298324585, + "mean_token_accuracy": 0.6479404345154762, + "num_tokens": 2369738.0, + "step": 350 + }, + { + "entropy": 1.298233974725008, + "epoch": 5.088154269972452, + "grad_norm": 3.4375, + "learning_rate": 1.2784468962576136e-06, + "loss": 1.274640440940857, + "mean_token_accuracy": 0.7008918151259422, + "num_tokens": 2376780.0, + "step": 351 + }, + { + "entropy": 1.3465782329440117, + "epoch": 5.102846648301194, + "grad_norm": 3.34375, + "learning_rate": 1.2641969792846393e-06, + "loss": 1.298094391822815, + 
"mean_token_accuracy": 0.6762865297496319, + "num_tokens": 2384800.0, + "step": 352 + }, + { + "entropy": 1.164300974458456, + "epoch": 5.117539026629935, + "grad_norm": 3.921875, + "learning_rate": 1.2500000000000007e-06, + "loss": 1.0800127983093262, + "mean_token_accuracy": 0.7225227616727352, + "num_tokens": 2391130.0, + "step": 353 + }, + { + "entropy": 1.3427915126085281, + "epoch": 5.132231404958677, + "grad_norm": 3.546875, + "learning_rate": 1.235856566555039e-06, + "loss": 1.2384849786758423, + "mean_token_accuracy": 0.6905228663235903, + "num_tokens": 2398399.0, + "step": 354 + }, + { + "entropy": 1.3961762860417366, + "epoch": 5.14692378328742, + "grad_norm": 3.75, + "learning_rate": 1.2217672848073702e-06, + "loss": 1.442419171333313, + "mean_token_accuracy": 0.6752587780356407, + "num_tokens": 2405354.0, + "step": 355 + }, + { + "entropy": 1.2891596406698227, + "epoch": 5.161616161616162, + "grad_norm": 3.375, + "learning_rate": 1.2077327582949313e-06, + "loss": 1.1359935998916626, + "mean_token_accuracy": 0.7036025710403919, + "num_tokens": 2412665.0, + "step": 356 + }, + { + "entropy": 1.3552672527730465, + "epoch": 5.176308539944904, + "grad_norm": 4.09375, + "learning_rate": 1.193753588210128e-06, + "loss": 1.3831486701965332, + "mean_token_accuracy": 0.6862701997160912, + "num_tokens": 2418974.0, + "step": 357 + }, + { + "entropy": 1.4431088119745255, + "epoch": 5.191000918273645, + "grad_norm": 3.90625, + "learning_rate": 1.1798303733740801e-06, + "loss": 1.3804062604904175, + "mean_token_accuracy": 0.6673993915319443, + "num_tokens": 2425258.0, + "step": 358 + }, + { + "entropy": 1.3874380216002464, + "epoch": 5.205693296602387, + "grad_norm": 4.125, + "learning_rate": 1.1659637102109713e-06, + "loss": 1.4015424251556396, + "mean_token_accuracy": 0.6745005883276463, + "num_tokens": 2430975.0, + "step": 359 + }, + { + "entropy": 1.4490499272942543, + "epoch": 5.22038567493113, + "grad_norm": 3.96875, + "learning_rate": 1.1521541927224994e-06, + 
"loss": 1.3985379934310913, + "mean_token_accuracy": 0.6676309891045094, + "num_tokens": 2437146.0, + "step": 360 + }, + { + "entropy": 1.2855382412672043, + "epoch": 5.235078053259872, + "grad_norm": 3.578125, + "learning_rate": 1.1384024124624324e-06, + "loss": 1.2529319524765015, + "mean_token_accuracy": 0.6951777450740337, + "num_tokens": 2443983.0, + "step": 361 + }, + { + "entropy": 1.354172457009554, + "epoch": 5.249770431588614, + "grad_norm": 3.5, + "learning_rate": 1.1247089585112666e-06, + "loss": 1.4322967529296875, + "mean_token_accuracy": 0.6889848560094833, + "num_tokens": 2450684.0, + "step": 362 + }, + { + "entropy": 1.3293256983160973, + "epoch": 5.264462809917355, + "grad_norm": 3.671875, + "learning_rate": 1.1110744174509952e-06, + "loss": 1.3209728002548218, + "mean_token_accuracy": 0.687582079321146, + "num_tokens": 2457888.0, + "step": 363 + }, + { + "entropy": 1.3070398084819317, + "epoch": 5.279155188246097, + "grad_norm": 4.0, + "learning_rate": 1.0974993733399762e-06, + "loss": 1.3550182580947876, + "mean_token_accuracy": 0.6897185444831848, + "num_tokens": 2464596.0, + "step": 364 + }, + { + "entropy": 1.4290886037051678, + "epoch": 5.293847566574839, + "grad_norm": 3.671875, + "learning_rate": 1.0839844076879186e-06, + "loss": 1.3897254467010498, + "mean_token_accuracy": 0.6664892844855785, + "num_tokens": 2470842.0, + "step": 365 + }, + { + "entropy": 1.2070421613752842, + "epoch": 5.308539944903581, + "grad_norm": 3.546875, + "learning_rate": 1.0705300994309697e-06, + "loss": 1.1175053119659424, + "mean_token_accuracy": 0.7257428281009197, + "num_tokens": 2477103.0, + "step": 366 + }, + { + "entropy": 1.3485257625579834, + "epoch": 5.3232323232323235, + "grad_norm": 3.84375, + "learning_rate": 1.0571370249069163e-06, + "loss": 1.29289710521698, + "mean_token_accuracy": 0.692170076072216, + "num_tokens": 2483582.0, + "step": 367 + }, + { + "entropy": 1.3372049815952778, + "epoch": 5.337924701561065, + "grad_norm": 3.609375, + 
"learning_rate": 1.043805757830495e-06, + "loss": 1.2823678255081177, + "mean_token_accuracy": 0.6924175024032593, + "num_tokens": 2489571.0, + "step": 368 + }, + { + "entropy": 1.453590054064989, + "epoch": 5.352617079889807, + "grad_norm": 3.65625, + "learning_rate": 1.0305368692688175e-06, + "loss": 1.435807466506958, + "mean_token_accuracy": 0.6810374148190022, + "num_tokens": 2495721.0, + "step": 369 + }, + { + "entropy": 1.1400656588375568, + "epoch": 5.367309458218549, + "grad_norm": 3.171875, + "learning_rate": 1.0173309276169075e-06, + "loss": 1.142919898033142, + "mean_token_accuracy": 0.7174917720258236, + "num_tokens": 2503949.0, + "step": 370 + }, + { + "entropy": 1.367987047880888, + "epoch": 5.382001836547291, + "grad_norm": 3.9375, + "learning_rate": 1.0041884985733524e-06, + "loss": 1.4378776550292969, + "mean_token_accuracy": 0.6763719841837883, + "num_tokens": 2510208.0, + "step": 371 + }, + { + "entropy": 1.4379977211356163, + "epoch": 5.3966942148760335, + "grad_norm": 3.921875, + "learning_rate": 9.911101451160714e-07, + "loss": 1.4669700860977173, + "mean_token_accuracy": 0.676942465826869, + "num_tokens": 2516100.0, + "step": 372 + }, + { + "entropy": 1.3764347173273563, + "epoch": 5.411386593204775, + "grad_norm": 3.578125, + "learning_rate": 9.780964274781984e-07, + "loss": 1.5235940217971802, + "mean_token_accuracy": 0.6685472317039967, + "num_tokens": 2524555.0, + "step": 373 + }, + { + "entropy": 1.5450992733240128, + "epoch": 5.426078971533517, + "grad_norm": 4.0625, + "learning_rate": 9.651479031240837e-07, + "loss": 1.5131289958953857, + "mean_token_accuracy": 0.6463721804320812, + "num_tokens": 2531282.0, + "step": 374 + }, + { + "entropy": 1.457864124327898, + "epoch": 5.440771349862259, + "grad_norm": 3.59375, + "learning_rate": 9.522651267254149e-07, + "loss": 1.5437613725662231, + "mean_token_accuracy": 0.6583030465990305, + "num_tokens": 2539541.0, + "step": 375 + }, + { + "entropy": 1.3948032334446907, + "epoch": 
5.455463728191001, + "grad_norm": 3.546875, + "learning_rate": 9.394486501374556e-07, + "loss": 1.4869863986968994, + "mean_token_accuracy": 0.6875165402889252, + "num_tokens": 2546176.0, + "step": 376 + }, + { + "entropy": 1.4013639837503433, + "epoch": 5.470156106519743, + "grad_norm": 3.890625, + "learning_rate": 9.266990223754069e-07, + "loss": 1.4661489725112915, + "mean_token_accuracy": 0.6756766103208065, + "num_tokens": 2553070.0, + "step": 377 + }, + { + "entropy": 1.2801647149026394, + "epoch": 5.484848484848484, + "grad_norm": 3.328125, + "learning_rate": 9.140167895908867e-07, + "loss": 1.312286138534546, + "mean_token_accuracy": 0.6866127587854862, + "num_tokens": 2561136.0, + "step": 378 + }, + { + "entropy": 1.4677649438381195, + "epoch": 5.499540863177227, + "grad_norm": 4.25, + "learning_rate": 9.014024950485384e-07, + "loss": 1.3887375593185425, + "mean_token_accuracy": 0.6793632172048092, + "num_tokens": 2567304.0, + "step": 379 + }, + { + "entropy": 1.4178661219775677, + "epoch": 5.514233241505969, + "grad_norm": 3.921875, + "learning_rate": 8.88856679102757e-07, + "loss": 1.37828528881073, + "mean_token_accuracy": 0.6739732995629311, + "num_tokens": 2574401.0, + "step": 380 + }, + { + "entropy": 1.379222609102726, + "epoch": 5.528925619834711, + "grad_norm": 4.125, + "learning_rate": 8.763798791745413e-07, + "loss": 1.3763408660888672, + "mean_token_accuracy": 0.6974320486187935, + "num_tokens": 2580639.0, + "step": 381 + }, + { + "entropy": 1.2704015038907528, + "epoch": 5.543617998163453, + "grad_norm": 3.734375, + "learning_rate": 8.639726297284742e-07, + "loss": 1.1959011554718018, + "mean_token_accuracy": 0.700306411832571, + "num_tokens": 2587418.0, + "step": 382 + }, + { + "entropy": 1.4778965413570404, + "epoch": 5.558310376492194, + "grad_norm": 3.640625, + "learning_rate": 8.516354622498279e-07, + "loss": 1.3851994276046753, + "mean_token_accuracy": 0.666837640106678, + "num_tokens": 2594190.0, + "step": 383 + }, + { + "entropy": 
1.1784359328448772, + "epoch": 5.573002754820937, + "grad_norm": 3.09375, + "learning_rate": 8.393689052217966e-07, + "loss": 1.236546277999878, + "mean_token_accuracy": 0.7079413570463657, + "num_tokens": 2602487.0, + "step": 384 + }, + { + "epoch": 5.573002754820937, + "eval_entropy": 1.3389692306518555, + "eval_loss": 1.406298041343689, + "eval_mean_token_accuracy": 0.6783231347799301, + "eval_num_tokens": 2602487.0, + "eval_runtime": 1.6707, + "eval_samples_per_second": 34.715, + "eval_steps_per_second": 4.788, + "step": 384 + }, + { + "entropy": 1.588452558964491, + "epoch": 5.587695133149679, + "grad_norm": 4.0625, + "learning_rate": 8.271734841028553e-07, + "loss": 1.7794235944747925, + "mean_token_accuracy": 0.6403734050691128, + "num_tokens": 2610026.0, + "step": 385 + }, + { + "entropy": 1.407985232770443, + "epoch": 5.602387511478421, + "grad_norm": 3.609375, + "learning_rate": 8.150497213042552e-07, + "loss": 1.4693222045898438, + "mean_token_accuracy": 0.6730717644095421, + "num_tokens": 2616763.0, + "step": 386 + }, + { + "entropy": 1.554512519389391, + "epoch": 5.6170798898071626, + "grad_norm": 4.125, + "learning_rate": 8.029981361676456e-07, + "loss": 1.7840030193328857, + "mean_token_accuracy": 0.6635391432791948, + "num_tokens": 2623765.0, + "step": 387 + }, + { + "entropy": 1.3049566857516766, + "epoch": 5.631772268135904, + "grad_norm": 3.390625, + "learning_rate": 7.910192449428216e-07, + "loss": 1.3470871448516846, + "mean_token_accuracy": 0.6958029642701149, + "num_tokens": 2630843.0, + "step": 388 + }, + { + "entropy": 1.496280875056982, + "epoch": 5.646464646464646, + "grad_norm": 3.625, + "learning_rate": 7.791135607656147e-07, + "loss": 1.5659466981887817, + "mean_token_accuracy": 0.6670752931386232, + "num_tokens": 2638100.0, + "step": 389 + }, + { + "entropy": 1.448220781981945, + "epoch": 5.661157024793388, + "grad_norm": 4.0625, + "learning_rate": 7.672815936359107e-07, + "loss": 1.6060407161712646, + "mean_token_accuracy": 
0.6671166494488716, + "num_tokens": 2645333.0, + "step": 390 + }, + { + "entropy": 1.3865485899150372, + "epoch": 5.675849403122131, + "grad_norm": 3.890625, + "learning_rate": 7.555238503958001e-07, + "loss": 1.4103912115097046, + "mean_token_accuracy": 0.6776862740516663, + "num_tokens": 2651746.0, + "step": 391 + }, + { + "entropy": 1.4491654373705387, + "epoch": 5.6905417814508725, + "grad_norm": 3.859375, + "learning_rate": 7.43840834707871e-07, + "loss": 1.5049588680267334, + "mean_token_accuracy": 0.6596195660531521, + "num_tokens": 2658321.0, + "step": 392 + }, + { + "entropy": 1.396248023957014, + "epoch": 5.705234159779614, + "grad_norm": 3.703125, + "learning_rate": 7.322330470336314e-07, + "loss": 1.44000244140625, + "mean_token_accuracy": 0.6730465441942215, + "num_tokens": 2665932.0, + "step": 393 + }, + { + "entropy": 1.2807521969079971, + "epoch": 5.719926538108356, + "grad_norm": 3.578125, + "learning_rate": 7.207009846120718e-07, + "loss": 1.3392530679702759, + "mean_token_accuracy": 0.694486953318119, + "num_tokens": 2672679.0, + "step": 394 + }, + { + "entropy": 1.3551440499722958, + "epoch": 5.734618916437098, + "grad_norm": 3.078125, + "learning_rate": 7.092451414383644e-07, + "loss": 1.317352056503296, + "mean_token_accuracy": 0.6877163723111153, + "num_tokens": 2681371.0, + "step": 395 + }, + { + "entropy": 1.3415511585772038, + "epoch": 5.749311294765841, + "grad_norm": 3.84375, + "learning_rate": 6.97866008242703e-07, + "loss": 1.4178882837295532, + "mean_token_accuracy": 0.6842059157788754, + "num_tokens": 2688476.0, + "step": 396 + }, + { + "entropy": 1.1355127394199371, + "epoch": 5.7640036730945825, + "grad_norm": 3.25, + "learning_rate": 6.865640724692815e-07, + "loss": 1.0911461114883423, + "mean_token_accuracy": 0.7424787282943726, + "num_tokens": 2695575.0, + "step": 397 + }, + { + "entropy": 1.2852298319339752, + "epoch": 5.778696051423324, + "grad_norm": 3.703125, + "learning_rate": 6.753398182554116e-07, + "loss": 
1.2322055101394653, + "mean_token_accuracy": 0.7083672620356083, + "num_tokens": 2702192.0, + "step": 398 + }, + { + "entropy": 1.5159233435988426, + "epoch": 5.793388429752066, + "grad_norm": 4.75, + "learning_rate": 6.641937264107868e-07, + "loss": 1.5141452550888062, + "mean_token_accuracy": 0.6346602737903595, + "num_tokens": 2708132.0, + "step": 399 + }, + { + "entropy": 1.3606117404997349, + "epoch": 5.808080808080808, + "grad_norm": 3.375, + "learning_rate": 6.53126274396885e-07, + "loss": 1.4291459321975708, + "mean_token_accuracy": 0.6642967071384192, + "num_tokens": 2715821.0, + "step": 400 + }, + { + "entropy": 1.2682021632790565, + "epoch": 5.82277318640955, + "grad_norm": 3.5, + "learning_rate": 6.421379363065142e-07, + "loss": 1.293297529220581, + "mean_token_accuracy": 0.6939828936010599, + "num_tokens": 2722727.0, + "step": 401 + }, + { + "entropy": 1.4758578278124332, + "epoch": 5.837465564738292, + "grad_norm": 4.0, + "learning_rate": 6.312291828435077e-07, + "loss": 1.668549656867981, + "mean_token_accuracy": 0.6563255451619625, + "num_tokens": 2730755.0, + "step": 402 + }, + { + "entropy": 1.524743027985096, + "epoch": 5.852157943067034, + "grad_norm": 4.1875, + "learning_rate": 6.204004813025569e-07, + "loss": 1.5362768173217773, + "mean_token_accuracy": 0.6655668392777443, + "num_tokens": 2736790.0, + "step": 403 + }, + { + "entropy": 1.209791924804449, + "epoch": 5.866850321395776, + "grad_norm": 3.265625, + "learning_rate": 6.096522955491932e-07, + "loss": 1.2250401973724365, + "mean_token_accuracy": 0.7064687013626099, + "num_tokens": 2744896.0, + "step": 404 + }, + { + "entropy": 1.291951572522521, + "epoch": 5.881542699724518, + "grad_norm": 3.609375, + "learning_rate": 5.989850859999227e-07, + "loss": 1.3850435018539429, + "mean_token_accuracy": 0.6930592581629753, + "num_tokens": 2751935.0, + "step": 405 + }, + { + "entropy": 1.3803613483905792, + "epoch": 5.89623507805326, + "grad_norm": 3.953125, + "learning_rate": 
5.883993096024993e-07, + "loss": 1.3516204357147217, + "mean_token_accuracy": 0.6931698061525822, + "num_tokens": 2758137.0, + "step": 406 + }, + { + "entropy": 1.1959835402667522, + "epoch": 5.910927456382002, + "grad_norm": 4.03125, + "learning_rate": 5.778954198163514e-07, + "loss": 1.2868695259094238, + "mean_token_accuracy": 0.7249186784029007, + "num_tokens": 2764628.0, + "step": 407 + }, + { + "entropy": 1.2468183785676956, + "epoch": 5.925619834710744, + "grad_norm": 3.40625, + "learning_rate": 5.674738665931575e-07, + "loss": 1.235489010810852, + "mean_token_accuracy": 0.7177602611482143, + "num_tokens": 2771814.0, + "step": 408 + }, + { + "entropy": 1.60049744322896, + "epoch": 5.940312213039486, + "grad_norm": 3.796875, + "learning_rate": 5.571350963575728e-07, + "loss": 1.5064845085144043, + "mean_token_accuracy": 0.652068167924881, + "num_tokens": 2778109.0, + "step": 409 + }, + { + "entropy": 1.3452286906540394, + "epoch": 5.955004591368228, + "grad_norm": 3.578125, + "learning_rate": 5.468795519881043e-07, + "loss": 1.2341338396072388, + "mean_token_accuracy": 0.6869874056428671, + "num_tokens": 2784412.0, + "step": 410 + }, + { + "entropy": 1.4359879940748215, + "epoch": 5.96969696969697, + "grad_norm": 3.75, + "learning_rate": 5.367076727981383e-07, + "loss": 1.4686487913131714, + "mean_token_accuracy": 0.6682025790214539, + "num_tokens": 2790953.0, + "step": 411 + }, + { + "entropy": 1.3611623905599117, + "epoch": 5.9843893480257115, + "grad_norm": 3.765625, + "learning_rate": 5.266198945171253e-07, + "loss": 1.4125094413757324, + "mean_token_accuracy": 0.6872195526957512, + "num_tokens": 2798320.0, + "step": 412 + }, + { + "entropy": 1.2970973066985607, + "epoch": 5.999081726354453, + "grad_norm": 3.390625, + "learning_rate": 5.166166492719124e-07, + "loss": 1.251451849937439, + "mean_token_accuracy": 0.7029491886496544, + "num_tokens": 2804831.0, + "step": 413 + }, + { + "entropy": 0.6531462669372559, + "epoch": 6.0, + "grad_norm": 10.8125, + 
"learning_rate": 5.066983655682325e-07, + "loss": 0.6284084320068359, + "mean_token_accuracy": 0.824404776096344, + "num_tokens": 2805168.0, + "step": 414 + }, + { + "entropy": 1.3656140714883804, + "epoch": 6.014692378328742, + "grad_norm": 3.640625, + "learning_rate": 4.968654682723487e-07, + "loss": 1.2719142436981201, + "mean_token_accuracy": 0.6842254959046841, + "num_tokens": 2811186.0, + "step": 415 + }, + { + "entropy": 1.4867672063410282, + "epoch": 6.029384756657484, + "grad_norm": 3.328125, + "learning_rate": 4.871183785928546e-07, + "loss": 1.557564616203308, + "mean_token_accuracy": 0.6533172447234392, + "num_tokens": 2818841.0, + "step": 416 + }, + { + "entropy": 1.4871499314904213, + "epoch": 6.044077134986226, + "grad_norm": 3.796875, + "learning_rate": 4.774575140626317e-07, + "loss": 1.444616436958313, + "mean_token_accuracy": 0.6553527489304543, + "num_tokens": 2825100.0, + "step": 417 + }, + { + "entropy": 1.398858230561018, + "epoch": 6.058769513314968, + "grad_norm": 4.15625, + "learning_rate": 4.678832885209622e-07, + "loss": 1.36493980884552, + "mean_token_accuracy": 0.6685687974095345, + "num_tokens": 2831217.0, + "step": 418 + }, + { + "entropy": 1.470800019800663, + "epoch": 6.07346189164371, + "grad_norm": 3.84375, + "learning_rate": 4.5839611209580277e-07, + "loss": 1.413637638092041, + "mean_token_accuracy": 0.6679843384772539, + "num_tokens": 2837281.0, + "step": 419 + }, + { + "entropy": 1.4590460509061813, + "epoch": 6.088154269972452, + "grad_norm": 3.640625, + "learning_rate": 4.4899639118621606e-07, + "loss": 1.5037059783935547, + "mean_token_accuracy": 0.6643645130097866, + "num_tokens": 2844295.0, + "step": 420 + }, + { + "entropy": 1.3967719785869122, + "epoch": 6.102846648301194, + "grad_norm": 3.734375, + "learning_rate": 4.396845284449608e-07, + "loss": 1.6165939569473267, + "mean_token_accuracy": 0.6795150488615036, + "num_tokens": 2851174.0, + "step": 421 + }, + { + "entropy": 1.4725304134190083, + "epoch": 
6.117539026629935, + "grad_norm": 3.65625, + "learning_rate": 4.3046092276124467e-07, + "loss": 1.367598533630371, + "mean_token_accuracy": 0.6686763595789671, + "num_tokens": 2857478.0, + "step": 422 + }, + { + "entropy": 1.3191987164318562, + "epoch": 6.132231404958677, + "grad_norm": 3.421875, + "learning_rate": 4.2132596924363666e-07, + "loss": 1.270479679107666, + "mean_token_accuracy": 0.6981482766568661, + "num_tokens": 2864721.0, + "step": 423 + }, + { + "entropy": 1.27112677693367, + "epoch": 6.14692378328742, + "grad_norm": 4.5, + "learning_rate": 4.122800592031426e-07, + "loss": 1.3095415830612183, + "mean_token_accuracy": 0.6866735070943832, + "num_tokens": 2869929.0, + "step": 424 + }, + { + "entropy": 1.386510156095028, + "epoch": 6.161616161616162, + "grad_norm": 3.421875, + "learning_rate": 4.033235801364402e-07, + "loss": 1.378354787826538, + "mean_token_accuracy": 0.6822472270578146, + "num_tokens": 2876724.0, + "step": 425 + }, + { + "entropy": 1.406757928431034, + "epoch": 6.176308539944904, + "grad_norm": 3.5, + "learning_rate": 3.94456915709284e-07, + "loss": 1.4385218620300293, + "mean_token_accuracy": 0.6637353654950857, + "num_tokens": 2884843.0, + "step": 426 + }, + { + "entropy": 1.3404726311564445, + "epoch": 6.191000918273645, + "grad_norm": 4.25, + "learning_rate": 3.85680445740067e-07, + "loss": 1.3165702819824219, + "mean_token_accuracy": 0.6864796336740255, + "num_tokens": 2889782.0, + "step": 427 + }, + { + "entropy": 1.3043682426214218, + "epoch": 6.205693296602387, + "grad_norm": 3.6875, + "learning_rate": 3.7699454618355306e-07, + "loss": 1.3163912296295166, + "mean_token_accuracy": 0.6922629773616791, + "num_tokens": 2896735.0, + "step": 428 + }, + { + "entropy": 1.3188545294106007, + "epoch": 6.22038567493113, + "grad_norm": 4.3125, + "learning_rate": 3.683995891147696e-07, + "loss": 1.4004876613616943, + "mean_token_accuracy": 0.6927106529474258, + "num_tokens": 2902419.0, + "step": 429 + }, + { + "entropy": 
1.1944062858819962, + "epoch": 6.235078053259872, + "grad_norm": 3.15625, + "learning_rate": 3.598959427130716e-07, + "loss": 1.161584734916687, + "mean_token_accuracy": 0.7194525264203548, + "num_tokens": 2910508.0, + "step": 430 + }, + { + "entropy": 1.3897150121629238, + "epoch": 6.249770431588614, + "grad_norm": 3.421875, + "learning_rate": 3.514839712463683e-07, + "loss": 1.3794488906860352, + "mean_token_accuracy": 0.6831434555351734, + "num_tokens": 2917905.0, + "step": 431 + }, + { + "entropy": 1.5148936957120895, + "epoch": 6.264462809917355, + "grad_norm": 3.90625, + "learning_rate": 3.4316403505552045e-07, + "loss": 1.6462560892105103, + "mean_token_accuracy": 0.6511917188763618, + "num_tokens": 2924680.0, + "step": 432 + }, + { + "entropy": 1.4466035105288029, + "epoch": 6.279155188246097, + "grad_norm": 3.765625, + "learning_rate": 3.3493649053890325e-07, + "loss": 1.496029257774353, + "mean_token_accuracy": 0.6696281190961599, + "num_tokens": 2932051.0, + "step": 433 + }, + { + "entropy": 1.372856643050909, + "epoch": 6.293847566574839, + "grad_norm": 3.515625, + "learning_rate": 3.268016901371407e-07, + "loss": 1.3746181726455688, + "mean_token_accuracy": 0.6710073538124561, + "num_tokens": 2938884.0, + "step": 434 + }, + { + "entropy": 1.4355628602206707, + "epoch": 6.308539944903581, + "grad_norm": 3.75, + "learning_rate": 3.187599823180071e-07, + "loss": 1.4408472776412964, + "mean_token_accuracy": 0.6618307530879974, + "num_tokens": 2945324.0, + "step": 435 + }, + { + "entropy": 1.4253287892788649, + "epoch": 6.3232323232323235, + "grad_norm": 3.359375, + "learning_rate": 3.108117115615006e-07, + "loss": 1.4239962100982666, + "mean_token_accuracy": 0.6748282723128796, + "num_tokens": 2952312.0, + "step": 436 + }, + { + "entropy": 1.3979435861110687, + "epoch": 6.337924701561065, + "grad_norm": 3.53125, + "learning_rate": 3.0295721834508686e-07, + "loss": 1.3381223678588867, + "mean_token_accuracy": 0.6750478371977806, + "num_tokens": 2959402.0, + 
"step": 437 + }, + { + "entropy": 1.405780129134655, + "epoch": 6.352617079889807, + "grad_norm": 3.71875, + "learning_rate": 2.9519683912911267e-07, + "loss": 1.437018871307373, + "mean_token_accuracy": 0.6710403822362423, + "num_tokens": 2966008.0, + "step": 438 + }, + { + "entropy": 1.4760498031973839, + "epoch": 6.367309458218549, + "grad_norm": 3.78125, + "learning_rate": 2.875309063423956e-07, + "loss": 1.6535365581512451, + "mean_token_accuracy": 0.6638116780668497, + "num_tokens": 2972476.0, + "step": 439 + }, + { + "entropy": 1.4005642868578434, + "epoch": 6.382001836547291, + "grad_norm": 4.1875, + "learning_rate": 2.7995974836798194e-07, + "loss": 1.3646724224090576, + "mean_token_accuracy": 0.6797922551631927, + "num_tokens": 2978239.0, + "step": 440 + }, + { + "entropy": 1.293638188391924, + "epoch": 6.3966942148760335, + "grad_norm": 3.640625, + "learning_rate": 2.7248368952908055e-07, + "loss": 1.2501822710037231, + "mean_token_accuracy": 0.6940024830400944, + "num_tokens": 2984971.0, + "step": 441 + }, + { + "entropy": 1.6457590200006962, + "epoch": 6.411386593204775, + "grad_norm": 4.1875, + "learning_rate": 2.6510305007516974e-07, + "loss": 1.590219497680664, + "mean_token_accuracy": 0.6325998902320862, + "num_tokens": 2992049.0, + "step": 442 + }, + { + "entropy": 1.658167488873005, + "epoch": 6.426078971533517, + "grad_norm": 3.78125, + "learning_rate": 2.5781814616827936e-07, + "loss": 1.833939790725708, + "mean_token_accuracy": 0.6285004448145628, + "num_tokens": 2998916.0, + "step": 443 + }, + { + "entropy": 1.231806393712759, + "epoch": 6.440771349862259, + "grad_norm": 3.75, + "learning_rate": 2.506292898694468e-07, + "loss": 1.28045654296875, + "mean_token_accuracy": 0.7068323567509651, + "num_tokens": 3005512.0, + "step": 444 + }, + { + "entropy": 1.4775658771395683, + "epoch": 6.455463728191001, + "grad_norm": 3.703125, + "learning_rate": 2.43536789125349e-07, + "loss": 1.4871981143951416, + "mean_token_accuracy": 0.6590262055397034, + 
"num_tokens": 3012108.0, + "step": 445 + }, + { + "entropy": 1.3822932876646519, + "epoch": 6.470156106519743, + "grad_norm": 4.09375, + "learning_rate": 2.365409477551117e-07, + "loss": 1.378037691116333, + "mean_token_accuracy": 0.6818497627973557, + "num_tokens": 3018657.0, + "step": 446 + }, + { + "entropy": 1.3791243396699429, + "epoch": 6.484848484848484, + "grad_norm": 3.6875, + "learning_rate": 2.2964206543729662e-07, + "loss": 1.3524497747421265, + "mean_token_accuracy": 0.6901324354112148, + "num_tokens": 3025796.0, + "step": 447 + }, + { + "entropy": 1.4186674058437347, + "epoch": 6.499540863177227, + "grad_norm": 3.359375, + "learning_rate": 2.2284043769706026e-07, + "loss": 1.519934058189392, + "mean_token_accuracy": 0.6646804567426443, + "num_tokens": 3033756.0, + "step": 448 + }, + { + "entropy": 1.2882320508360863, + "epoch": 6.514233241505969, + "grad_norm": 3.90625, + "learning_rate": 2.1613635589349756e-07, + "loss": 1.3580642938613892, + "mean_token_accuracy": 0.69617984816432, + "num_tokens": 3040800.0, + "step": 449 + }, + { + "entropy": 1.3028308153152466, + "epoch": 6.528925619834711, + "grad_norm": 4.15625, + "learning_rate": 2.0953010720716037e-07, + "loss": 1.3698010444641113, + "mean_token_accuracy": 0.6910812072455883, + "num_tokens": 3047199.0, + "step": 450 + }, + { + "entropy": 1.3760417755693197, + "epoch": 6.543617998163453, + "grad_norm": 3.640625, + "learning_rate": 2.0302197462775453e-07, + "loss": 1.3222535848617554, + "mean_token_accuracy": 0.6876711696386337, + "num_tokens": 3054037.0, + "step": 451 + }, + { + "entropy": 1.1697348654270172, + "epoch": 6.558310376492194, + "grad_norm": 3.421875, + "learning_rate": 1.9661223694201898e-07, + "loss": 1.1562086343765259, + "mean_token_accuracy": 0.7222369164228439, + "num_tokens": 3060835.0, + "step": 452 + }, + { + "entropy": 1.2634800747036934, + "epoch": 6.573002754820937, + "grad_norm": 4.03125, + "learning_rate": 1.9030116872178317e-07, + "loss": 1.1908690929412842, + 
"mean_token_accuracy": 0.7110442295670509, + "num_tokens": 3066363.0, + "step": 453 + }, + { + "entropy": 1.3702223263680935, + "epoch": 6.587695133149679, + "grad_norm": 4.03125, + "learning_rate": 1.8408904031220476e-07, + "loss": 1.5914932489395142, + "mean_token_accuracy": 0.6844195239245892, + "num_tokens": 3073703.0, + "step": 454 + }, + { + "entropy": 1.397190399467945, + "epoch": 6.602387511478421, + "grad_norm": 3.625, + "learning_rate": 1.7797611782018942e-07, + "loss": 1.5647703409194946, + "mean_token_accuracy": 0.676605511456728, + "num_tokens": 3081815.0, + "step": 455 + }, + { + "entropy": 1.5388475097715855, + "epoch": 6.6170798898071626, + "grad_norm": 3.96875, + "learning_rate": 1.719626631029911e-07, + "loss": 1.640184760093689, + "mean_token_accuracy": 0.6557557284832001, + "num_tokens": 3088623.0, + "step": 456 + }, + { + "entropy": 1.19817179068923, + "epoch": 6.631772268135904, + "grad_norm": 4.09375, + "learning_rate": 1.6604893375699594e-07, + "loss": 1.1748251914978027, + "mean_token_accuracy": 0.721715409308672, + "num_tokens": 3094847.0, + "step": 457 + }, + { + "entropy": 1.35783052444458, + "epoch": 6.646464646464646, + "grad_norm": 3.4375, + "learning_rate": 1.602351831066862e-07, + "loss": 1.3434321880340576, + "mean_token_accuracy": 0.6994687616825104, + "num_tokens": 3102248.0, + "step": 458 + }, + { + "entropy": 1.0797005984932184, + "epoch": 6.661157024793388, + "grad_norm": 3.15625, + "learning_rate": 1.5452166019378989e-07, + "loss": 1.0800870656967163, + "mean_token_accuracy": 0.7316170409321785, + "num_tokens": 3110001.0, + "step": 459 + }, + { + "entropy": 1.3832198455929756, + "epoch": 6.675849403122131, + "grad_norm": 3.5, + "learning_rate": 1.4890860976661314e-07, + "loss": 1.3797262907028198, + "mean_token_accuracy": 0.6900227032601833, + "num_tokens": 3117274.0, + "step": 460 + }, + { + "entropy": 1.1709060333669186, + "epoch": 6.6905417814508725, + "grad_norm": 3.453125, + "learning_rate": 1.4339627226955394e-07, + 
"loss": 1.2020788192749023, + "mean_token_accuracy": 0.7185935415327549, + "num_tokens": 3123995.0, + "step": 461 + }, + { + "entropy": 1.4667035713791847, + "epoch": 6.705234159779614, + "grad_norm": 3.90625, + "learning_rate": 1.3798488383280489e-07, + "loss": 1.3801714181900024, + "mean_token_accuracy": 0.6576951071619987, + "num_tokens": 3131304.0, + "step": 462 + }, + { + "entropy": 1.5537691339850426, + "epoch": 6.719926538108356, + "grad_norm": 3.75, + "learning_rate": 1.3267467626223606e-07, + "loss": 1.476824402809143, + "mean_token_accuracy": 0.6602157857269049, + "num_tokens": 3137410.0, + "step": 463 + }, + { + "entropy": 1.235965933650732, + "epoch": 6.734618916437098, + "grad_norm": 3.984375, + "learning_rate": 1.2746587702946538e-07, + "loss": 1.1468799114227295, + "mean_token_accuracy": 0.7125266939401627, + "num_tokens": 3142974.0, + "step": 464 + }, + { + "entropy": 1.2582230232656002, + "epoch": 6.749311294765841, + "grad_norm": 3.296875, + "learning_rate": 1.223587092621162e-07, + "loss": 1.2624547481536865, + "mean_token_accuracy": 0.70580143481493, + "num_tokens": 3151139.0, + "step": 465 + }, + { + "entropy": 1.2958066929131746, + "epoch": 6.7640036730945825, + "grad_norm": 3.65625, + "learning_rate": 1.1735339173425759e-07, + "loss": 1.276180386543274, + "mean_token_accuracy": 0.7052340060472488, + "num_tokens": 3158302.0, + "step": 466 + }, + { + "entropy": 1.3456409573554993, + "epoch": 6.778696051423324, + "grad_norm": 3.953125, + "learning_rate": 1.1245013885703343e-07, + "loss": 1.3040763139724731, + "mean_token_accuracy": 0.7013955563306808, + "num_tokens": 3165056.0, + "step": 467 + }, + { + "entropy": 1.3066479973495007, + "epoch": 6.793388429752066, + "grad_norm": 3.890625, + "learning_rate": 1.0764916066947795e-07, + "loss": 1.4025518894195557, + "mean_token_accuracy": 0.6909100040793419, + "num_tokens": 3172110.0, + "step": 468 + }, + { + "entropy": 1.3883640430867672, + "epoch": 6.808080808080808, + "grad_norm": 4.21875, + 
"learning_rate": 1.0295066282951738e-07, + "loss": 1.4634349346160889, + "mean_token_accuracy": 0.6746046468615532, + "num_tokens": 3178411.0, + "step": 469 + }, + { + "entropy": 1.5163409858942032, + "epoch": 6.82277318640955, + "grad_norm": 3.734375, + "learning_rate": 9.835484660516203e-08, + "loss": 1.6738132238388062, + "mean_token_accuracy": 0.6578865684568882, + "num_tokens": 3185092.0, + "step": 470 + }, + { + "entropy": 1.357316054403782, + "epoch": 6.837465564738292, + "grad_norm": 3.828125, + "learning_rate": 9.386190886588208e-08, + "loss": 1.3559865951538086, + "mean_token_accuracy": 0.6832005195319653, + "num_tokens": 3191782.0, + "step": 471 + }, + { + "entropy": 1.3066742308437824, + "epoch": 6.852157943067034, + "grad_norm": 3.921875, + "learning_rate": 8.947204207417681e-08, + "loss": 1.3522746562957764, + "mean_token_accuracy": 0.7099240720272064, + "num_tokens": 3197795.0, + "step": 472 + }, + { + "entropy": 1.3219049498438835, + "epoch": 6.866850321395776, + "grad_norm": 3.953125, + "learning_rate": 8.518543427732951e-08, + "loss": 1.3603464365005493, + "mean_token_accuracy": 0.6895252950489521, + "num_tokens": 3204782.0, + "step": 473 + }, + { + "entropy": 1.241905678063631, + "epoch": 6.881542699724518, + "grad_norm": 3.59375, + "learning_rate": 8.100226909935061e-08, + "loss": 1.1739405393600464, + "mean_token_accuracy": 0.7041682228446007, + "num_tokens": 3210805.0, + "step": 474 + }, + { + "entropy": 1.3946216590702534, + "epoch": 6.89623507805326, + "grad_norm": 3.265625, + "learning_rate": 7.692272573311427e-08, + "loss": 1.4739960432052612, + "mean_token_accuracy": 0.68177555128932, + "num_tokens": 3219953.0, + "step": 475 + }, + { + "entropy": 1.3859229907393456, + "epoch": 6.910927456382002, + "grad_norm": 3.3125, + "learning_rate": 7.294697893267977e-08, + "loss": 1.376434326171875, + "mean_token_accuracy": 0.6881735809147358, + "num_tokens": 3227524.0, + "step": 476 + }, + { + "entropy": 1.140619345009327, + "epoch": 
6.925619834710744, + "grad_norm": 2.859375, + "learning_rate": 6.907519900580862e-08, + "loss": 0.9896233081817627, + "mean_token_accuracy": 0.7222557105123997, + "num_tokens": 3235487.0, + "step": 477 + }, + { + "entropy": 1.4113641753792763, + "epoch": 6.940312213039486, + "grad_norm": 3.203125, + "learning_rate": 6.530755180666593e-08, + "loss": 1.35636568069458, + "mean_token_accuracy": 0.6656701732426882, + "num_tokens": 3243387.0, + "step": 478 + }, + { + "entropy": 1.2951929830014706, + "epoch": 6.955004591368228, + "grad_norm": 3.578125, + "learning_rate": 6.164419872871835e-08, + "loss": 1.3108189105987549, + "mean_token_accuracy": 0.6984525807201862, + "num_tokens": 3250630.0, + "step": 479 + }, + { + "entropy": 1.2143667675554752, + "epoch": 6.96969696969697, + "grad_norm": 3.390625, + "learning_rate": 5.8085296697819036e-08, + "loss": 1.1499379873275757, + "mean_token_accuracy": 0.7011887915432453, + "num_tokens": 3258109.0, + "step": 480 + }, + { + "entropy": 1.3035505078732967, + "epoch": 6.9843893480257115, + "grad_norm": 3.78125, + "learning_rate": 5.463099816548578e-08, + "loss": 1.3332489728927612, + "mean_token_accuracy": 0.6942353397607803, + "num_tokens": 3265127.0, + "step": 481 + }, + { + "entropy": 1.3939937017858028, + "epoch": 6.999081726354453, + "grad_norm": 3.3125, + "learning_rate": 5.128145110237154e-08, + "loss": 1.4082341194152832, + "mean_token_accuracy": 0.6772250905632973, + "num_tokens": 3272251.0, + "step": 482 + }, + { + "entropy": 0.9707384705543518, + "epoch": 7.0, + "grad_norm": 10.5, + "learning_rate": 4.8036798991923925e-08, + "loss": 0.9935500025749207, + "mean_token_accuracy": 0.7545045018196106, + "num_tokens": 3272696.0, + "step": 483 + }, + { + "entropy": 1.1763608865439892, + "epoch": 7.014692378328742, + "grad_norm": 3.953125, + "learning_rate": 4.489718082424044e-08, + "loss": 1.2014083862304688, + "mean_token_accuracy": 0.7273847311735153, + "num_tokens": 3278916.0, + "step": 484 + }, + { + "entropy": 
1.1877197846770287, + "epoch": 7.029384756657484, + "grad_norm": 3.578125, + "learning_rate": 4.186273109011374e-08, + "loss": 1.205470085144043, + "mean_token_accuracy": 0.7204346731305122, + "num_tokens": 3286066.0, + "step": 485 + }, + { + "entropy": 1.1499557420611382, + "epoch": 7.044077134986226, + "grad_norm": 3.109375, + "learning_rate": 3.893357977527101e-08, + "loss": 1.011968970298767, + "mean_token_accuracy": 0.724033422768116, + "num_tokens": 3293689.0, + "step": 486 + }, + { + "entropy": 1.4138266146183014, + "epoch": 7.058769513314968, + "grad_norm": 4.5, + "learning_rate": 3.610985235480563e-08, + "loss": 1.3265433311462402, + "mean_token_accuracy": 0.675995796918869, + "num_tokens": 3299096.0, + "step": 487 + }, + { + "entropy": 1.4587520882487297, + "epoch": 7.07346189164371, + "grad_norm": 3.515625, + "learning_rate": 3.339166978780256e-08, + "loss": 1.5998899936676025, + "mean_token_accuracy": 0.6766221728175879, + "num_tokens": 3306153.0, + "step": 488 + }, + { + "entropy": 1.4471938125789165, + "epoch": 7.088154269972452, + "grad_norm": 4.3125, + "learning_rate": 3.077914851215585e-08, + "loss": 1.523323893547058, + "mean_token_accuracy": 0.669438187032938, + "num_tokens": 3312485.0, + "step": 489 + }, + { + "entropy": 1.3640289083123207, + "epoch": 7.102846648301194, + "grad_norm": 3.515625, + "learning_rate": 2.8272400439581514e-08, + "loss": 1.3801430463790894, + "mean_token_accuracy": 0.6815820559859276, + "num_tokens": 3319393.0, + "step": 490 + }, + { + "entropy": 1.3892018273472786, + "epoch": 7.117539026629935, + "grad_norm": 3.6875, + "learning_rate": 2.5871532950824395e-08, + "loss": 1.5527251958847046, + "mean_token_accuracy": 0.6826623827219009, + "num_tokens": 3326550.0, + "step": 491 + }, + { + "entropy": 1.2440132424235344, + "epoch": 7.132231404958677, + "grad_norm": 4.1875, + "learning_rate": 2.3576648891056876e-08, + "loss": 1.3222265243530273, + "mean_token_accuracy": 0.6882449053227901, + "num_tokens": 3332636.0, + "step": 
492 + }, + { + "entropy": 1.4525053799152374, + "epoch": 7.14692378328742, + "grad_norm": 3.890625, + "learning_rate": 2.1387846565474047e-08, + "loss": 1.398058533668518, + "mean_token_accuracy": 0.6715537309646606, + "num_tokens": 3340220.0, + "step": 493 + }, + { + "entropy": 1.3993450328707695, + "epoch": 7.161616161616162, + "grad_norm": 3.671875, + "learning_rate": 1.930521973508237e-08, + "loss": 1.3713880777359009, + "mean_token_accuracy": 0.6754298955202103, + "num_tokens": 3347587.0, + "step": 494 + }, + { + "entropy": 1.5341194830834866, + "epoch": 7.176308539944904, + "grad_norm": 4.0625, + "learning_rate": 1.732885761268427e-08, + "loss": 1.5632487535476685, + "mean_token_accuracy": 0.6606750525534153, + "num_tokens": 3354512.0, + "step": 495 + }, + { + "entropy": 1.3622602969408035, + "epoch": 7.191000918273645, + "grad_norm": 4.21875, + "learning_rate": 1.54588448590548e-08, + "loss": 1.3854639530181885, + "mean_token_accuracy": 0.6834001019597054, + "num_tokens": 3360290.0, + "step": 496 + }, + { + "entropy": 1.2942306697368622, + "epoch": 7.205693296602387, + "grad_norm": 3.78125, + "learning_rate": 1.3695261579316776e-08, + "loss": 1.2496094703674316, + "mean_token_accuracy": 0.7065734341740608, + "num_tokens": 3366921.0, + "step": 497 + }, + { + "entropy": 1.3717114739120007, + "epoch": 7.22038567493113, + "grad_norm": 3.609375, + "learning_rate": 1.2038183319507957e-08, + "loss": 1.334294319152832, + "mean_token_accuracy": 0.6929150782525539, + "num_tokens": 3373498.0, + "step": 498 + }, + { + "entropy": 1.5312567129731178, + "epoch": 7.235078053259872, + "grad_norm": 4.0, + "learning_rate": 1.0487681063345856e-08, + "loss": 1.564821481704712, + "mean_token_accuracy": 0.6466786749660969, + "num_tokens": 3380077.0, + "step": 499 + }, + { + "entropy": 1.3633756004273891, + "epoch": 7.249770431588614, + "grad_norm": 3.34375, + "learning_rate": 9.043821229186567e-09, + "loss": 1.402569055557251, + "mean_token_accuracy": 0.6758171431720257, + 
"num_tokens": 3387922.0, + "step": 500 + }, + { + "entropy": 1.2247029319405556, + "epoch": 7.264462809917355, + "grad_norm": 3.625, + "learning_rate": 7.70666566718009e-09, + "loss": 1.2212553024291992, + "mean_token_accuracy": 0.70748520642519, + "num_tokens": 3393752.0, + "step": 501 + }, + { + "entropy": 1.4086985550820827, + "epoch": 7.279155188246097, + "grad_norm": 3.9375, + "learning_rate": 6.476271656620237e-09, + "loss": 1.6283751726150513, + "mean_token_accuracy": 0.6691960953176022, + "num_tokens": 3400652.0, + "step": 502 + }, + { + "entropy": 1.4841664768755436, + "epoch": 7.293847566574839, + "grad_norm": 4.84375, + "learning_rate": 5.352691903491303e-09, + "loss": 1.4717917442321777, + "mean_token_accuracy": 0.6619565561413765, + "num_tokens": 3405519.0, + "step": 503 + }, + { + "entropy": 1.504446342587471, + "epoch": 7.308539944903581, + "grad_norm": 4.3125, + "learning_rate": 4.335974538210441e-09, + "loss": 1.6657609939575195, + "mean_token_accuracy": 0.6465577762573957, + "num_tokens": 3412942.0, + "step": 504 + }, + { + "entropy": 1.175526186823845, + "epoch": 7.3232323232323235, + "grad_norm": 3.953125, + "learning_rate": 3.4261631135654174e-09, + "loss": 1.1835148334503174, + "mean_token_accuracy": 0.7157127186655998, + "num_tokens": 3419831.0, + "step": 505 + }, + { + "entropy": 1.3945834636688232, + "epoch": 7.337924701561065, + "grad_norm": 3.53125, + "learning_rate": 2.623296602849712e-09, + "loss": 1.3837270736694336, + "mean_token_accuracy": 0.6788865961134434, + "num_tokens": 3427129.0, + "step": 506 + }, + { + "entropy": 1.2734551429748535, + "epoch": 7.352617079889807, + "grad_norm": 3.546875, + "learning_rate": 1.9274093981927476e-09, + "loss": 1.2432376146316528, + "mean_token_accuracy": 0.6965003944933414, + "num_tokens": 3434476.0, + "step": 507 + }, + { + "entropy": 1.2341192811727524, + "epoch": 7.367309458218549, + "grad_norm": 3.3125, + "learning_rate": 1.3385313090857888e-09, + "loss": 1.2809759378433228, + 
"mean_token_accuracy": 0.7019614204764366, + "num_tokens": 3441949.0, + "step": 508 + }, + { + "entropy": 1.2455067448318005, + "epoch": 7.382001836547291, + "grad_norm": 4.03125, + "learning_rate": 8.566875611068503e-10, + "loss": 1.1633739471435547, + "mean_token_accuracy": 0.7040832042694092, + "num_tokens": 3448515.0, + "step": 509 + }, + { + "entropy": 1.3598694279789925, + "epoch": 7.3966942148760335, + "grad_norm": 3.734375, + "learning_rate": 4.818987948379538e-10, + "loss": 1.4113589525222778, + "mean_token_accuracy": 0.6924026310443878, + "num_tokens": 3455770.0, + "step": 510 + }, + { + "entropy": 1.0381613001227379, + "epoch": 7.411386593204775, + "grad_norm": 2.90625, + "learning_rate": 2.1418106498249936e-10, + "loss": 1.0188246965408325, + "mean_token_accuracy": 0.7484844401478767, + "num_tokens": 3464219.0, + "step": 511 + }, + { + "entropy": 1.511953193694353, + "epoch": 7.426078971533517, + "grad_norm": 4.46875, + "learning_rate": 5.354583967692728e-11, + "loss": 1.4547315835952759, + "mean_token_accuracy": 0.6628812402486801, + "num_tokens": 3469540.0, + "step": 512 + }, + { + "epoch": 7.426078971533517, + "eval_entropy": 1.3383248895406723, + "eval_loss": 1.4060174226760864, + "eval_mean_token_accuracy": 0.6780907139182091, + "eval_num_tokens": 3469540.0, + "eval_runtime": 1.6771, + "eval_samples_per_second": 34.583, + "eval_steps_per_second": 4.77, + "step": 512 + } + ], + "logging_steps": 1, + "max_steps": 512, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 128, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.581762221338624e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}