From 85dd4b53c0d9b4bb084362b3087879bfb2baa2d8 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Fri, 29 May 2026 03:56:16 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: toroe/SmolLM-3B-Science-DE Source: Original Platform --- .gitattributes | 36 + README.md | 280 ++ chat_template.jinja | 94 + config.json | 108 + generation_config.json | 10 + model-00001-of-00003.safetensors | 3 + model-00002-of-00003.safetensors | 3 + model-00003-of-00003.safetensors | 3 + model.safetensors.index.json | 335 ++ optimizer.bin | 3 + pytorch_model_fsdp.bin | 3 + rng_state_0.pth | 3 + rng_state_1.pth | 3 + rng_state_2.pth | 3 + rng_state_3.pth | 3 + rng_state_4.pth | 3 + rng_state_5.pth | 3 + rng_state_6.pth | 3 + rng_state_7.pth | 3 + scheduler.pt | 3 + special_tokens_map.json | 16 + tokenizer.json | 3 + tokenizer_config.json | 2064 +++++++++++ trainer_state.json | 5488 ++++++++++++++++++++++++++++++ training_args.bin | 3 + 25 files changed, 8479 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model-00001-of-00003.safetensors create mode 100644 model-00002-of-00003.safetensors create mode 100644 model-00003-of-00003.safetensors create mode 100644 model.safetensors.index.json create mode 100644 optimizer.bin create mode 100644 pytorch_model_fsdp.bin create mode 100644 rng_state_0.pth create mode 100644 rng_state_1.pth create mode 100644 rng_state_2.pth create mode 100644 rng_state_3.pth create mode 100644 rng_state_4.pth create mode 100644 rng_state_5.pth create mode 100644 rng_state_6.pth create mode 100644 rng_state_7.pth create mode 100644 scheduler.pt create mode 100644 
special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..eb6e3aa 
--- /dev/null +++ b/README.md @@ -0,0 +1,280 @@ +--- +language: + - de +license: other +base_model: HuggingFaceTB/SmolLM3-3B +tags: + - sft + - instruction-tuning + - reasoning + - german + - multilingual + - long-context + - fsdp + - transformers +datasets: + - DGurgurov/Nemotron-Multilingual-Reasoning +metrics: + - token_accuracy +library_name: transformers +pipeline_tag: text-generation +--- + + # SmolLM3-3B — German Reasoning Instruction SFT (Nemotron Multilingual Reasoning) + + ## Model Description + + This model is a **Supervised Fine-Tuned (SFT)** version of: + + `HuggingFaceTB/SmolLM3-3B` + + It was fine-tuned on the **German (`de`) split** of the dataset: + + `DGurgurov/Nemotron-Multilingual-Reasoning` + + The goal of the training was to improve: + + - German instruction following +- Step-by-step reasoning +- Long-context conversation behavior + + The model was trained using chat-formatted conversations and **completion-only loss**, meaning only assistant responses contributed to optimization. 
+ + Key properties: + + - Base model: SmolLM3-3B +- Language specialization: German +- Context length during training: **16,384 tokens** +- Chat formatted dataset +- Long-context packing enabled + + --- + + ## Intended Uses + + ### Suitable For +- German conversational assistants +- Educational tutoring +- Reasoning and structured explanation tasks +- Long-document Q&A in German +- Research experiments with long-context small LLMs + + ### Not Suitable For +- Medical or legal advice without human review +- Autonomous decision-making +- Safety-critical systems +- High-stakes financial decisions + + --- + + ## Training Data + + Dataset used: + + `DGurgurov/Nemotron-Multilingual-Reasoning` + + Processing configuration: + + - Language filtering: **German only** +- Converted into chat messages (`prepare_messages=True`) +- Assistant-only optimization (`completion_only_loss=True`) + + Only the assistant responses were used to compute loss; user and system messages were masked. + + Please review the dataset card for provenance and limitations. + + --- + + ## Training Procedure + + Training was performed using **HuggingFace Accelerate with FSDP (Fully Sharded Data Parallel)** across 8 processes. 
+ + ### Core Setup + + - Training method: Supervised fine-tuning (SFT) +- Epochs: **3** +- Maximum sequence length: **16,384** +- Sequence packing: enabled +- Precision: **bfloat16** +- Kernel optimization: Liger kernel enabled +- Gradient checkpointing: enabled +- Distributed: FSDP (8 processes) + + --- + + ### Optimization + + - Optimizer: `adamw_torch_fused` +- Per-device batch size: 4 +- Gradient accumulation: 4 +- Effective batch size: 16 sequences per GPU per optimizer step (4 × 4), i.e. 128 sequences globally across the 8 FSDP processes +- Weight decay: 0.05 + + Learning rate schedule: + + - Scheduler: `cosine_with_min_lr` +- Warmup ratio: 0.05 +- Minimum LR: 5e-6 + + --- + + ### Logging & Checkpoints + + - Logging every 5 steps +- Checkpoint every 450 steps +- Weights & Biases tracking enabled +- Token accuracy logged during training + + --- + + ### Data Processing + + - Dataset workers: 16 +- Dataset preparation: enabled +- Chat message preparation: enabled +- German split: enabled + + --- + + ## Usage + + ### Transformers + + ```python +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch + + model_id = "toroe/SmolLM-3B-Science-DE" + + tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) +model = AutoModelForCausalLM.from_pretrained( + model_id, + device_map="auto", + torch_dtype=torch.bfloat16, +) + + messages = [ + {"role": "system", "content": "Du bist ein hilfreicher Assistent."}, + {"role": "user", "content": "Warum ist der Himmel blau?"} +] + + prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) +inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + + outputs = model.generate( + **inputs, + max_new_tokens=512, + temperature=0.7, + top_p=0.9, + do_sample=True +) + + print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` +**Important:** +You should use `apply_chat_template()` when prompting. The model was trained on chat-formatted conversations and performance will degrade without it. 
+ + --- + + ## Evaluation + + During training, **token accuracy** was logged as a diagnostic metric. + + Token accuracy: +- is useful for monitoring training stability +- is **NOT** a benchmark score +- does not represent real reasoning performance + + For proper evaluation, use: +- German instruction-following benchmarks +- reasoning datasets +- long-context evaluation tasks + + --- + + ## Limitations + + - May hallucinate facts +- Reasoning chains can still contain logical errors +- Performance near 16k context depends heavily on prompt structure +- Improvements mainly apply to German +- Smaller model size means weaker world knowledge than large LLMs +- Not aligned for safety-critical deployment + + --- + + ## Bias & Safety + + This model inherits biases from: +- the base model +- the training dataset + + Recommended mitigations: +- add moderation filters +- use system prompts enforcing safe behavior +- include human review for sensitive deployments + + --- + + ## License + + This model is a derivative of: + + `HuggingFaceTB/SmolLM3-3B` + + Therefore, the original base model license and usage restrictions apply, along with any dataset terms. + + Verify compatibility before commercial deployment. 
+ + --- + + ## Reproducibility (Training Arguments) + + ```text +accelerate launch --use_fsdp --num_processes 8 --config_file sft/my_config.yaml sft/sft_trainer.py + + --model_name HuggingFaceTB/SmolLM3-3B +--tokenizer_name HuggingFaceTB/SmolLM3-3B +--dataset_path DGurgurov/Nemotron-Multilingual-Reasoning +--skip_prepare_dataset False +--lang_split de +--prepare_messages True +--completion_only_loss True +--max_length 16384 +--dataset_num_proc 16 +--packing True +--use_liger_kernel True +--bf16 True +--log_token_accuracy True +--optim adamw_torch_fused +--gradient_checkpointing True +--per_device_train_batch_size 4 +--gradient_accumulation_steps 4 +--ddp_find_unused_parameters False +--lr_scheduler_type cosine_with_min_lr +--lr_scheduler_kwargs {"min_lr": 5.0e-6} +--warmup_ratio 0.05 +--weight_decay 0.05 +--report_to wandb +--run_name smol_3b_3epochs_lns_de +--num_train_epochs 3 +--save_strategy steps +--logging_steps 5 +--save_steps 450 +``` +--- + + ## Citation + + If you use this model, please cite: + + - `HuggingFaceTB/SmolLM3-3B` +- `DGurgurov/Nemotron-Multilingual-Reasoning` + + --- + + ## Acknowledgements + + - HuggingFaceTB — SmolLM3 base model +- Nemotron Multilingual Reasoning dataset authors +- HuggingFace Accelerate and Transformers libraries \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..e01e3a1 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,94 @@ +{# ───── defaults ───── #} +{%- if enable_thinking is not defined -%} +{%- set enable_thinking = true -%} +{%- endif -%} + +{# ───── reasoning mode ───── #} +{%- if enable_thinking -%} + {%- set reasoning_mode = "/think" -%} +{%- else -%} + {%- set reasoning_mode = "/no_think" -%} +{%- endif -%} + +{# ───── header (system message) ───── #} +{{- "<|im_start|>system\n" -}} + +{%- if messages[0].role == "system" -%} + {%- set system_message = messages[0].content -%} + {%- if "/no_think" in system_message -%} + {%- set reasoning_mode = 
"/no_think" -%} + {%- elif "/think" in system_message -%} + {%- set reasoning_mode = "/think" -%} + {%- endif -%} + {%- set custom_instructions = system_message.replace("/no_think", "").replace("/think", "").rstrip() -%} +{%- endif -%} + +{%- if "/system_override" in system_message -%} + {{- custom_instructions.replace("/system_override", "").rstrip() -}} + {{- "<|im_end|>\n" -}} +{%- else -%} + {{- "## Metadata\n\n" -}} + {{- "Knowledge Cutoff Date: June 2025\n" -}} + {%- set today = strftime_now("%d %B %Y") -%} + {{- "Today Date: " ~ today ~ "\n" -}} + {{- "Reasoning Mode: " + reasoning_mode + "\n\n" -}} + + {{- "## Custom Instructions\n\n" -}} + {%- if custom_instructions -%} + {{- custom_instructions + "\n\n" -}} + {%- elif reasoning_mode == "/think" -%} + {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracking, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> Thought section </think> Solution section. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. 
The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion.\n\n" -}} + {%- else -%} + {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face.\n\n" -}} + {%- endif -%} + + {%- if xml_tools or python_tools or tools -%} + {{- "### Tools\n\n" -}} + {%- if xml_tools or tools -%} + {%- if tools -%} + {%- set xml_tools = tools -%} + {%- endif -%} + {%- set ns = namespace(xml_tool_string="You may call one or more functions to assist with the user query.\nYou are provided with function signatures within <tools></tools> XML tags:\n\n<tools>\n") -%} + {%- for tool in xml_tools[:] -%} {# The slicing makes sure that xml_tools is a list #} + {%- set ns.xml_tool_string = ns.xml_tool_string ~ (tool | string) ~ "\n" -%} + {%- endfor -%} + {%- set xml_tool_string = ns.xml_tool_string + "</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>" -%} + {{- xml_tool_string -}} + {%- endif -%} + {%- if python_tools -%} + {%- set ns = namespace(python_tool_string="When you send a message containing Python code between '<code>' and '</code>' tags, it will be executed in a stateful Jupyter notebook environment, and you will then be given the output to continued reasoning in an agentic loop.\n\nYou can use the following tools in your python code like regular functions:\n<tools>\n") -%} + {%- for tool in python_tools[:] -%} {# The slicing makes sure that python_tools is a list #} + {%- set ns.python_tool_string = ns.python_tool_string ~ (tool | string) ~ "\n" -%} + {%- endfor -%} + {%- set python_tool_string = ns.python_tool_string + "</tools>\n\nThe state persists between code executions: so variables that you define in one step are still available thereafter." 
-%} + {{- python_tool_string -}} + {%- endif -%} + {{- "\n\n" -}} + {{- "<|im_end|>\n" -}} + {%- endif -%} +{%- endif -%} +{# ───── main loop ───── #} +{%- for message in messages -%} + {%- set content = message.content if message.content is string else "" -%} + {%- if message.role == "user" -%} + {{ "<|im_start|>" + message.role + "\n" + content + "<|im_end|>\n" }} + {%- elif message.role == "assistant" -%} + {% generation %} + {%- if reasoning_mode == "/think" -%} + {{ "<|im_start|>assistant\n" + content.lstrip("\n") + "<|im_end|>\n" }} + {%- else -%} + {{ "<|im_start|>assistant\n" + "\n\n\n" + content.lstrip("\n") + "<|im_end|>\n" }} + {%- endif -%} + {% endgeneration %} + {%- elif message.role == "tool" -%} + {{ "<|im_start|>" + "user\n" + content + "<|im_end|>\n" }} + {%- endif -%} +{%- endfor -%} +{# ───── generation prompt ───── #} +{%- if add_generation_prompt -%} + {%- if reasoning_mode == "/think" -%} + {{ "<|im_start|>assistant\n" }} + {%- else -%} + {{ "<|im_start|>assistant\n" + "\n\n\n" }} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..001258f --- /dev/null +++ b/config.json @@ -0,0 +1,108 @@ +{ + "architectures": [ + "SmolLM3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "float32", + "eos_token_id": 128012, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 11008, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + 
"full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 65536, + "max_window_layers": 28, + "mlp_bias": false, + "model_type": "smollm3", + "no_rope_layer_interval": 4, + "no_rope_layers": [ + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0 + ], + "num_attention_heads": 16, + "num_hidden_layers": 36, + "num_key_value_heads": 4, + "pad_token_id": 128012, + "pretraining_tp": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 5000000.0, + "sliding_window": null, + "transformers_version": "4.57.0", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..43e5ef6 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,10 @@ +{ + "do_sample": true, + "eos_token_id": [ + 128012 + ], + "pad_token_id": 128012, + "temperature": 0.6, + "top_p": 0.95, + "transformers_version": "4.57.0" +} diff --git a/model-00001-of-00003.safetensors b/model-00001-of-00003.safetensors new file mode 100644 index 0000000..99ff52c --- /dev/null +++ b/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98f23a714e8e3aad1a1e6188401d863dcba73c837d691989efd6a9900a9ba51e +size 4932711224 diff --git a/model-00002-of-00003.safetensors b/model-00002-of-00003.safetensors new file mode 100644 index 0000000..15aca90 --- /dev/null +++ b/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:117007cda11c61011215ad10bf4f0549ee11678907a9893ca1911a1ac51c777b +size 4999889128 diff --git a/model-00003-of-00003.safetensors 
b/model-00003-of-00003.safetensors new file mode 100644 index 0000000..ae73ee6 --- /dev/null +++ b/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24b12aad6c88f0b8a47767229bab43c90a5c01b463302e5b39b1c947498aca53 +size 3418504984 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..ffbd34f --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,335 @@ +{ + "metadata": { + "total_parameters": 384387328, + "total_size": 13351067648 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": 
"model-00002-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + 
"model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.v_proj.weight": 
"model-00002-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors", 
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": 
"model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + 
"model.layers.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.32.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.32.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + 
"model.layers.32.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.33.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.33.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.33.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.34.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.34.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.34.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.35.input_layernorm.weight": 
"model-00003-of-00003.safetensors", + "model.layers.35.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.35.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.35.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.norm.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/optimizer.bin b/optimizer.bin new file mode 100644 index 0000000..4d9bce1 --- /dev/null +++ b/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62c5cee1bc4642cc9a791b9c1200dbc7d9a005504ec182096f9df09a3b40ef8a +size 24601100995 diff --git a/pytorch_model_fsdp.bin b/pytorch_model_fsdp.bin new file mode 100644 index 0000000..8a988a7 --- /dev/null +++ b/pytorch_model_fsdp.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee9db1c68a1486d24104f7f7df7f558c188f09e1d386a5d02a24f0e11d8de04c +size 13351232180 diff --git a/rng_state_0.pth b/rng_state_0.pth new file mode 100644 index 0000000..af3da00 --- /dev/null +++ b/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b093dfe59b41efeb45cc3d628d3360abaa2303bbaa489081411faf431e52941d +size 16389 diff --git a/rng_state_1.pth b/rng_state_1.pth new file mode 100644 index 
0000000..d70d4a2 --- /dev/null +++ b/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:450a0ac1645503c0b14fe9c37d77060cc76b1c9942dcfdd0e779cd526b2e98d9 +size 16389 diff --git a/rng_state_2.pth b/rng_state_2.pth new file mode 100644 index 0000000..8ac8ed7 --- /dev/null +++ b/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:938b37918eac9a4cbef3805f7d2abdcef094a334f848e73ac19fcdc39d38663a +size 16389 diff --git a/rng_state_3.pth b/rng_state_3.pth new file mode 100644 index 0000000..4fd4b11 --- /dev/null +++ b/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8b27a54988f134299ab296b95e8c1e63d476dffdba7c6f120f2076e8688f355 +size 16389 diff --git a/rng_state_4.pth b/rng_state_4.pth new file mode 100644 index 0000000..1976284 --- /dev/null +++ b/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d95f73d920296d5d9558e47894c5a2c0d649d7cb10a3b07a013d6bfbd3b8cf90 +size 16389 diff --git a/rng_state_5.pth b/rng_state_5.pth new file mode 100644 index 0000000..a187a68 --- /dev/null +++ b/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70b945bb634c9daf4a00433296ecc5245b34a2b5f09017993b5f5f03b84dabea +size 16389 diff --git a/rng_state_6.pth b/rng_state_6.pth new file mode 100644 index 0000000..5ff5924 --- /dev/null +++ b/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfdd1fca0dace16a59c8592c531a70661218184bb0249c5862bbfb5ab0844fc9 +size 16389 diff --git a/rng_state_7.pth b/rng_state_7.pth new file mode 100644 index 0000000..4396c8f --- /dev/null +++ b/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d106363f9f1b0ff898c86d083a097bf22fd84de35e5670aa299504abcc99752a +size 16389 diff --git a/scheduler.pt b/scheduler.pt new file mode 100644 index 0000000..f388c63 --- /dev/null +++ b/scheduler.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:8dfc927859b95b185390f63bc27c1e0c41086b3c66aec5bf0d42c28c8979ed70 +size 1465 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..190d562 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,16 @@ +{ + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..074fa66 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b6a500b662a34eb3f0374db856ba4ad7de4c81040571d78dc0d357238930005 +size 17208819 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..61910c2 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128003": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": 
"<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128014": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128015": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128016": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128017": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128018": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, 
+ "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": 
"<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": 
"<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": 
"<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": null, + 
"clean_up_tokenization_spaces": true, + "eos_token": "<|im_end|>", + "extra_special_tokens": {}, + "fast": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|im_end|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..48bc236 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,5488 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 3030, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0049517207229512255, + "grad_norm": 4.494762897491455, + "learning_rate": 5.263157894736843e-07, + "loss": 1.0863, + "mean_token_accuracy": 0.7343291759490966, + "num_tokens": 10417883.0, + "step": 5 + }, + { + "epoch": 0.009903441445902451, + "grad_norm": 3.2665634155273438, + "learning_rate": 1.1842105263157894e-06, + "loss": 1.076, + "mean_token_accuracy": 0.7338468879461288, + "num_tokens": 20806975.0, + "step": 10 + }, + { + "epoch": 0.014855162168853677, + "grad_norm": 1.024572491645813, + "learning_rate": 1.8421052631578948e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.7371937811374665, + "num_tokens": 31224276.0, + "step": 15 + }, + { + "epoch": 0.019806882891804902, + "grad_norm": 1.0827839374542236, + "learning_rate": 2.5e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7427578687667846, + "num_tokens": 41600322.0, + "step": 20 + }, + { + "epoch": 0.02475860361475613, + "grad_norm": 0.7391007542610168, + "learning_rate": 3.157894736842105e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7464866191148758, + "num_tokens": 51996625.0, + "step": 25 + }, + { + "epoch": 0.029710324337707355, + "grad_norm": 0.4652908146381378, + "learning_rate": 3.815789473684211e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7493666768074035, + "num_tokens": 
62376576.0, + "step": 30 + }, + { + "epoch": 0.03466204506065858, + "grad_norm": 0.3708527982234955, + "learning_rate": 4.473684210526316e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7504437863826752, + "num_tokens": 72739525.0, + "step": 35 + }, + { + "epoch": 0.039613765783609804, + "grad_norm": 0.3008730709552765, + "learning_rate": 5.131578947368422e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7509326428174973, + "num_tokens": 83158379.0, + "step": 40 + }, + { + "epoch": 0.04456548650656103, + "grad_norm": 12.92960262298584, + "learning_rate": 5.789473684210527e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7486720085144043, + "num_tokens": 93561279.0, + "step": 45 + }, + { + "epoch": 0.04951720722951226, + "grad_norm": 0.2265157252550125, + "learning_rate": 6.447368421052632e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7535740584135056, + "num_tokens": 103975900.0, + "step": 50 + }, + { + "epoch": 0.05446892795246348, + "grad_norm": 0.22998610138893127, + "learning_rate": 7.1052631578947375e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7536993026733398, + "num_tokens": 114358295.0, + "step": 55 + }, + { + "epoch": 0.05942064867541471, + "grad_norm": 0.20382240414619446, + "learning_rate": 7.763157894736843e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.758358484506607, + "num_tokens": 124741874.0, + "step": 60 + }, + { + "epoch": 0.06437236939836594, + "grad_norm": 0.20741654932498932, + "learning_rate": 8.421052631578948e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7618912905454636, + "num_tokens": 135125198.0, + "step": 65 + }, + { + "epoch": 0.06932409012131716, + "grad_norm": 0.19601891934871674, + "learning_rate": 9.078947368421054e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7635368853807449, + "num_tokens": 145516638.0, + "step": 70 + }, + { + "epoch": 0.07427581084426839, + "grad_norm": 0.1902785450220108, + "learning_rate": 9.736842105263159e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7641876369714737, + 
"num_tokens": 155902199.0, + "step": 75 + }, + { + "epoch": 0.07922753156721961, + "grad_norm": 0.21357858180999756, + "learning_rate": 1.0394736842105264e-05, + "loss": 0.8855, + "mean_token_accuracy": 0.7601119011640549, + "num_tokens": 166260992.0, + "step": 80 + }, + { + "epoch": 0.08417925229017083, + "grad_norm": 0.21855609118938446, + "learning_rate": 1.105263157894737e-05, + "loss": 0.8808, + "mean_token_accuracy": 0.7608364224433899, + "num_tokens": 176655378.0, + "step": 85 + }, + { + "epoch": 0.08913097301312206, + "grad_norm": 0.22869279980659485, + "learning_rate": 1.1710526315789475e-05, + "loss": 0.8838, + "mean_token_accuracy": 0.7606816172599793, + "num_tokens": 187049412.0, + "step": 90 + }, + { + "epoch": 0.09408269373607328, + "grad_norm": 0.21363244950771332, + "learning_rate": 1.236842105263158e-05, + "loss": 0.8668, + "mean_token_accuracy": 0.7635636687278747, + "num_tokens": 197428098.0, + "step": 95 + }, + { + "epoch": 0.09903441445902451, + "grad_norm": 0.2309870570898056, + "learning_rate": 1.3026315789473684e-05, + "loss": 0.8568, + "mean_token_accuracy": 0.7659854501485824, + "num_tokens": 207818677.0, + "step": 100 + }, + { + "epoch": 0.10398613518197573, + "grad_norm": 0.23947252333164215, + "learning_rate": 1.3684210526315791e-05, + "loss": 0.8636, + "mean_token_accuracy": 0.7642275482416153, + "num_tokens": 218213873.0, + "step": 105 + }, + { + "epoch": 0.10893785590492697, + "grad_norm": 0.247147336602211, + "learning_rate": 1.4342105263157895e-05, + "loss": 0.8585, + "mean_token_accuracy": 0.7652702659368515, + "num_tokens": 228635598.0, + "step": 110 + }, + { + "epoch": 0.11388957662787819, + "grad_norm": 0.2094859629869461, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.8672, + "mean_token_accuracy": 0.7633111596107482, + "num_tokens": 239063783.0, + "step": 115 + }, + { + "epoch": 0.11884129735082942, + "grad_norm": 0.33069688081741333, + "learning_rate": 1.5657894736842107e-05, + "loss": 0.8528, + 
"mean_token_accuracy": 0.7691669374704361, + "num_tokens": 249470214.0, + "step": 120 + }, + { + "epoch": 0.12379301807378064, + "grad_norm": 0.2780938446521759, + "learning_rate": 1.6315789473684213e-05, + "loss": 0.8464, + "mean_token_accuracy": 0.7677050292491913, + "num_tokens": 259887336.0, + "step": 125 + }, + { + "epoch": 0.12874473879673187, + "grad_norm": 0.22850839793682098, + "learning_rate": 1.6973684210526318e-05, + "loss": 0.8364, + "mean_token_accuracy": 0.7698385208845139, + "num_tokens": 270284999.0, + "step": 130 + }, + { + "epoch": 0.1336964595196831, + "grad_norm": 0.2590622305870056, + "learning_rate": 1.763157894736842e-05, + "loss": 0.8543, + "mean_token_accuracy": 0.764978775382042, + "num_tokens": 280680904.0, + "step": 135 + }, + { + "epoch": 0.1386481802426343, + "grad_norm": 0.21655666828155518, + "learning_rate": 1.828947368421053e-05, + "loss": 0.8441, + "mean_token_accuracy": 0.7674482077360153, + "num_tokens": 291077727.0, + "step": 140 + }, + { + "epoch": 0.14359990096558553, + "grad_norm": 0.27917516231536865, + "learning_rate": 1.894736842105263e-05, + "loss": 0.8314, + "mean_token_accuracy": 0.7709874927997589, + "num_tokens": 301498523.0, + "step": 145 + }, + { + "epoch": 0.14855162168853678, + "grad_norm": 0.2864488661289215, + "learning_rate": 1.960526315789474e-05, + "loss": 0.8347, + "mean_token_accuracy": 0.769895127415657, + "num_tokens": 311908148.0, + "step": 150 + }, + { + "epoch": 0.153503342411488, + "grad_norm": 0.24762648344039917, + "learning_rate": 1.9999982126514483e-05, + "loss": 0.8387, + "mean_token_accuracy": 0.7687274873256683, + "num_tokens": 322322809.0, + "step": 155 + }, + { + "epoch": 0.15845506313443922, + "grad_norm": 0.23356765508651733, + "learning_rate": 1.9999781050780763e-05, + "loss": 0.8111, + "mean_token_accuracy": 0.7753471970558167, + "num_tokens": 332741557.0, + "step": 160 + }, + { + "epoch": 0.16340678385739044, + "grad_norm": 0.22937612235546112, + "learning_rate": 1.999935656346626e-05, 
+ "loss": 0.8395, + "mean_token_accuracy": 0.7681750059127808, + "num_tokens": 343139709.0, + "step": 165 + }, + { + "epoch": 0.16835850458034166, + "grad_norm": 0.23504067957401276, + "learning_rate": 1.999870867721605e-05, + "loss": 0.8159, + "mean_token_accuracy": 0.7737643718719482, + "num_tokens": 353571629.0, + "step": 170 + }, + { + "epoch": 0.1733102253032929, + "grad_norm": 0.2992646396160126, + "learning_rate": 1.9997837411330073e-05, + "loss": 0.8384, + "mean_token_accuracy": 0.7703067153692246, + "num_tokens": 363981542.0, + "step": 175 + }, + { + "epoch": 0.17826194602624412, + "grad_norm": 0.2727688252925873, + "learning_rate": 1.999674279176254e-05, + "loss": 0.8161, + "mean_token_accuracy": 0.7731374055147171, + "num_tokens": 374352478.0, + "step": 180 + }, + { + "epoch": 0.18321366674919534, + "grad_norm": 0.3550196886062622, + "learning_rate": 1.9995424851121163e-05, + "loss": 0.8287, + "mean_token_accuracy": 0.7716741412878036, + "num_tokens": 384752657.0, + "step": 185 + }, + { + "epoch": 0.18816538747214656, + "grad_norm": 0.2896004319190979, + "learning_rate": 1.999388362866618e-05, + "loss": 0.8149, + "mean_token_accuracy": 0.7733279794454575, + "num_tokens": 395152658.0, + "step": 190 + }, + { + "epoch": 0.1931171081950978, + "grad_norm": 0.26007845997810364, + "learning_rate": 1.9992119170309188e-05, + "loss": 0.8236, + "mean_token_accuracy": 0.7709115266799926, + "num_tokens": 405574496.0, + "step": 195 + }, + { + "epoch": 0.19806882891804903, + "grad_norm": 0.2737730145454407, + "learning_rate": 1.9990131528611783e-05, + "loss": 0.8064, + "mean_token_accuracy": 0.7754090011119843, + "num_tokens": 415954581.0, + "step": 200 + }, + { + "epoch": 0.20302054964100025, + "grad_norm": 0.2782779037952423, + "learning_rate": 1.9987920762783978e-05, + "loss": 0.8066, + "mean_token_accuracy": 0.7754531413316726, + "num_tokens": 426347583.0, + "step": 205 + }, + { + "epoch": 0.20797227036395147, + "grad_norm": 0.2829101085662842, + "learning_rate": 
1.998548693868246e-05, + "loss": 0.8159, + "mean_token_accuracy": 0.7728827208280563, + "num_tokens": 436775593.0, + "step": 210 + }, + { + "epoch": 0.2129239910869027, + "grad_norm": 0.25524118542671204, + "learning_rate": 1.998283012880861e-05, + "loss": 0.8112, + "mean_token_accuracy": 0.7736958414316177, + "num_tokens": 447193940.0, + "step": 215 + }, + { + "epoch": 0.21787571180985393, + "grad_norm": 0.25364068150520325, + "learning_rate": 1.9979950412306347e-05, + "loss": 0.8159, + "mean_token_accuracy": 0.7723533689975739, + "num_tokens": 457607812.0, + "step": 220 + }, + { + "epoch": 0.22282743253280515, + "grad_norm": 0.23794567584991455, + "learning_rate": 1.9976847874959783e-05, + "loss": 0.815, + "mean_token_accuracy": 0.772477874159813, + "num_tokens": 468019473.0, + "step": 225 + }, + { + "epoch": 0.22777915325575637, + "grad_norm": 0.20496641099452972, + "learning_rate": 1.9973522609190644e-05, + "loss": 0.8244, + "mean_token_accuracy": 0.7698849588632584, + "num_tokens": 478428340.0, + "step": 230 + }, + { + "epoch": 0.2327308739787076, + "grad_norm": 0.20776529610157013, + "learning_rate": 1.996997471405555e-05, + "loss": 0.7983, + "mean_token_accuracy": 0.7768283516168595, + "num_tokens": 488823476.0, + "step": 235 + }, + { + "epoch": 0.23768259470165884, + "grad_norm": 0.21405838429927826, + "learning_rate": 1.9966204295243027e-05, + "loss": 0.8111, + "mean_token_accuracy": 0.7732695490121841, + "num_tokens": 499220039.0, + "step": 240 + }, + { + "epoch": 0.24263431542461006, + "grad_norm": 0.27764108777046204, + "learning_rate": 1.9962211465070386e-05, + "loss": 0.824, + "mean_token_accuracy": 0.7728296130895614, + "num_tokens": 509612333.0, + "step": 245 + }, + { + "epoch": 0.24758603614756128, + "grad_norm": 0.2567944824695587, + "learning_rate": 1.9957996342480367e-05, + "loss": 0.7987, + "mean_token_accuracy": 0.7764184325933456, + "num_tokens": 520026049.0, + "step": 250 + }, + { + "epoch": 0.2525377568705125, + "grad_norm": 
0.283932626247406, + "learning_rate": 1.995355905303759e-05, + "loss": 0.7872, + "mean_token_accuracy": 0.779523742198944, + "num_tokens": 530417759.0, + "step": 255 + }, + { + "epoch": 0.25748947759346374, + "grad_norm": 0.22360840439796448, + "learning_rate": 1.9948899728924837e-05, + "loss": 0.8079, + "mean_token_accuracy": 0.7741246551275254, + "num_tokens": 540815169.0, + "step": 260 + }, + { + "epoch": 0.26244119831641494, + "grad_norm": 0.5189192891120911, + "learning_rate": 1.9944018508939086e-05, + "loss": 0.8065, + "mean_token_accuracy": 0.7760889768600464, + "num_tokens": 551209699.0, + "step": 265 + }, + { + "epoch": 0.2673929190393662, + "grad_norm": 0.34829357266426086, + "learning_rate": 1.9938915538487387e-05, + "loss": 0.8066, + "mean_token_accuracy": 0.7739049255847931, + "num_tokens": 561624183.0, + "step": 270 + }, + { + "epoch": 0.27234463976231743, + "grad_norm": 0.40409669280052185, + "learning_rate": 1.993359096958254e-05, + "loss": 0.7932, + "mean_token_accuracy": 0.7777039527893066, + "num_tokens": 572002532.0, + "step": 275 + }, + { + "epoch": 0.2772963604852686, + "grad_norm": 0.6105888485908508, + "learning_rate": 1.9928044960838556e-05, + "loss": 0.8167, + "mean_token_accuracy": 0.7729485094547272, + "num_tokens": 582426743.0, + "step": 280 + }, + { + "epoch": 0.28224808120821987, + "grad_norm": 0.2834183871746063, + "learning_rate": 1.9922277677465934e-05, + "loss": 0.7975, + "mean_token_accuracy": 0.7765595734119415, + "num_tokens": 592847130.0, + "step": 285 + }, + { + "epoch": 0.28719980193117106, + "grad_norm": 0.3645786643028259, + "learning_rate": 1.9916289291266744e-05, + "loss": 0.79, + "mean_token_accuracy": 0.778738671541214, + "num_tokens": 603265287.0, + "step": 290 + }, + { + "epoch": 0.2921515226541223, + "grad_norm": 0.29506251215934753, + "learning_rate": 1.9910079980629505e-05, + "loss": 0.8055, + "mean_token_accuracy": 0.7779126793146134, + "num_tokens": 613676977.0, + "step": 295 + }, + { + "epoch": 
0.29710324337707356, + "grad_norm": 3.3045666217803955, + "learning_rate": 1.9903649930523868e-05, + "loss": 0.7906, + "mean_token_accuracy": 0.7783419877290726, + "num_tokens": 624102322.0, + "step": 300 + }, + { + "epoch": 0.30205496410002475, + "grad_norm": 0.363760769367218, + "learning_rate": 1.9896999332495116e-05, + "loss": 0.794, + "mean_token_accuracy": 0.7770331531763077, + "num_tokens": 634476562.0, + "step": 305 + }, + { + "epoch": 0.307006684822976, + "grad_norm": 1.9502851963043213, + "learning_rate": 1.9890128384658442e-05, + "loss": 0.8748, + "mean_token_accuracy": 0.772979810833931, + "num_tokens": 644878697.0, + "step": 310 + }, + { + "epoch": 0.3119584055459272, + "grad_norm": 0.45439156889915466, + "learning_rate": 1.9883037291693072e-05, + "loss": 0.801, + "mean_token_accuracy": 0.7750454843044281, + "num_tokens": 655255275.0, + "step": 315 + }, + { + "epoch": 0.31691012626887843, + "grad_norm": 0.2824910283088684, + "learning_rate": 1.9875726264836138e-05, + "loss": 0.7944, + "mean_token_accuracy": 0.7767194181680679, + "num_tokens": 665660530.0, + "step": 320 + }, + { + "epoch": 0.3218618469918297, + "grad_norm": 0.31275248527526855, + "learning_rate": 1.9868195521876413e-05, + "loss": 0.7855, + "mean_token_accuracy": 0.7794755816459655, + "num_tokens": 676083912.0, + "step": 325 + }, + { + "epoch": 0.3268135677147809, + "grad_norm": 0.3284243643283844, + "learning_rate": 1.9860445287147798e-05, + "loss": 0.8037, + "mean_token_accuracy": 0.7744458526372909, + "num_tokens": 686500722.0, + "step": 330 + }, + { + "epoch": 0.3317652884377321, + "grad_norm": 0.24745342135429382, + "learning_rate": 1.9852475791522662e-05, + "loss": 0.7739, + "mean_token_accuracy": 0.7818856418132782, + "num_tokens": 696909303.0, + "step": 335 + }, + { + "epoch": 0.3367170091606833, + "grad_norm": 0.21039633452892303, + "learning_rate": 1.9844287272404952e-05, + "loss": 0.7936, + "mean_token_accuracy": 0.7766739934682846, + "num_tokens": 707315048.0, + "step": 340 + 
}, + { + "epoch": 0.34166872988363456, + "grad_norm": 0.2070273607969284, + "learning_rate": 1.9835879973723123e-05, + "loss": 0.8042, + "mean_token_accuracy": 0.7737419903278351, + "num_tokens": 717701231.0, + "step": 345 + }, + { + "epoch": 0.3466204506065858, + "grad_norm": 0.224740132689476, + "learning_rate": 1.9827254145922865e-05, + "loss": 0.7765, + "mean_token_accuracy": 0.7810128808021546, + "num_tokens": 728069468.0, + "step": 350 + }, + { + "epoch": 0.351572171329537, + "grad_norm": 0.21241162717342377, + "learning_rate": 1.9818410045959663e-05, + "loss": 0.7802, + "mean_token_accuracy": 0.7799350917339325, + "num_tokens": 738497629.0, + "step": 355 + }, + { + "epoch": 0.35652389205248824, + "grad_norm": 0.26337409019470215, + "learning_rate": 1.980934793729112e-05, + "loss": 0.779, + "mean_token_accuracy": 0.7800916910171509, + "num_tokens": 748894797.0, + "step": 360 + }, + { + "epoch": 0.3614756127754395, + "grad_norm": 0.21925003826618195, + "learning_rate": 1.9800068089869118e-05, + "loss": 0.7959, + "mean_token_accuracy": 0.7759343236684799, + "num_tokens": 759315710.0, + "step": 365 + }, + { + "epoch": 0.3664273334983907, + "grad_norm": 0.19071699678897858, + "learning_rate": 1.979057078013178e-05, + "loss": 0.7867, + "mean_token_accuracy": 0.7781553447246552, + "num_tokens": 769688709.0, + "step": 370 + }, + { + "epoch": 0.37137905422134193, + "grad_norm": 0.19490298628807068, + "learning_rate": 1.978085629099523e-05, + "loss": 0.7846, + "mean_token_accuracy": 0.7785351812839508, + "num_tokens": 780101279.0, + "step": 375 + }, + { + "epoch": 0.3763307749442931, + "grad_norm": 0.20952484011650085, + "learning_rate": 1.977092491184516e-05, + "loss": 0.7948, + "mean_token_accuracy": 0.7759048402309418, + "num_tokens": 790491283.0, + "step": 380 + }, + { + "epoch": 0.38128249566724437, + "grad_norm": 0.22583240270614624, + "learning_rate": 1.9760776938528233e-05, + "loss": 0.7779, + "mean_token_accuracy": 0.7805261939764023, + "num_tokens": 
800875319.0, + "step": 385 + }, + { + "epoch": 0.3862342163901956, + "grad_norm": 0.21482065320014954, + "learning_rate": 1.9750412673343237e-05, + "loss": 0.7886, + "mean_token_accuracy": 0.7774913161993027, + "num_tokens": 811283375.0, + "step": 390 + }, + { + "epoch": 0.3911859371131468, + "grad_norm": 0.22041663527488708, + "learning_rate": 1.9739832425032098e-05, + "loss": 0.7808, + "mean_token_accuracy": 0.7795657008886338, + "num_tokens": 821657607.0, + "step": 395 + }, + { + "epoch": 0.39613765783609806, + "grad_norm": 0.2266547530889511, + "learning_rate": 1.9729036508770684e-05, + "loss": 0.7776, + "mean_token_accuracy": 0.7801724016666413, + "num_tokens": 832048749.0, + "step": 400 + }, + { + "epoch": 0.40108937855904925, + "grad_norm": 0.2227035015821457, + "learning_rate": 1.9718025246159414e-05, + "loss": 0.7841, + "mean_token_accuracy": 0.77851602435112, + "num_tokens": 842471488.0, + "step": 405 + }, + { + "epoch": 0.4060410992820005, + "grad_norm": 0.23665013909339905, + "learning_rate": 1.970679896521368e-05, + "loss": 0.7849, + "mean_token_accuracy": 0.7782983154058456, + "num_tokens": 852876590.0, + "step": 410 + }, + { + "epoch": 0.41099282000495174, + "grad_norm": 0.19461902976036072, + "learning_rate": 1.9695358000354063e-05, + "loss": 0.7755, + "mean_token_accuracy": 0.7806118249893188, + "num_tokens": 863284557.0, + "step": 415 + }, + { + "epoch": 0.41594454072790293, + "grad_norm": 0.20683637261390686, + "learning_rate": 1.968370269239638e-05, + "loss": 0.7735, + "mean_token_accuracy": 0.7812349110841751, + "num_tokens": 873712187.0, + "step": 420 + }, + { + "epoch": 0.4208962614508542, + "grad_norm": 0.18914039433002472, + "learning_rate": 1.967183338854155e-05, + "loss": 0.7769, + "mean_token_accuracy": 0.7802020996809006, + "num_tokens": 884107732.0, + "step": 425 + }, + { + "epoch": 0.4258479821738054, + "grad_norm": 0.20901145040988922, + "learning_rate": 1.9659750442365207e-05, + "loss": 0.7775, + "mean_token_accuracy": 
0.7796809077262878, + "num_tokens": 894519434.0, + "step": 430 + }, + { + "epoch": 0.4307997028967566, + "grad_norm": 0.19267426431179047, + "learning_rate": 1.964745421380722e-05, + "loss": 0.785, + "mean_token_accuracy": 0.7779094964265824, + "num_tokens": 904926587.0, + "step": 435 + }, + { + "epoch": 0.43575142361970787, + "grad_norm": 0.21364259719848633, + "learning_rate": 1.9634945069160915e-05, + "loss": 0.7784, + "mean_token_accuracy": 0.779586797952652, + "num_tokens": 915329182.0, + "step": 440 + }, + { + "epoch": 0.44070314434265906, + "grad_norm": 0.22054027020931244, + "learning_rate": 1.9622223381062216e-05, + "loss": 0.7799, + "mean_token_accuracy": 0.7788717359304428, + "num_tokens": 925694666.0, + "step": 445 + }, + { + "epoch": 0.4456548650656103, + "grad_norm": 10.010891914367676, + "learning_rate": 1.960928952847851e-05, + "loss": 1.0615, + "mean_token_accuracy": 0.7801062256097794, + "num_tokens": 936081193.0, + "step": 450 + }, + { + "epoch": 0.4506065857885615, + "grad_norm": 0.2595425546169281, + "learning_rate": 1.9596143896697377e-05, + "loss": 0.7803, + "mean_token_accuracy": 0.7792818278074265, + "num_tokens": 946491782.0, + "step": 455 + }, + { + "epoch": 0.45555830651151275, + "grad_norm": 0.24293220043182373, + "learning_rate": 1.9582786877315097e-05, + "loss": 0.7737, + "mean_token_accuracy": 0.7810850977897644, + "num_tokens": 956901948.0, + "step": 460 + }, + { + "epoch": 0.460510027234464, + "grad_norm": 0.23179250955581665, + "learning_rate": 1.956921886822499e-05, + "loss": 0.7714, + "mean_token_accuracy": 0.7815377414226532, + "num_tokens": 967293443.0, + "step": 465 + }, + { + "epoch": 0.4654617479574152, + "grad_norm": 0.21295353770256042, + "learning_rate": 1.9555440273605573e-05, + "loss": 0.7856, + "mean_token_accuracy": 0.777508807182312, + "num_tokens": 977693050.0, + "step": 470 + }, + { + "epoch": 0.47041346868036643, + "grad_norm": 0.2277674376964569, + "learning_rate": 1.954145150390851e-05, + "loss": 0.7798, + 
"mean_token_accuracy": 0.7789700716733933, + "num_tokens": 988067281.0, + "step": 475 + }, + { + "epoch": 0.4753651894033177, + "grad_norm": 0.22468669712543488, + "learning_rate": 1.9527252975846385e-05, + "loss": 0.7745, + "mean_token_accuracy": 0.7800474315881729, + "num_tokens": 998455093.0, + "step": 480 + }, + { + "epoch": 0.48031691012626887, + "grad_norm": 0.2532201111316681, + "learning_rate": 1.9512845112380293e-05, + "loss": 0.7886, + "mean_token_accuracy": 0.7768169581890106, + "num_tokens": 1008880800.0, + "step": 485 + }, + { + "epoch": 0.4852686308492201, + "grad_norm": 0.24313411116600037, + "learning_rate": 1.9498228342707236e-05, + "loss": 0.7781, + "mean_token_accuracy": 0.7799170076847076, + "num_tokens": 1019275396.0, + "step": 490 + }, + { + "epoch": 0.4902203515721713, + "grad_norm": 0.24066568911075592, + "learning_rate": 1.948340310224734e-05, + "loss": 0.7864, + "mean_token_accuracy": 0.7775434225797653, + "num_tokens": 1029689644.0, + "step": 495 + }, + { + "epoch": 0.49517207229512256, + "grad_norm": 0.2265813648700714, + "learning_rate": 1.946836983263089e-05, + "loss": 0.7904, + "mean_token_accuracy": 0.7760259002447129, + "num_tokens": 1040080774.0, + "step": 500 + }, + { + "epoch": 0.5001237930180737, + "grad_norm": 0.22151488065719604, + "learning_rate": 1.945312898168516e-05, + "loss": 0.7578, + "mean_token_accuracy": 0.7849026411771775, + "num_tokens": 1050471944.0, + "step": 505 + }, + { + "epoch": 0.505075513741025, + "grad_norm": 0.28647103905677795, + "learning_rate": 1.9437681003421085e-05, + "loss": 0.7702, + "mean_token_accuracy": 0.7815856724977494, + "num_tokens": 1060865733.0, + "step": 510 + }, + { + "epoch": 0.5100272344639762, + "grad_norm": 0.2885425090789795, + "learning_rate": 1.942202635801973e-05, + "loss": 0.7734, + "mean_token_accuracy": 0.7801808267831802, + "num_tokens": 1071278092.0, + "step": 515 + }, + { + "epoch": 0.5149789551869275, + "grad_norm": 0.3188728094100952, + "learning_rate": 
1.940616551181859e-05, + "loss": 0.7874, + "mean_token_accuracy": 0.7791253894567489, + "num_tokens": 1081688765.0, + "step": 520 + }, + { + "epoch": 0.5199306759098787, + "grad_norm": 0.27370673418045044, + "learning_rate": 1.9390098937297685e-05, + "loss": 0.774, + "mean_token_accuracy": 0.7808292925357818, + "num_tokens": 1092085647.0, + "step": 525 + }, + { + "epoch": 0.5248823966328299, + "grad_norm": 0.19921670854091644, + "learning_rate": 1.9373827113065493e-05, + "loss": 0.7554, + "mean_token_accuracy": 0.7854314386844635, + "num_tokens": 1102479708.0, + "step": 530 + }, + { + "epoch": 0.5298341173557811, + "grad_norm": 0.23133447766304016, + "learning_rate": 1.93573505238447e-05, + "loss": 0.7686, + "mean_token_accuracy": 0.7816448330879211, + "num_tokens": 1112886556.0, + "step": 535 + }, + { + "epoch": 0.5347858380787324, + "grad_norm": 0.210884690284729, + "learning_rate": 1.934066966045774e-05, + "loss": 0.7708, + "mean_token_accuracy": 0.7807249516248703, + "num_tokens": 1123299992.0, + "step": 540 + }, + { + "epoch": 0.5397375588016836, + "grad_norm": 0.20094110071659088, + "learning_rate": 1.93237850198122e-05, + "loss": 0.7579, + "mean_token_accuracy": 0.7845065593719482, + "num_tokens": 1133664183.0, + "step": 545 + }, + { + "epoch": 0.5446892795246349, + "grad_norm": 0.21680229902267456, + "learning_rate": 1.9306697104885997e-05, + "loss": 0.776, + "mean_token_accuracy": 0.7794145315885543, + "num_tokens": 1144090055.0, + "step": 550 + }, + { + "epoch": 0.549641000247586, + "grad_norm": 0.3211749494075775, + "learning_rate": 1.92894064247124e-05, + "loss": 0.7896, + "mean_token_accuracy": 0.7807279646396637, + "num_tokens": 1154463475.0, + "step": 555 + }, + { + "epoch": 0.5545927209705372, + "grad_norm": 0.24195314943790436, + "learning_rate": 1.9271913494364874e-05, + "loss": 0.7807, + "mean_token_accuracy": 0.7783623158931732, + "num_tokens": 1164866644.0, + "step": 560 + }, + { + "epoch": 0.5595444416934885, + "grad_norm": 0.4342971444129944, 
+ "learning_rate": 1.9254218834941727e-05, + "loss": 0.7838, + "mean_token_accuracy": 0.7803391963243484, + "num_tokens": 1175260483.0, + "step": 565 + }, + { + "epoch": 0.5644961624164397, + "grad_norm": 0.6932039856910706, + "learning_rate": 1.923632297355059e-05, + "loss": 0.7772, + "mean_token_accuracy": 0.7796769648790359, + "num_tokens": 1185656569.0, + "step": 570 + }, + { + "epoch": 0.569447883139391, + "grad_norm": 0.5484419465065002, + "learning_rate": 1.9218226443292717e-05, + "loss": 0.7733, + "mean_token_accuracy": 0.780466890335083, + "num_tokens": 1196077796.0, + "step": 575 + }, + { + "epoch": 0.5743996038623421, + "grad_norm": 3.0035197734832764, + "learning_rate": 1.9199929783247106e-05, + "loss": 0.7854, + "mean_token_accuracy": 0.78033567070961, + "num_tokens": 1206462746.0, + "step": 580 + }, + { + "epoch": 0.5793513245852934, + "grad_norm": 0.7452694773674011, + "learning_rate": 1.918143353845443e-05, + "loss": 0.7699, + "mean_token_accuracy": 0.7816843211650848, + "num_tokens": 1216838456.0, + "step": 585 + }, + { + "epoch": 0.5843030453082446, + "grad_norm": 0.4381994307041168, + "learning_rate": 1.9162738259900817e-05, + "loss": 0.766, + "mean_token_accuracy": 0.7822736114263534, + "num_tokens": 1227242988.0, + "step": 590 + }, + { + "epoch": 0.5892547660311959, + "grad_norm": 1.9659717082977295, + "learning_rate": 1.9143844504501417e-05, + "loss": 0.7787, + "mean_token_accuracy": 0.779282808303833, + "num_tokens": 1237636071.0, + "step": 595 + }, + { + "epoch": 0.5942064867541471, + "grad_norm": 0.2631269693374634, + "learning_rate": 1.912475283508384e-05, + "loss": 0.776, + "mean_token_accuracy": 0.7794870853424072, + "num_tokens": 1248057314.0, + "step": 600 + }, + { + "epoch": 0.5991582074770982, + "grad_norm": 0.429813414812088, + "learning_rate": 1.9105463820371346e-05, + "loss": 0.7617, + "mean_token_accuracy": 0.7824564576148987, + "num_tokens": 1258457239.0, + "step": 605 + }, + { + "epoch": 0.6041099282000495, + "grad_norm": 
0.4058431386947632, + "learning_rate": 1.9085978034965957e-05, + "loss": 0.7665, + "mean_token_accuracy": 0.7821383327245712, + "num_tokens": 1268873042.0, + "step": 610 + }, + { + "epoch": 0.6090616489230007, + "grad_norm": 0.2880268692970276, + "learning_rate": 1.9066296059331297e-05, + "loss": 0.7774, + "mean_token_accuracy": 0.7790426641702652, + "num_tokens": 1279256133.0, + "step": 615 + }, + { + "epoch": 0.614013369645952, + "grad_norm": 0.28571274876594543, + "learning_rate": 1.904641847977532e-05, + "loss": 0.7649, + "mean_token_accuracy": 0.7826848417520523, + "num_tokens": 1289654801.0, + "step": 620 + }, + { + "epoch": 0.6189650903689032, + "grad_norm": 0.2443043738603592, + "learning_rate": 1.902634588843285e-05, + "loss": 0.7845, + "mean_token_accuracy": 0.7772348463535309, + "num_tokens": 1300049426.0, + "step": 625 + }, + { + "epoch": 0.6239168110918544, + "grad_norm": 0.2589752674102783, + "learning_rate": 1.9006078883247913e-05, + "loss": 0.7682, + "mean_token_accuracy": 0.7823996514081955, + "num_tokens": 1310479480.0, + "step": 630 + }, + { + "epoch": 0.6288685318148056, + "grad_norm": 0.21809199452400208, + "learning_rate": 1.898561806795596e-05, + "loss": 0.7831, + "mean_token_accuracy": 0.7773066818714142, + "num_tokens": 1320885725.0, + "step": 635 + }, + { + "epoch": 0.6338202525377569, + "grad_norm": 0.24305787682533264, + "learning_rate": 1.8964964052065865e-05, + "loss": 0.7689, + "mean_token_accuracy": 0.781117245554924, + "num_tokens": 1331277887.0, + "step": 640 + }, + { + "epoch": 0.6387719732607081, + "grad_norm": 0.2178671658039093, + "learning_rate": 1.894411745084177e-05, + "loss": 0.7737, + "mean_token_accuracy": 0.7797011733055115, + "num_tokens": 1341663949.0, + "step": 645 + }, + { + "epoch": 0.6437236939836594, + "grad_norm": 0.21718356013298035, + "learning_rate": 1.892307888528475e-05, + "loss": 0.7599, + "mean_token_accuracy": 0.7836619168519974, + "num_tokens": 1352058279.0, + "step": 650 + }, + { + "epoch": 
0.6486754147066105, + "grad_norm": 0.20011211931705475, + "learning_rate": 1.890184898211433e-05, + "loss": 0.7621, + "mean_token_accuracy": 0.7829108744859695, + "num_tokens": 1362482075.0, + "step": 655 + }, + { + "epoch": 0.6536271354295617, + "grad_norm": 0.21650190651416779, + "learning_rate": 1.88804283737498e-05, + "loss": 0.7757, + "mean_token_accuracy": 0.7795401126146316, + "num_tokens": 1372901259.0, + "step": 660 + }, + { + "epoch": 0.658578856152513, + "grad_norm": 0.19840224087238312, + "learning_rate": 1.8858817698291386e-05, + "loss": 0.7673, + "mean_token_accuracy": 0.7813610762357712, + "num_tokens": 1383315389.0, + "step": 665 + }, + { + "epoch": 0.6635305768754642, + "grad_norm": 0.20623134076595306, + "learning_rate": 1.883701759950124e-05, + "loss": 0.7665, + "mean_token_accuracy": 0.7813021749258041, + "num_tokens": 1393726215.0, + "step": 670 + }, + { + "epoch": 0.6684822975984155, + "grad_norm": 0.199232816696167, + "learning_rate": 1.881502872678425e-05, + "loss": 0.7648, + "mean_token_accuracy": 0.782157027721405, + "num_tokens": 1404148209.0, + "step": 675 + }, + { + "epoch": 0.6734340183213666, + "grad_norm": 0.1855035424232483, + "learning_rate": 1.879285173516872e-05, + "loss": 0.7603, + "mean_token_accuracy": 0.78330679833889, + "num_tokens": 1414573560.0, + "step": 680 + }, + { + "epoch": 0.6783857390443179, + "grad_norm": 0.2409905195236206, + "learning_rate": 1.877048728528684e-05, + "loss": 0.7887, + "mean_token_accuracy": 0.7781977474689483, + "num_tokens": 1424968590.0, + "step": 685 + }, + { + "epoch": 0.6833374597672691, + "grad_norm": 0.2532198131084442, + "learning_rate": 1.8747936043354994e-05, + "loss": 0.7678, + "mean_token_accuracy": 0.7814545810222626, + "num_tokens": 1435377805.0, + "step": 690 + }, + { + "epoch": 0.6882891804902204, + "grad_norm": 0.2087397277355194, + "learning_rate": 1.8725198681153955e-05, + "loss": 0.7627, + "mean_token_accuracy": 0.7825340062379837, + "num_tokens": 1445800164.0, + "step": 695 + 
}, + { + "epoch": 0.6932409012131716, + "grad_norm": 0.20391027629375458, + "learning_rate": 1.8702275876008823e-05, + "loss": 0.7574, + "mean_token_accuracy": 0.783563169836998, + "num_tokens": 1456203460.0, + "step": 700 + }, + { + "epoch": 0.6981926219361227, + "grad_norm": 0.21374066174030304, + "learning_rate": 1.867916831076889e-05, + "loss": 0.7737, + "mean_token_accuracy": 0.7793075799942016, + "num_tokens": 1466617339.0, + "step": 705 + }, + { + "epoch": 0.703144342659074, + "grad_norm": 0.45011261105537415, + "learning_rate": 1.8655876673787272e-05, + "loss": 0.7534, + "mean_token_accuracy": 0.7848767697811126, + "num_tokens": 1477041742.0, + "step": 710 + }, + { + "epoch": 0.7080960633820252, + "grad_norm": 0.19761380553245544, + "learning_rate": 1.8632401658900415e-05, + "loss": 0.7583, + "mean_token_accuracy": 0.783224281668663, + "num_tokens": 1487468293.0, + "step": 715 + }, + { + "epoch": 0.7130477841049765, + "grad_norm": 0.17496533691883087, + "learning_rate": 1.860874396540743e-05, + "loss": 0.7567, + "mean_token_accuracy": 0.7842571973800659, + "num_tokens": 1497855239.0, + "step": 720 + }, + { + "epoch": 0.7179995048279277, + "grad_norm": 0.19634433090686798, + "learning_rate": 1.8584904298049244e-05, + "loss": 0.7628, + "mean_token_accuracy": 0.7824541956186295, + "num_tokens": 1508239301.0, + "step": 725 + }, + { + "epoch": 0.722951225550879, + "grad_norm": 0.19373337924480438, + "learning_rate": 1.8560883366987633e-05, + "loss": 0.759, + "mean_token_accuracy": 0.7833509474992753, + "num_tokens": 1518639882.0, + "step": 730 + }, + { + "epoch": 0.7279029462738301, + "grad_norm": 0.18676340579986572, + "learning_rate": 1.8536681887784037e-05, + "loss": 0.7512, + "mean_token_accuracy": 0.7852223426103592, + "num_tokens": 1529056777.0, + "step": 735 + }, + { + "epoch": 0.7328546669967814, + "grad_norm": 0.20926012098789215, + "learning_rate": 1.8512300581378267e-05, + "loss": 0.7502, + "mean_token_accuracy": 0.7857935458421708, + "num_tokens": 
1539442858.0, + "step": 740 + }, + { + "epoch": 0.7378063877197326, + "grad_norm": 0.1997007578611374, + "learning_rate": 1.848774017406703e-05, + "loss": 0.7557, + "mean_token_accuracy": 0.7842215925455094, + "num_tokens": 1549852249.0, + "step": 745 + }, + { + "epoch": 0.7427581084426839, + "grad_norm": 0.19928854703903198, + "learning_rate": 1.8463001397482257e-05, + "loss": 0.7481, + "mean_token_accuracy": 0.7860120952129364, + "num_tokens": 1560241267.0, + "step": 750 + }, + { + "epoch": 0.7477098291656351, + "grad_norm": 0.19554533064365387, + "learning_rate": 1.843808498856937e-05, + "loss": 0.7583, + "mean_token_accuracy": 0.7831324756145477, + "num_tokens": 1570635303.0, + "step": 755 + }, + { + "epoch": 0.7526615498885862, + "grad_norm": 0.1927034854888916, + "learning_rate": 1.8412991689565264e-05, + "loss": 0.7616, + "mean_token_accuracy": 0.7826423794031143, + "num_tokens": 1581053637.0, + "step": 760 + }, + { + "epoch": 0.7576132706115375, + "grad_norm": 0.20213893055915833, + "learning_rate": 1.8387722247976248e-05, + "loss": 0.7664, + "mean_token_accuracy": 0.7809010535478592, + "num_tokens": 1591457543.0, + "step": 765 + }, + { + "epoch": 0.7625649913344887, + "grad_norm": 0.1917760670185089, + "learning_rate": 1.836227741655575e-05, + "loss": 0.7554, + "mean_token_accuracy": 0.7842649966478348, + "num_tokens": 1601863294.0, + "step": 770 + }, + { + "epoch": 0.76751671205744, + "grad_norm": 0.20989423990249634, + "learning_rate": 1.8336657953281896e-05, + "loss": 0.7615, + "mean_token_accuracy": 0.78232641518116, + "num_tokens": 1612243148.0, + "step": 775 + }, + { + "epoch": 0.7724684327803912, + "grad_norm": 0.20752640068531036, + "learning_rate": 1.8310864621334936e-05, + "loss": 0.768, + "mean_token_accuracy": 0.781131848692894, + "num_tokens": 1622648438.0, + "step": 780 + }, + { + "epoch": 0.7774201535033424, + "grad_norm": 0.6338415145874023, + "learning_rate": 1.8284898189074514e-05, + "loss": 0.753, + "mean_token_accuracy": 
0.784282636642456, + "num_tokens": 1633053857.0, + "step": 785 + }, + { + "epoch": 0.7823718742262936, + "grad_norm": 0.22199805080890656, + "learning_rate": 1.8258759430016767e-05, + "loss": 0.7636, + "mean_token_accuracy": 0.7832720100879669, + "num_tokens": 1643462725.0, + "step": 790 + }, + { + "epoch": 0.7873235949492449, + "grad_norm": 0.19597461819648743, + "learning_rate": 1.8232449122811293e-05, + "loss": 0.7557, + "mean_token_accuracy": 0.7838189631700516, + "num_tokens": 1653883349.0, + "step": 795 + }, + { + "epoch": 0.7922753156721961, + "grad_norm": 0.2407362461090088, + "learning_rate": 1.8205968051217945e-05, + "loss": 0.7606, + "mean_token_accuracy": 0.7830138593912125, + "num_tokens": 1664275959.0, + "step": 800 + }, + { + "epoch": 0.7972270363951474, + "grad_norm": 0.23943914473056793, + "learning_rate": 1.8179317004083495e-05, + "loss": 0.7651, + "mean_token_accuracy": 0.7815976798534393, + "num_tokens": 1674671126.0, + "step": 805 + }, + { + "epoch": 0.8021787571180985, + "grad_norm": 0.25406405329704285, + "learning_rate": 1.815249677531813e-05, + "loss": 0.7547, + "mean_token_accuracy": 0.7842951714992523, + "num_tokens": 1685037213.0, + "step": 810 + }, + { + "epoch": 0.8071304778410497, + "grad_norm": 0.7270509600639343, + "learning_rate": 1.8125508163871798e-05, + "loss": 0.7583, + "mean_token_accuracy": 0.7827031821012497, + "num_tokens": 1695458364.0, + "step": 815 + }, + { + "epoch": 0.812082198564001, + "grad_norm": 0.2561294436454773, + "learning_rate": 1.809835197371042e-05, + "loss": 0.7696, + "mean_token_accuracy": 0.7809718787670136, + "num_tokens": 1705848122.0, + "step": 820 + }, + { + "epoch": 0.8170339192869522, + "grad_norm": 0.23471757769584656, + "learning_rate": 1.807102901379193e-05, + "loss": 0.743, + "mean_token_accuracy": 0.7874339818954468, + "num_tokens": 1716242283.0, + "step": 825 + }, + { + "epoch": 0.8219856400099035, + "grad_norm": 0.19936513900756836, + "learning_rate": 1.804354009804217e-05, + "loss": 0.7516, 
+ "mean_token_accuracy": 0.7847243756055832, + "num_tokens": 1726606051.0, + "step": 830 + }, + { + "epoch": 0.8269373607328546, + "grad_norm": 0.23077034950256348, + "learning_rate": 1.801588604533067e-05, + "loss": 0.7478, + "mean_token_accuracy": 0.7857007443904876, + "num_tokens": 1736984854.0, + "step": 835 + }, + { + "epoch": 0.8318890814558059, + "grad_norm": 0.27735021710395813, + "learning_rate": 1.7988067679446223e-05, + "loss": 0.7542, + "mean_token_accuracy": 0.7848597913980484, + "num_tokens": 1747365413.0, + "step": 840 + }, + { + "epoch": 0.8368408021787571, + "grad_norm": 0.19908802211284637, + "learning_rate": 1.7960085829072373e-05, + "loss": 0.7485, + "mean_token_accuracy": 0.7859860509634018, + "num_tokens": 1757782617.0, + "step": 845 + }, + { + "epoch": 0.8417925229017084, + "grad_norm": 0.19668538868427277, + "learning_rate": 1.7931941327762708e-05, + "loss": 0.7563, + "mean_token_accuracy": 0.7834689766168594, + "num_tokens": 1768140236.0, + "step": 850 + }, + { + "epoch": 0.8467442436246596, + "grad_norm": 0.2182615101337433, + "learning_rate": 1.790363501391604e-05, + "loss": 0.7564, + "mean_token_accuracy": 0.7837036728858948, + "num_tokens": 1778501788.0, + "step": 855 + }, + { + "epoch": 0.8516959643476107, + "grad_norm": 1.2455408573150635, + "learning_rate": 1.7875167730751432e-05, + "loss": 0.7495, + "mean_token_accuracy": 0.7853644698858261, + "num_tokens": 1788895036.0, + "step": 860 + }, + { + "epoch": 0.856647685070562, + "grad_norm": 0.22847770154476166, + "learning_rate": 1.7846540326283067e-05, + "loss": 0.7466, + "mean_token_accuracy": 0.786415946483612, + "num_tokens": 1799277517.0, + "step": 865 + }, + { + "epoch": 0.8615994057935132, + "grad_norm": 0.22231429815292358, + "learning_rate": 1.7817753653295e-05, + "loss": 0.748, + "mean_token_accuracy": 0.7856645524501801, + "num_tokens": 1809703236.0, + "step": 870 + }, + { + "epoch": 0.8665511265164645, + "grad_norm": 0.9592680931091309, + "learning_rate": 
1.7788808569315753e-05, + "loss": 0.7503, + "mean_token_accuracy": 0.7850320547819137, + "num_tokens": 1820112331.0, + "step": 875 + }, + { + "epoch": 0.8715028472394157, + "grad_norm": 0.2056133896112442, + "learning_rate": 1.775970593659275e-05, + "loss": 0.7597, + "mean_token_accuracy": 0.7830165416002274, + "num_tokens": 1830487579.0, + "step": 880 + }, + { + "epoch": 0.8764545679623669, + "grad_norm": 0.29589369893074036, + "learning_rate": 1.7730446622066663e-05, + "loss": 0.7633, + "mean_token_accuracy": 0.7831792920827866, + "num_tokens": 1840864368.0, + "step": 885 + }, + { + "epoch": 0.8814062886853181, + "grad_norm": 0.23890021443367004, + "learning_rate": 1.7701031497345563e-05, + "loss": 0.751, + "mean_token_accuracy": 0.7846818000078202, + "num_tokens": 1851268551.0, + "step": 890 + }, + { + "epoch": 0.8863580094082694, + "grad_norm": 0.20969191193580627, + "learning_rate": 1.7671461438678968e-05, + "loss": 0.7579, + "mean_token_accuracy": 0.7834391385316849, + "num_tokens": 1861629010.0, + "step": 895 + }, + { + "epoch": 0.8913097301312206, + "grad_norm": 0.3054754436016083, + "learning_rate": 1.7641737326931735e-05, + "loss": 0.7774, + "mean_token_accuracy": 0.7797193497419357, + "num_tokens": 1872020332.0, + "step": 900 + }, + { + "epoch": 0.8962614508541719, + "grad_norm": 0.2975171208381653, + "learning_rate": 1.7611860047557817e-05, + "loss": 0.7565, + "mean_token_accuracy": 0.7843641757965087, + "num_tokens": 1882408874.0, + "step": 905 + }, + { + "epoch": 0.901213171577123, + "grad_norm": 0.5816043019294739, + "learning_rate": 1.7581830490573887e-05, + "loss": 0.7507, + "mean_token_accuracy": 0.7849104583263398, + "num_tokens": 1892771965.0, + "step": 910 + }, + { + "epoch": 0.9061648923000742, + "grad_norm": 1.1918472051620483, + "learning_rate": 1.7551649550532842e-05, + "loss": 0.7535, + "mean_token_accuracy": 0.7841880857944489, + "num_tokens": 1903131329.0, + "step": 915 + }, + { + "epoch": 0.9111166130230255, + "grad_norm": 
0.21812961995601654, + "learning_rate": 1.7521318126497124e-05, + "loss": 0.7646, + "mean_token_accuracy": 0.7810812026262284, + "num_tokens": 1913531682.0, + "step": 920 + }, + { + "epoch": 0.9160683337459767, + "grad_norm": 0.18054869771003723, + "learning_rate": 1.7490837122011965e-05, + "loss": 0.7648, + "mean_token_accuracy": 0.7809185832738876, + "num_tokens": 1923930881.0, + "step": 925 + }, + { + "epoch": 0.921020054468928, + "grad_norm": 0.2776661813259125, + "learning_rate": 1.7460207445078458e-05, + "loss": 0.7598, + "mean_token_accuracy": 0.7860252737998963, + "num_tokens": 1934361093.0, + "step": 930 + }, + { + "epoch": 0.9259717751918792, + "grad_norm": 0.4657640755176544, + "learning_rate": 1.7429430008126517e-05, + "loss": 0.78, + "mean_token_accuracy": 0.7799131125211716, + "num_tokens": 1944750850.0, + "step": 935 + }, + { + "epoch": 0.9309234959148304, + "grad_norm": 0.47089236974716187, + "learning_rate": 1.739850572798768e-05, + "loss": 0.7609, + "mean_token_accuracy": 0.783119586110115, + "num_tokens": 1955154134.0, + "step": 940 + }, + { + "epoch": 0.9358752166377816, + "grad_norm": 0.20868314802646637, + "learning_rate": 1.736743552586782e-05, + "loss": 0.7449, + "mean_token_accuracy": 0.7862414479255676, + "num_tokens": 1965547370.0, + "step": 945 + }, + { + "epoch": 0.9408269373607329, + "grad_norm": 0.2700161337852478, + "learning_rate": 1.733622032731968e-05, + "loss": 0.7501, + "mean_token_accuracy": 0.7854364097118378, + "num_tokens": 1975948544.0, + "step": 950 + }, + { + "epoch": 0.9457786580836841, + "grad_norm": 0.25757983326911926, + "learning_rate": 1.7304861062215326e-05, + "loss": 0.7605, + "mean_token_accuracy": 0.7821576863527298, + "num_tokens": 1986371366.0, + "step": 955 + }, + { + "epoch": 0.9507303788066354, + "grad_norm": 0.2574819326400757, + "learning_rate": 1.727335866471842e-05, + "loss": 0.7684, + "mean_token_accuracy": 0.7806459426879883, + "num_tokens": 1996738288.0, + "step": 960 + }, + { + "epoch": 
0.9556820995295865, + "grad_norm": 0.22203104197978973, + "learning_rate": 1.724171407325642e-05, + "loss": 0.7546, + "mean_token_accuracy": 0.786052080988884, + "num_tokens": 2007140915.0, + "step": 965 + }, + { + "epoch": 0.9606338202525377, + "grad_norm": 0.23472048342227936, + "learning_rate": 1.7209928230492606e-05, + "loss": 0.7517, + "mean_token_accuracy": 0.7842936605215073, + "num_tokens": 2017561316.0, + "step": 970 + }, + { + "epoch": 0.965585540975489, + "grad_norm": 0.36148348450660706, + "learning_rate": 1.7178002083298005e-05, + "loss": 0.7549, + "mean_token_accuracy": 0.7834816932678222, + "num_tokens": 2027964049.0, + "step": 975 + }, + { + "epoch": 0.9705372616984402, + "grad_norm": 0.19759418070316315, + "learning_rate": 1.714593658272318e-05, + "loss": 0.7725, + "mean_token_accuracy": 0.7829167902469635, + "num_tokens": 2038362855.0, + "step": 980 + }, + { + "epoch": 0.9754889824213915, + "grad_norm": 0.21052470803260803, + "learning_rate": 1.7113732683969905e-05, + "loss": 0.7633, + "mean_token_accuracy": 0.7814028680324554, + "num_tokens": 2048772898.0, + "step": 985 + }, + { + "epoch": 0.9804407031443426, + "grad_norm": 0.31858742237091064, + "learning_rate": 1.7081391346362717e-05, + "loss": 0.7624, + "mean_token_accuracy": 0.7836154013872146, + "num_tokens": 2059179537.0, + "step": 990 + }, + { + "epoch": 0.9853924238672939, + "grad_norm": 0.19801688194274902, + "learning_rate": 1.7048913533320307e-05, + "loss": 0.7295, + "mean_token_accuracy": 0.7904259711503983, + "num_tokens": 2069588587.0, + "step": 995 + }, + { + "epoch": 0.9903441445902451, + "grad_norm": 0.2109430432319641, + "learning_rate": 1.7016300212326875e-05, + "loss": 0.7482, + "mean_token_accuracy": 0.7853594183921814, + "num_tokens": 2080010530.0, + "step": 1000 + }, + { + "epoch": 0.9952958653131964, + "grad_norm": 0.2018500715494156, + "learning_rate": 1.6983552354903257e-05, + "loss": 0.7622, + "mean_token_accuracy": 0.7815503656864167, + "num_tokens": 2090415532.0, + 
"step": 1005 + }, + { + "epoch": 1.0, + "grad_norm": 0.2757660448551178, + "learning_rate": 1.6950670936578013e-05, + "loss": 0.7279, + "mean_token_accuracy": 0.7902535639311138, + "num_tokens": 2100284200.0, + "step": 1010 + }, + { + "epoch": 1.0049517207229512, + "grad_norm": 0.22684067487716675, + "learning_rate": 1.6917656936858347e-05, + "loss": 0.7505, + "mean_token_accuracy": 0.7881647497415543, + "num_tokens": 2110682669.0, + "step": 1015 + }, + { + "epoch": 1.0099034414459025, + "grad_norm": 0.19842083752155304, + "learning_rate": 1.688451133920097e-05, + "loss": 0.7352, + "mean_token_accuracy": 0.787742218375206, + "num_tokens": 2121073690.0, + "step": 1020 + }, + { + "epoch": 1.0148551621688537, + "grad_norm": 0.637569785118103, + "learning_rate": 1.6851235130982737e-05, + "loss": 0.7309, + "mean_token_accuracy": 0.7883238166570663, + "num_tokens": 2131473799.0, + "step": 1025 + }, + { + "epoch": 1.019806882891805, + "grad_norm": 0.19143810868263245, + "learning_rate": 1.68178293034713e-05, + "loss": 0.7227, + "mean_token_accuracy": 0.7909730762243271, + "num_tokens": 2141863864.0, + "step": 1030 + }, + { + "epoch": 1.0247586036147562, + "grad_norm": 0.20483145117759705, + "learning_rate": 1.6784294851795544e-05, + "loss": 0.7404, + "mean_token_accuracy": 0.786355146765709, + "num_tokens": 2152272201.0, + "step": 1035 + }, + { + "epoch": 1.0297103243377073, + "grad_norm": 0.22370055317878723, + "learning_rate": 1.675063277491594e-05, + "loss": 0.7345, + "mean_token_accuracy": 0.7874440640211106, + "num_tokens": 2162687256.0, + "step": 1040 + }, + { + "epoch": 1.0346620450606585, + "grad_norm": 0.18761159479618073, + "learning_rate": 1.671684407559481e-05, + "loss": 0.7396, + "mean_token_accuracy": 0.788395956158638, + "num_tokens": 2173083569.0, + "step": 1045 + }, + { + "epoch": 1.0396137657836098, + "grad_norm": 0.6076595783233643, + "learning_rate": 1.6682929760366425e-05, + "loss": 0.7257, + "mean_token_accuracy": 0.7897282272577286, + "num_tokens": 
2183435840.0, + "step": 1050 + }, + { + "epoch": 1.044565486506561, + "grad_norm": 0.20622926950454712, + "learning_rate": 1.6648890839507058e-05, + "loss": 0.7346, + "mean_token_accuracy": 0.7876258313655853, + "num_tokens": 2193808006.0, + "step": 1055 + }, + { + "epoch": 1.0495172072295123, + "grad_norm": 0.1940770000219345, + "learning_rate": 1.661472832700485e-05, + "loss": 0.7258, + "mean_token_accuracy": 0.789948183298111, + "num_tokens": 2204182448.0, + "step": 1060 + }, + { + "epoch": 1.0544689279524635, + "grad_norm": 0.18651843070983887, + "learning_rate": 1.658044324052964e-05, + "loss": 0.7237, + "mean_token_accuracy": 0.7907967299222947, + "num_tokens": 2214609200.0, + "step": 1065 + }, + { + "epoch": 1.0594206486754147, + "grad_norm": 0.19266949594020844, + "learning_rate": 1.6546036601402628e-05, + "loss": 0.7305, + "mean_token_accuracy": 0.7886613547801972, + "num_tokens": 2224999377.0, + "step": 1070 + }, + { + "epoch": 1.064372369398366, + "grad_norm": 0.1916077733039856, + "learning_rate": 1.6511509434565948e-05, + "loss": 0.7208, + "mean_token_accuracy": 0.791583576798439, + "num_tokens": 2235407967.0, + "step": 1075 + }, + { + "epoch": 1.0693240901213172, + "grad_norm": 0.19016622006893158, + "learning_rate": 1.647686276855215e-05, + "loss": 0.7281, + "mean_token_accuracy": 0.7892949759960175, + "num_tokens": 2245796359.0, + "step": 1080 + }, + { + "epoch": 1.0742758108442685, + "grad_norm": 0.19595059752464294, + "learning_rate": 1.6442097635453558e-05, + "loss": 0.767, + "mean_token_accuracy": 0.7892323851585388, + "num_tokens": 2256165537.0, + "step": 1085 + }, + { + "epoch": 1.0792275315672195, + "grad_norm": 0.2070004940032959, + "learning_rate": 1.6407215070891513e-05, + "loss": 0.7254, + "mean_token_accuracy": 0.7902043342590332, + "num_tokens": 2266536896.0, + "step": 1090 + }, + { + "epoch": 1.0841792522901708, + "grad_norm": 0.20020951330661774, + "learning_rate": 1.6372216113985538e-05, + "loss": 0.7315, + "mean_token_accuracy": 
0.7884757041931152, + "num_tokens": 2276919634.0, + "step": 1095 + }, + { + "epoch": 1.089130973013122, + "grad_norm": 0.20975516736507416, + "learning_rate": 1.633710180732237e-05, + "loss": 0.719, + "mean_token_accuracy": 0.7908209532499313, + "num_tokens": 2287296356.0, + "step": 1100 + }, + { + "epoch": 1.0940826937360733, + "grad_norm": 0.18807543814182281, + "learning_rate": 1.630187319692492e-05, + "loss": 0.7315, + "mean_token_accuracy": 0.7885193258523941, + "num_tokens": 2297692580.0, + "step": 1105 + }, + { + "epoch": 1.0990344144590245, + "grad_norm": 0.21502210199832916, + "learning_rate": 1.6266531332221097e-05, + "loss": 0.7279, + "mean_token_accuracy": 0.7890672951936721, + "num_tokens": 2308090705.0, + "step": 1110 + }, + { + "epoch": 1.1039861351819757, + "grad_norm": 0.1803969144821167, + "learning_rate": 1.6231077266012545e-05, + "loss": 0.7281, + "mean_token_accuracy": 0.7891331762075424, + "num_tokens": 2318497722.0, + "step": 1115 + }, + { + "epoch": 1.108937855904927, + "grad_norm": 0.18801762163639069, + "learning_rate": 1.6195512054443294e-05, + "loss": 0.7226, + "mean_token_accuracy": 0.7904093325138092, + "num_tokens": 2328861172.0, + "step": 1120 + }, + { + "epoch": 1.1138895766278782, + "grad_norm": 0.20595437288284302, + "learning_rate": 1.6159836756968296e-05, + "loss": 0.7325, + "mean_token_accuracy": 0.787862503528595, + "num_tokens": 2339258549.0, + "step": 1125 + }, + { + "epoch": 1.1188412973508295, + "grad_norm": 0.20336276292800903, + "learning_rate": 1.6124052436321846e-05, + "loss": 0.7289, + "mean_token_accuracy": 0.7888852953910828, + "num_tokens": 2349685068.0, + "step": 1130 + }, + { + "epoch": 1.1237930180737807, + "grad_norm": 0.198221817612648, + "learning_rate": 1.608816015848596e-05, + "loss": 0.7249, + "mean_token_accuracy": 0.7904265612363816, + "num_tokens": 2360095914.0, + "step": 1135 + }, + { + "epoch": 1.128744738796732, + "grad_norm": 0.19925156235694885, + "learning_rate": 1.6052160992658584e-05, + "loss": 
0.726, + "mean_token_accuracy": 0.7895150870084763, + "num_tokens": 2370491516.0, + "step": 1140 + }, + { + "epoch": 1.133696459519683, + "grad_norm": 0.2183285653591156, + "learning_rate": 1.6016056011221764e-05, + "loss": 0.7223, + "mean_token_accuracy": 0.7908050715923309, + "num_tokens": 2380881645.0, + "step": 1145 + }, + { + "epoch": 1.1386481802426343, + "grad_norm": 0.23983266949653625, + "learning_rate": 1.5979846289709695e-05, + "loss": 0.7261, + "mean_token_accuracy": 0.7894744217395783, + "num_tokens": 2391278281.0, + "step": 1150 + }, + { + "epoch": 1.1435999009655855, + "grad_norm": 0.18753401935100555, + "learning_rate": 1.5943532906776683e-05, + "loss": 0.7185, + "mean_token_accuracy": 0.7921951532363891, + "num_tokens": 2401703056.0, + "step": 1155 + }, + { + "epoch": 1.1485516216885368, + "grad_norm": 0.20767702162265778, + "learning_rate": 1.5907116944165017e-05, + "loss": 0.7294, + "mean_token_accuracy": 0.7886975765228271, + "num_tokens": 2412114255.0, + "step": 1160 + }, + { + "epoch": 1.153503342411488, + "grad_norm": 0.2048075795173645, + "learning_rate": 1.5870599486672725e-05, + "loss": 0.7298, + "mean_token_accuracy": 0.7886143833398819, + "num_tokens": 2422532845.0, + "step": 1165 + }, + { + "epoch": 1.1584550631344392, + "grad_norm": 0.20132844150066376, + "learning_rate": 1.583398162212129e-05, + "loss": 0.723, + "mean_token_accuracy": 0.7906926244497299, + "num_tokens": 2432907783.0, + "step": 1170 + }, + { + "epoch": 1.1634067838573905, + "grad_norm": 0.24149557948112488, + "learning_rate": 1.5797264441323227e-05, + "loss": 0.7279, + "mean_token_accuracy": 0.7897651940584183, + "num_tokens": 2443315467.0, + "step": 1175 + }, + { + "epoch": 1.1683585045803417, + "grad_norm": 0.22159512341022491, + "learning_rate": 1.576044903804958e-05, + "loss": 0.7292, + "mean_token_accuracy": 0.7884950757026672, + "num_tokens": 2453711162.0, + "step": 1180 + }, + { + "epoch": 1.173310225303293, + "grad_norm": 0.20547987520694733, + "learning_rate": 
1.572353650899737e-05, + "loss": 0.7184, + "mean_token_accuracy": 0.7916571944952011, + "num_tokens": 2464145240.0, + "step": 1185 + }, + { + "epoch": 1.178261946026244, + "grad_norm": 0.21845729649066925, + "learning_rate": 1.568652795375688e-05, + "loss": 0.7266, + "mean_token_accuracy": 0.7894676387310028, + "num_tokens": 2474536824.0, + "step": 1190 + }, + { + "epoch": 1.1832136667491953, + "grad_norm": 0.20706267654895782, + "learning_rate": 1.5649424474778943e-05, + "loss": 0.7465, + "mean_token_accuracy": 0.7838758409023285, + "num_tokens": 2484953507.0, + "step": 1195 + }, + { + "epoch": 1.1881653874721465, + "grad_norm": 0.20287451148033142, + "learning_rate": 1.5612227177342075e-05, + "loss": 0.7305, + "mean_token_accuracy": 0.788197448849678, + "num_tokens": 2495362940.0, + "step": 1200 + }, + { + "epoch": 1.1931171081950978, + "grad_norm": 0.19987879693508148, + "learning_rate": 1.5574937169519567e-05, + "loss": 0.7227, + "mean_token_accuracy": 0.7905563831329345, + "num_tokens": 2505783339.0, + "step": 1205 + }, + { + "epoch": 1.198068828918049, + "grad_norm": 0.20694401860237122, + "learning_rate": 1.5537555562146455e-05, + "loss": 0.7249, + "mean_token_accuracy": 0.7901411831378937, + "num_tokens": 2516210266.0, + "step": 1210 + }, + { + "epoch": 1.2030205496410002, + "grad_norm": 0.19224272668361664, + "learning_rate": 1.5500083468786452e-05, + "loss": 0.7273, + "mean_token_accuracy": 0.7893536627292633, + "num_tokens": 2526623484.0, + "step": 1215 + }, + { + "epoch": 1.2079722703639515, + "grad_norm": 0.2035578191280365, + "learning_rate": 1.546252200569875e-05, + "loss": 0.7284, + "mean_token_accuracy": 0.7887589871883393, + "num_tokens": 2537014982.0, + "step": 1220 + }, + { + "epoch": 1.2129239910869027, + "grad_norm": 1.5885528326034546, + "learning_rate": 1.5424872291804807e-05, + "loss": 0.7253, + "mean_token_accuracy": 0.792636951804161, + "num_tokens": 2547427138.0, + "step": 1225 + }, + { + "epoch": 1.217875711809854, + "grad_norm": 
0.18842822313308716, + "learning_rate": 1.5387135448654968e-05, + "loss": 0.7292, + "mean_token_accuracy": 0.7887994408607483, + "num_tokens": 2557828037.0, + "step": 1230 + }, + { + "epoch": 1.2228274325328052, + "grad_norm": 0.19046075642108917, + "learning_rate": 1.534931260039509e-05, + "loss": 0.7311, + "mean_token_accuracy": 0.7879103571176529, + "num_tokens": 2568241802.0, + "step": 1235 + }, + { + "epoch": 1.2277791532557565, + "grad_norm": 0.18367241322994232, + "learning_rate": 1.5311404873733043e-05, + "loss": 0.7258, + "mean_token_accuracy": 0.7896815687417984, + "num_tokens": 2578599008.0, + "step": 1240 + }, + { + "epoch": 1.2327308739787075, + "grad_norm": 0.18008632957935333, + "learning_rate": 1.5273413397905142e-05, + "loss": 0.7133, + "mean_token_accuracy": 0.7930183291435242, + "num_tokens": 2589008154.0, + "step": 1245 + }, + { + "epoch": 1.2376825947016588, + "grad_norm": 0.20123504102230072, + "learning_rate": 1.5235339304642521e-05, + "loss": 0.7247, + "mean_token_accuracy": 0.7899129718542099, + "num_tokens": 2599435416.0, + "step": 1250 + }, + { + "epoch": 1.24263431542461, + "grad_norm": 0.19925780594348907, + "learning_rate": 1.5197183728137402e-05, + "loss": 0.7339, + "mean_token_accuracy": 0.787640643119812, + "num_tokens": 2609816008.0, + "step": 1255 + }, + { + "epoch": 1.2475860361475613, + "grad_norm": 0.19969385862350464, + "learning_rate": 1.5158947805009317e-05, + "loss": 0.715, + "mean_token_accuracy": 0.7925884634256363, + "num_tokens": 2620222628.0, + "step": 1260 + }, + { + "epoch": 1.2525377568705125, + "grad_norm": 0.1753593236207962, + "learning_rate": 1.5120632674271265e-05, + "loss": 0.7236, + "mean_token_accuracy": 0.790338072180748, + "num_tokens": 2630612525.0, + "step": 1265 + }, + { + "epoch": 1.2574894775934637, + "grad_norm": 0.19997967779636383, + "learning_rate": 1.5082239477295745e-05, + "loss": 0.7236, + "mean_token_accuracy": 0.7902151554822922, + "num_tokens": 2640999606.0, + "step": 1270 + }, + { + 
"epoch": 1.262441198316415, + "grad_norm": 2.163191080093384, + "learning_rate": 1.5043769357780798e-05, + "loss": 0.743, + "mean_token_accuracy": 0.7919340342283249, + "num_tokens": 2651386549.0, + "step": 1275 + }, + { + "epoch": 1.2673929190393662, + "grad_norm": 0.201646625995636, + "learning_rate": 1.5005223461715907e-05, + "loss": 0.7177, + "mean_token_accuracy": 0.7917960375547409, + "num_tokens": 2661786975.0, + "step": 1280 + }, + { + "epoch": 1.2723446397623175, + "grad_norm": 0.21226562559604645, + "learning_rate": 1.4966602937347863e-05, + "loss": 0.7316, + "mean_token_accuracy": 0.7908751904964447, + "num_tokens": 2672165640.0, + "step": 1285 + }, + { + "epoch": 1.2772963604852685, + "grad_norm": 0.20382718741893768, + "learning_rate": 1.4927908935146576e-05, + "loss": 0.7258, + "mean_token_accuracy": 0.7896455824375153, + "num_tokens": 2682549545.0, + "step": 1290 + }, + { + "epoch": 1.2822480812082198, + "grad_norm": 0.2759467661380768, + "learning_rate": 1.488914260777079e-05, + "loss": 0.7178, + "mean_token_accuracy": 0.7913758844137192, + "num_tokens": 2692955151.0, + "step": 1295 + }, + { + "epoch": 1.287199801931171, + "grad_norm": 0.18792249262332916, + "learning_rate": 1.4850305110033747e-05, + "loss": 0.7221, + "mean_token_accuracy": 0.7903568297624588, + "num_tokens": 2703360987.0, + "step": 1300 + }, + { + "epoch": 1.2921515226541223, + "grad_norm": 0.20369011163711548, + "learning_rate": 1.4811397598868789e-05, + "loss": 0.7261, + "mean_token_accuracy": 0.7895828515291214, + "num_tokens": 2713747572.0, + "step": 1305 + }, + { + "epoch": 1.2971032433770735, + "grad_norm": 0.21133604645729065, + "learning_rate": 1.4772421233294898e-05, + "loss": 0.7224, + "mean_token_accuracy": 0.7906721025705338, + "num_tokens": 2724160321.0, + "step": 1310 + }, + { + "epoch": 1.3020549641000247, + "grad_norm": 0.1944030076265335, + "learning_rate": 1.473337717438216e-05, + "loss": 0.7186, + "mean_token_accuracy": 0.7915155500173569, + "num_tokens": 
2734551031.0, + "step": 1315 + }, + { + "epoch": 1.307006684822976, + "grad_norm": 0.18083949387073517, + "learning_rate": 1.4694266585217185e-05, + "loss": 0.7232, + "mean_token_accuracy": 0.7904892802238465, + "num_tokens": 2744930879.0, + "step": 1320 + }, + { + "epoch": 1.3119584055459272, + "grad_norm": 0.18224333226680756, + "learning_rate": 1.4655090630868458e-05, + "loss": 0.7249, + "mean_token_accuracy": 0.7898193567991256, + "num_tokens": 2755301410.0, + "step": 1325 + }, + { + "epoch": 1.3169101262688785, + "grad_norm": 0.2043742537498474, + "learning_rate": 1.4615850478351637e-05, + "loss": 0.7327, + "mean_token_accuracy": 0.7872727513313293, + "num_tokens": 2765707788.0, + "step": 1330 + }, + { + "epoch": 1.3218618469918297, + "grad_norm": 0.22111938893795013, + "learning_rate": 1.4576547296594774e-05, + "loss": 0.7276, + "mean_token_accuracy": 0.7887694299221039, + "num_tokens": 2776117645.0, + "step": 1335 + }, + { + "epoch": 1.326813567714781, + "grad_norm": 0.19156336784362793, + "learning_rate": 1.4537182256403522e-05, + "loss": 0.7338, + "mean_token_accuracy": 0.7873490035533905, + "num_tokens": 2786526348.0, + "step": 1340 + }, + { + "epoch": 1.3317652884377322, + "grad_norm": 0.1943420171737671, + "learning_rate": 1.4497756530426218e-05, + "loss": 0.725, + "mean_token_accuracy": 0.789910814166069, + "num_tokens": 2796941910.0, + "step": 1345 + }, + { + "epoch": 1.3367170091606833, + "grad_norm": 0.17877770960330963, + "learning_rate": 1.4458271293118986e-05, + "loss": 0.7305, + "mean_token_accuracy": 0.7884226262569427, + "num_tokens": 2807339154.0, + "step": 1350 + }, + { + "epoch": 1.3416687298836345, + "grad_norm": 0.20378288626670837, + "learning_rate": 1.4418727720710736e-05, + "loss": 0.7261, + "mean_token_accuracy": 0.7890681326389313, + "num_tokens": 2817759971.0, + "step": 1355 + }, + { + "epoch": 1.3466204506065858, + "grad_norm": 0.19834725558757782, + "learning_rate": 1.4379126991168126e-05, + "loss": 0.7116, + 
"mean_token_accuracy": 0.7931619733572006, + "num_tokens": 2828136169.0, + "step": 1360 + }, + { + "epoch": 1.351572171329537, + "grad_norm": 0.1840638518333435, + "learning_rate": 1.4339470284160477e-05, + "loss": 0.7304, + "mean_token_accuracy": 0.7882639706134796, + "num_tokens": 2838563696.0, + "step": 1365 + }, + { + "epoch": 1.3565238920524882, + "grad_norm": 0.18524734675884247, + "learning_rate": 1.4299758781024615e-05, + "loss": 0.7088, + "mean_token_accuracy": 0.793875104188919, + "num_tokens": 2848965953.0, + "step": 1370 + }, + { + "epoch": 1.3614756127754395, + "grad_norm": 0.1821308732032776, + "learning_rate": 1.4259993664729708e-05, + "loss": 0.7234, + "mean_token_accuracy": 0.7902139544486999, + "num_tokens": 2859343681.0, + "step": 1375 + }, + { + "epoch": 1.3664273334983907, + "grad_norm": 0.1937830150127411, + "learning_rate": 1.4220176119841995e-05, + "loss": 0.7211, + "mean_token_accuracy": 0.790678608417511, + "num_tokens": 2869738016.0, + "step": 1380 + }, + { + "epoch": 1.371379054221342, + "grad_norm": 0.18466956913471222, + "learning_rate": 1.4180307332489532e-05, + "loss": 0.7228, + "mean_token_accuracy": 0.7902078241109848, + "num_tokens": 2880142465.0, + "step": 1385 + }, + { + "epoch": 1.376330774944293, + "grad_norm": 0.1925409734249115, + "learning_rate": 1.4140388490326822e-05, + "loss": 0.728, + "mean_token_accuracy": 0.7886234402656556, + "num_tokens": 2890524591.0, + "step": 1390 + }, + { + "epoch": 1.3812824956672443, + "grad_norm": 0.17395733296871185, + "learning_rate": 1.4100420782499466e-05, + "loss": 0.7159, + "mean_token_accuracy": 0.7921683073043824, + "num_tokens": 2900903230.0, + "step": 1395 + }, + { + "epoch": 1.3862342163901955, + "grad_norm": 0.17405180633068085, + "learning_rate": 1.4060405399608732e-05, + "loss": 0.716, + "mean_token_accuracy": 0.7919426709413528, + "num_tokens": 2911289063.0, + "step": 1400 + }, + { + "epoch": 1.3911859371131468, + "grad_norm": 0.17874355614185333, + "learning_rate": 
1.4020343533676076e-05, + "loss": 0.7161, + "mean_token_accuracy": 0.79238740503788, + "num_tokens": 2921712270.0, + "step": 1405 + }, + { + "epoch": 1.396137657836098, + "grad_norm": 0.19728051126003265, + "learning_rate": 1.398023637810764e-05, + "loss": 0.7098, + "mean_token_accuracy": 0.7940772980451584, + "num_tokens": 2932122921.0, + "step": 1410 + }, + { + "epoch": 1.4010893785590492, + "grad_norm": 0.20457538962364197, + "learning_rate": 1.3940085127658707e-05, + "loss": 0.7338, + "mean_token_accuracy": 0.7870628893375397, + "num_tokens": 2942516461.0, + "step": 1415 + }, + { + "epoch": 1.4060410992820005, + "grad_norm": 0.18084751069545746, + "learning_rate": 1.389989097839811e-05, + "loss": 0.7167, + "mean_token_accuracy": 0.7917933404445648, + "num_tokens": 2952889986.0, + "step": 1420 + }, + { + "epoch": 1.4109928200049517, + "grad_norm": 0.17652776837348938, + "learning_rate": 1.385965512767259e-05, + "loss": 0.7155, + "mean_token_accuracy": 0.7922806650400162, + "num_tokens": 2963310857.0, + "step": 1425 + }, + { + "epoch": 1.415944540727903, + "grad_norm": 0.18736056983470917, + "learning_rate": 1.3819378774071139e-05, + "loss": 0.7179, + "mean_token_accuracy": 0.7915126740932464, + "num_tokens": 2973715953.0, + "step": 1430 + }, + { + "epoch": 1.4208962614508542, + "grad_norm": 0.1932843029499054, + "learning_rate": 1.3779063117389297e-05, + "loss": 0.7066, + "mean_token_accuracy": 0.7942998945713043, + "num_tokens": 2984126283.0, + "step": 1435 + }, + { + "epoch": 1.4258479821738055, + "grad_norm": 0.2286568284034729, + "learning_rate": 1.3738709358593398e-05, + "loss": 0.7268, + "mean_token_accuracy": 0.789307501912117, + "num_tokens": 2994525073.0, + "step": 1440 + }, + { + "epoch": 1.4307997028967567, + "grad_norm": 0.19728349149227142, + "learning_rate": 1.3698318699784812e-05, + "loss": 0.7233, + "mean_token_accuracy": 0.7898936212062836, + "num_tokens": 3004926101.0, + "step": 1445 + }, + { + "epoch": 1.435751423619708, + "grad_norm": 
0.20021368563175201, + "learning_rate": 1.3657892344164116e-05, + "loss": 0.7093, + "mean_token_accuracy": 0.7938795328140259, + "num_tokens": 3015334391.0, + "step": 1450 + }, + { + "epoch": 1.440703144342659, + "grad_norm": 0.17851568758487701, + "learning_rate": 1.3617431495995276e-05, + "loss": 0.7241, + "mean_token_accuracy": 0.7900084257125854, + "num_tokens": 3025730657.0, + "step": 1455 + }, + { + "epoch": 1.4456548650656103, + "grad_norm": 0.17161023616790771, + "learning_rate": 1.3576937360569747e-05, + "loss": 0.729, + "mean_token_accuracy": 0.7888924747705459, + "num_tokens": 3036137389.0, + "step": 1460 + }, + { + "epoch": 1.4506065857885615, + "grad_norm": 0.19351643323898315, + "learning_rate": 1.3536411144170584e-05, + "loss": 0.7074, + "mean_token_accuracy": 0.7941448390483856, + "num_tokens": 3046543020.0, + "step": 1465 + }, + { + "epoch": 1.4555583065115127, + "grad_norm": 0.18472740054130554, + "learning_rate": 1.349585405403651e-05, + "loss": 0.7203, + "mean_token_accuracy": 0.7905257880687714, + "num_tokens": 3056948582.0, + "step": 1470 + }, + { + "epoch": 1.460510027234464, + "grad_norm": 0.1972552090883255, + "learning_rate": 1.345526729832594e-05, + "loss": 0.7299, + "mean_token_accuracy": 0.7882571071386337, + "num_tokens": 3067348021.0, + "step": 1475 + }, + { + "epoch": 1.4654617479574152, + "grad_norm": 0.19973404705524445, + "learning_rate": 1.341465208608101e-05, + "loss": 0.718, + "mean_token_accuracy": 0.7912285208702088, + "num_tokens": 3077753784.0, + "step": 1480 + }, + { + "epoch": 1.4704134686803665, + "grad_norm": 0.19882094860076904, + "learning_rate": 1.3374009627191535e-05, + "loss": 0.718, + "mean_token_accuracy": 0.7916070431470871, + "num_tokens": 3088154461.0, + "step": 1485 + }, + { + "epoch": 1.4753651894033177, + "grad_norm": 0.18356384336948395, + "learning_rate": 1.3333341132358998e-05, + "loss": 0.7102, + "mean_token_accuracy": 0.7933901071548461, + "num_tokens": 3098576272.0, + "step": 1490 + }, + { + "epoch": 
1.4803169101262688, + "grad_norm": 0.17706668376922607, + "learning_rate": 1.3292647813060462e-05, + "loss": 0.7145, + "mean_token_accuracy": 0.7923637479543686, + "num_tokens": 3108958161.0, + "step": 1495 + }, + { + "epoch": 1.48526863084922, + "grad_norm": 0.7771270275115967, + "learning_rate": 1.325193088151248e-05, + "loss": 0.7196, + "mean_token_accuracy": 0.790700152516365, + "num_tokens": 3119332234.0, + "step": 1500 + }, + { + "epoch": 1.4902203515721713, + "grad_norm": 0.1887047439813614, + "learning_rate": 1.3211191550635008e-05, + "loss": 0.7204, + "mean_token_accuracy": 0.7904630064964294, + "num_tokens": 3129741365.0, + "step": 1505 + }, + { + "epoch": 1.4951720722951225, + "grad_norm": 0.18025922775268555, + "learning_rate": 1.3170431034015242e-05, + "loss": 0.7219, + "mean_token_accuracy": 0.7903009295463562, + "num_tokens": 3140137525.0, + "step": 1510 + }, + { + "epoch": 1.5001237930180737, + "grad_norm": 0.1742526888847351, + "learning_rate": 1.31296505458715e-05, + "loss": 0.7227, + "mean_token_accuracy": 0.7903249442577363, + "num_tokens": 3150551214.0, + "step": 1515 + }, + { + "epoch": 1.505075513741025, + "grad_norm": 0.17877598106861115, + "learning_rate": 1.3088851301017012e-05, + "loss": 0.7196, + "mean_token_accuracy": 0.7908366918563843, + "num_tokens": 3160963988.0, + "step": 1520 + }, + { + "epoch": 1.5100272344639762, + "grad_norm": 0.27315303683280945, + "learning_rate": 1.3048034514823768e-05, + "loss": 0.72, + "mean_token_accuracy": 0.7911725819110871, + "num_tokens": 3171338083.0, + "step": 1525 + }, + { + "epoch": 1.5149789551869275, + "grad_norm": 0.20963208377361298, + "learning_rate": 1.3007201403186293e-05, + "loss": 0.7189, + "mean_token_accuracy": 0.7909859776496887, + "num_tokens": 3181728440.0, + "step": 1530 + }, + { + "epoch": 1.5199306759098787, + "grad_norm": 0.184099942445755, + "learning_rate": 1.2966353182485435e-05, + "loss": 0.7179, + "mean_token_accuracy": 0.7914166182279587, + "num_tokens": 3192143067.0, + 
"step": 1535 + }, + { + "epoch": 1.52488239663283, + "grad_norm": 0.24152904748916626, + "learning_rate": 1.2925491069552126e-05, + "loss": 0.8021, + "mean_token_accuracy": 0.7904890239238739, + "num_tokens": 3202562250.0, + "step": 1540 + }, + { + "epoch": 1.5298341173557812, + "grad_norm": 1.4814072847366333, + "learning_rate": 1.2884616281631129e-05, + "loss": 0.7134, + "mean_token_accuracy": 0.7926855504512786, + "num_tokens": 3212962934.0, + "step": 1545 + }, + { + "epoch": 1.5347858380787325, + "grad_norm": 1.811551809310913, + "learning_rate": 1.284373003634479e-05, + "loss": 0.8015, + "mean_token_accuracy": 0.7890555679798126, + "num_tokens": 3223382798.0, + "step": 1550 + }, + { + "epoch": 1.5397375588016837, + "grad_norm": 0.2511199414730072, + "learning_rate": 1.2802833551656764e-05, + "loss": 0.7195, + "mean_token_accuracy": 0.7906852185726165, + "num_tokens": 3233791747.0, + "step": 1555 + }, + { + "epoch": 1.544689279524635, + "grad_norm": 0.20060916244983673, + "learning_rate": 1.276192804583571e-05, + "loss": 0.7225, + "mean_token_accuracy": 0.7896364331245422, + "num_tokens": 3244165580.0, + "step": 1560 + }, + { + "epoch": 1.549641000247586, + "grad_norm": 0.20217087864875793, + "learning_rate": 1.272101473741904e-05, + "loss": 0.7316, + "mean_token_accuracy": 0.7900652945041656, + "num_tokens": 3254572677.0, + "step": 1565 + }, + { + "epoch": 1.5545927209705372, + "grad_norm": 0.20432990789413452, + "learning_rate": 1.2680094845176584e-05, + "loss": 0.726, + "mean_token_accuracy": 0.7905694574117661, + "num_tokens": 3264942215.0, + "step": 1570 + }, + { + "epoch": 1.5595444416934885, + "grad_norm": 0.1891479641199112, + "learning_rate": 1.2639169588074305e-05, + "loss": 0.7143, + "mean_token_accuracy": 0.7926017671823502, + "num_tokens": 3275326524.0, + "step": 1575 + }, + { + "epoch": 1.5644961624164397, + "grad_norm": 45.705196380615234, + "learning_rate": 1.2598240185237973e-05, + "loss": 0.738, + "mean_token_accuracy": 0.7900298535823822, + 
"num_tokens": 3285666593.0, + "step": 1580 + }, + { + "epoch": 1.569447883139391, + "grad_norm": 0.21276827156543732, + "learning_rate": 1.2557307855916864e-05, + "loss": 0.7268, + "mean_token_accuracy": 0.7888426870107651, + "num_tokens": 3296073717.0, + "step": 1585 + }, + { + "epoch": 1.574399603862342, + "grad_norm": 0.17950324714183807, + "learning_rate": 1.251637381944743e-05, + "loss": 0.7126, + "mean_token_accuracy": 0.7925331711769104, + "num_tokens": 3306500839.0, + "step": 1590 + }, + { + "epoch": 1.5793513245852933, + "grad_norm": 0.18962812423706055, + "learning_rate": 1.2475439295216968e-05, + "loss": 0.7245, + "mean_token_accuracy": 0.7894937306642532, + "num_tokens": 3316903642.0, + "step": 1595 + }, + { + "epoch": 1.5843030453082445, + "grad_norm": 0.1753559410572052, + "learning_rate": 1.2434505502627321e-05, + "loss": 0.7163, + "mean_token_accuracy": 0.7914654940366745, + "num_tokens": 3327282929.0, + "step": 1600 + }, + { + "epoch": 1.5892547660311958, + "grad_norm": 0.17054420709609985, + "learning_rate": 1.239357366105852e-05, + "loss": 0.7189, + "mean_token_accuracy": 0.7911906123161316, + "num_tokens": 3337714929.0, + "step": 1605 + }, + { + "epoch": 1.594206486754147, + "grad_norm": 0.1967150866985321, + "learning_rate": 1.2352644989832485e-05, + "loss": 0.7176, + "mean_token_accuracy": 0.7913451254367828, + "num_tokens": 3348136859.0, + "step": 1610 + }, + { + "epoch": 1.5991582074770982, + "grad_norm": 0.1789763867855072, + "learning_rate": 1.2311720708176697e-05, + "loss": 0.7157, + "mean_token_accuracy": 0.792006203532219, + "num_tokens": 3358564789.0, + "step": 1615 + }, + { + "epoch": 1.6041099282000495, + "grad_norm": 0.17656336724758148, + "learning_rate": 1.2270802035187876e-05, + "loss": 0.7106, + "mean_token_accuracy": 0.7932015925645828, + "num_tokens": 3368938593.0, + "step": 1620 + }, + { + "epoch": 1.6090616489230007, + "grad_norm": 0.19095465540885925, + "learning_rate": 1.2229890189795659e-05, + "loss": 0.7113, + 
"mean_token_accuracy": 0.7929900795221329, + "num_tokens": 3379336222.0, + "step": 1625 + }, + { + "epoch": 1.614013369645952, + "grad_norm": 0.18307463824748993, + "learning_rate": 1.2188986390726293e-05, + "loss": 0.7164, + "mean_token_accuracy": 0.792149567604065, + "num_tokens": 3389758319.0, + "step": 1630 + }, + { + "epoch": 1.6189650903689032, + "grad_norm": 0.18088825047016144, + "learning_rate": 1.2148091856466348e-05, + "loss": 0.7123, + "mean_token_accuracy": 0.7924705147743225, + "num_tokens": 3400160230.0, + "step": 1635 + }, + { + "epoch": 1.6239168110918545, + "grad_norm": 0.1794070303440094, + "learning_rate": 1.2107207805226388e-05, + "loss": 0.7191, + "mean_token_accuracy": 0.791198554635048, + "num_tokens": 3410521557.0, + "step": 1640 + }, + { + "epoch": 1.6288685318148057, + "grad_norm": 0.17816896736621857, + "learning_rate": 1.20663354549047e-05, + "loss": 0.7254, + "mean_token_accuracy": 0.7890619933605194, + "num_tokens": 3420912198.0, + "step": 1645 + }, + { + "epoch": 1.633820252537757, + "grad_norm": 0.1971050202846527, + "learning_rate": 1.2025476023051022e-05, + "loss": 0.7218, + "mean_token_accuracy": 0.7902753293514252, + "num_tokens": 3431305496.0, + "step": 1650 + }, + { + "epoch": 1.6387719732607082, + "grad_norm": 0.19463616609573364, + "learning_rate": 1.1984630726830245e-05, + "loss": 0.7125, + "mean_token_accuracy": 0.7927158504724503, + "num_tokens": 3441730859.0, + "step": 1655 + }, + { + "epoch": 1.6437236939836595, + "grad_norm": 0.19123196601867676, + "learning_rate": 1.194380078298619e-05, + "loss": 0.7154, + "mean_token_accuracy": 0.7916401147842407, + "num_tokens": 3452131219.0, + "step": 1660 + }, + { + "epoch": 1.6486754147066105, + "grad_norm": 0.1910821497440338, + "learning_rate": 1.1902987407805338e-05, + "loss": 0.7188, + "mean_token_accuracy": 0.7912047028541564, + "num_tokens": 3462547360.0, + "step": 1665 + }, + { + "epoch": 1.6536271354295617, + "grad_norm": 0.18839426338672638, + "learning_rate": 
1.1862191817080597e-05, + "loss": 0.7172, + "mean_token_accuracy": 0.791377791762352, + "num_tokens": 3472954780.0, + "step": 1670 + }, + { + "epoch": 1.658578856152513, + "grad_norm": 0.189104825258255, + "learning_rate": 1.1821415226075104e-05, + "loss": 0.7141, + "mean_token_accuracy": 0.7924461841583252, + "num_tokens": 3483377399.0, + "step": 1675 + }, + { + "epoch": 1.6635305768754642, + "grad_norm": 0.18031081557273865, + "learning_rate": 1.1780658849486011e-05, + "loss": 0.722, + "mean_token_accuracy": 0.7899476200342178, + "num_tokens": 3493799556.0, + "step": 1680 + }, + { + "epoch": 1.6684822975984155, + "grad_norm": 0.18072226643562317, + "learning_rate": 1.1739923901408292e-05, + "loss": 0.7066, + "mean_token_accuracy": 0.7940356642007828, + "num_tokens": 3504215269.0, + "step": 1685 + }, + { + "epoch": 1.6734340183213665, + "grad_norm": 0.2156570702791214, + "learning_rate": 1.1699211595298589e-05, + "loss": 0.7312, + "mean_token_accuracy": 0.7876215547323226, + "num_tokens": 3514619987.0, + "step": 1690 + }, + { + "epoch": 1.6783857390443178, + "grad_norm": 0.18053479492664337, + "learning_rate": 1.1658523143939073e-05, + "loss": 0.7179, + "mean_token_accuracy": 0.7913117349147797, + "num_tokens": 3525029614.0, + "step": 1695 + }, + { + "epoch": 1.683337459767269, + "grad_norm": 0.17777222394943237, + "learning_rate": 1.1617859759401282e-05, + "loss": 0.7248, + "mean_token_accuracy": 0.7891001671552658, + "num_tokens": 3535423120.0, + "step": 1700 + }, + { + "epoch": 1.6882891804902203, + "grad_norm": 0.1804647594690323, + "learning_rate": 1.1577222653010054e-05, + "loss": 0.699, + "mean_token_accuracy": 0.7962267965078353, + "num_tokens": 3545809893.0, + "step": 1705 + }, + { + "epoch": 1.6932409012131715, + "grad_norm": 0.177909716963768, + "learning_rate": 1.1536613035307416e-05, + "loss": 0.7351, + "mean_token_accuracy": 0.7915727466344833, + "num_tokens": 3556189451.0, + "step": 1710 + }, + { + "epoch": 1.6981926219361227, + "grad_norm": 
0.19596891105175018, + "learning_rate": 1.1496032116016536e-05, + "loss": 0.7146, + "mean_token_accuracy": 0.7918591350317001, + "num_tokens": 3566604450.0, + "step": 1715 + }, + { + "epoch": 1.703144342659074, + "grad_norm": 0.1962021440267563, + "learning_rate": 1.1455481104005682e-05, + "loss": 0.7174, + "mean_token_accuracy": 0.79123954474926, + "num_tokens": 3577021485.0, + "step": 1720 + }, + { + "epoch": 1.7080960633820252, + "grad_norm": 0.18883569538593292, + "learning_rate": 1.1414961207252215e-05, + "loss": 0.7258, + "mean_token_accuracy": 0.788951313495636, + "num_tokens": 3587445493.0, + "step": 1725 + }, + { + "epoch": 1.7130477841049765, + "grad_norm": 0.8343937397003174, + "learning_rate": 1.137447363280659e-05, + "loss": 0.7242, + "mean_token_accuracy": 0.7895885169506073, + "num_tokens": 3597840209.0, + "step": 1730 + }, + { + "epoch": 1.7179995048279277, + "grad_norm": 1.3068397045135498, + "learning_rate": 1.1334019586756423e-05, + "loss": 0.725, + "mean_token_accuracy": 0.7885597527027131, + "num_tokens": 3608260465.0, + "step": 1735 + }, + { + "epoch": 1.722951225550879, + "grad_norm": 0.38125309348106384, + "learning_rate": 1.1293600274190548e-05, + "loss": 0.7258, + "mean_token_accuracy": 0.7903095871210098, + "num_tokens": 3618641232.0, + "step": 1740 + }, + { + "epoch": 1.7279029462738302, + "grad_norm": 0.19886992871761322, + "learning_rate": 1.125321689916311e-05, + "loss": 0.7226, + "mean_token_accuracy": 0.7916781276464462, + "num_tokens": 3629053078.0, + "step": 1745 + }, + { + "epoch": 1.7328546669967815, + "grad_norm": 0.2138087898492813, + "learning_rate": 1.1212870664657718e-05, + "loss": 0.7066, + "mean_token_accuracy": 0.7945046842098236, + "num_tokens": 3639448411.0, + "step": 1750 + }, + { + "epoch": 1.7378063877197327, + "grad_norm": 0.8190533518791199, + "learning_rate": 1.1172562772551598e-05, + "loss": 0.7087, + "mean_token_accuracy": 0.7934040397405624, + "num_tokens": 3649820357.0, + "step": 1755 + }, + { + "epoch": 
1.742758108442684, + "grad_norm": 0.2707054316997528, + "learning_rate": 1.1132294423579786e-05, + "loss": 0.7295, + "mean_token_accuracy": 0.78960200548172, + "num_tokens": 3660192793.0, + "step": 1760 + }, + { + "epoch": 1.7477098291656352, + "grad_norm": 0.21164794266223907, + "learning_rate": 1.1092066817299369e-05, + "loss": 0.7005, + "mean_token_accuracy": 0.7957571446895599, + "num_tokens": 3670581943.0, + "step": 1765 + }, + { + "epoch": 1.7526615498885862, + "grad_norm": 0.18912960588932037, + "learning_rate": 1.1051881152053747e-05, + "loss": 0.7226, + "mean_token_accuracy": 0.789606511592865, + "num_tokens": 3680945884.0, + "step": 1770 + }, + { + "epoch": 1.7576132706115375, + "grad_norm": 0.17918053269386292, + "learning_rate": 1.1011738624936928e-05, + "loss": 0.715, + "mean_token_accuracy": 0.7915814012289047, + "num_tokens": 3691335157.0, + "step": 1775 + }, + { + "epoch": 1.7625649913344887, + "grad_norm": 0.16619303822517395, + "learning_rate": 1.0971640431757887e-05, + "loss": 0.7049, + "mean_token_accuracy": 0.7950347632169723, + "num_tokens": 3701768397.0, + "step": 1780 + }, + { + "epoch": 1.76751671205744, + "grad_norm": 0.18657127022743225, + "learning_rate": 1.0931587767004919e-05, + "loss": 0.7103, + "mean_token_accuracy": 0.792952173948288, + "num_tokens": 3712191467.0, + "step": 1785 + }, + { + "epoch": 1.7724684327803912, + "grad_norm": 0.1765134632587433, + "learning_rate": 1.089158182381008e-05, + "loss": 0.7078, + "mean_token_accuracy": 0.7937531501054764, + "num_tokens": 3722562525.0, + "step": 1790 + }, + { + "epoch": 1.7774201535033423, + "grad_norm": 1.2603615522384644, + "learning_rate": 1.0851623793913623e-05, + "loss": 0.7374, + "mean_token_accuracy": 0.7894641309976578, + "num_tokens": 3732974589.0, + "step": 1795 + }, + { + "epoch": 1.7823718742262935, + "grad_norm": 0.17135503888130188, + "learning_rate": 1.081171486762852e-05, + "loss": 0.7159, + "mean_token_accuracy": 0.7918219327926636, + "num_tokens": 3743352129.0, + 
"step": 1800 + }, + { + "epoch": 1.7873235949492448, + "grad_norm": 0.1757970154285431, + "learning_rate": 1.077185623380498e-05, + "loss": 0.7071, + "mean_token_accuracy": 0.7942558020353317, + "num_tokens": 3753772930.0, + "step": 1805 + }, + { + "epoch": 1.792275315672196, + "grad_norm": 0.19564303755760193, + "learning_rate": 1.0732049079795055e-05, + "loss": 0.7025, + "mean_token_accuracy": 0.7953001827001571, + "num_tokens": 3764169376.0, + "step": 1810 + }, + { + "epoch": 1.7972270363951472, + "grad_norm": 0.1925610452890396, + "learning_rate": 1.0692294591417258e-05, + "loss": 0.7257, + "mean_token_accuracy": 0.7887873202562332, + "num_tokens": 3774596373.0, + "step": 1815 + }, + { + "epoch": 1.8021787571180985, + "grad_norm": 0.17859354615211487, + "learning_rate": 1.0652593952921233e-05, + "loss": 0.7181, + "mean_token_accuracy": 0.791176849603653, + "num_tokens": 3784988118.0, + "step": 1820 + }, + { + "epoch": 1.8071304778410497, + "grad_norm": 0.16981032490730286, + "learning_rate": 1.0612948346952496e-05, + "loss": 0.7166, + "mean_token_accuracy": 0.7915939807891845, + "num_tokens": 3795380218.0, + "step": 1825 + }, + { + "epoch": 1.812082198564001, + "grad_norm": 0.1762877255678177, + "learning_rate": 1.0573358954517196e-05, + "loss": 0.7168, + "mean_token_accuracy": 0.7909587293863296, + "num_tokens": 3805800553.0, + "step": 1830 + }, + { + "epoch": 1.8170339192869522, + "grad_norm": 0.17652231454849243, + "learning_rate": 1.053382695494692e-05, + "loss": 0.7102, + "mean_token_accuracy": 0.7932070910930633, + "num_tokens": 3816185346.0, + "step": 1835 + }, + { + "epoch": 1.8219856400099035, + "grad_norm": 0.1817074567079544, + "learning_rate": 1.0494353525863585e-05, + "loss": 0.7217, + "mean_token_accuracy": 0.7901755303144455, + "num_tokens": 3826596047.0, + "step": 1840 + }, + { + "epoch": 1.8269373607328547, + "grad_norm": 0.17128807306289673, + "learning_rate": 1.0454939843144347e-05, + "loss": 0.7118, + "mean_token_accuracy": 
0.7924787253141403, + "num_tokens": 3837027683.0, + "step": 1845 + }, + { + "epoch": 1.831889081455806, + "grad_norm": 0.17962965369224548, + "learning_rate": 1.0415587080886566e-05, + "loss": 0.7098, + "mean_token_accuracy": 0.7930670648813247, + "num_tokens": 3847429754.0, + "step": 1850 + }, + { + "epoch": 1.8368408021787572, + "grad_norm": 1.3683093786239624, + "learning_rate": 1.0376296411372842e-05, + "loss": 0.7515, + "mean_token_accuracy": 0.7930996656417847, + "num_tokens": 3857793627.0, + "step": 1855 + }, + { + "epoch": 1.8417925229017085, + "grad_norm": 0.1839543879032135, + "learning_rate": 1.0337069005036092e-05, + "loss": 0.7227, + "mean_token_accuracy": 0.7897769540548325, + "num_tokens": 3868206908.0, + "step": 1860 + }, + { + "epoch": 1.8467442436246597, + "grad_norm": 0.18110202252864838, + "learning_rate": 1.0297906030424673e-05, + "loss": 0.704, + "mean_token_accuracy": 0.7950334936380387, + "num_tokens": 3878616740.0, + "step": 1865 + }, + { + "epoch": 1.8516959643476107, + "grad_norm": 0.17166997492313385, + "learning_rate": 1.0258808654167587e-05, + "loss": 0.7203, + "mean_token_accuracy": 0.7901737481355667, + "num_tokens": 3889017125.0, + "step": 1870 + }, + { + "epoch": 1.856647685070562, + "grad_norm": 0.19010046124458313, + "learning_rate": 1.0219778040939721e-05, + "loss": 0.7127, + "mean_token_accuracy": 0.7925639122724533, + "num_tokens": 3899392222.0, + "step": 1875 + }, + { + "epoch": 1.8615994057935132, + "grad_norm": 0.18874190747737885, + "learning_rate": 1.0180815353427145e-05, + "loss": 0.7226, + "mean_token_accuracy": 0.7896841555833817, + "num_tokens": 3909822575.0, + "step": 1880 + }, + { + "epoch": 1.8665511265164645, + "grad_norm": 0.18219928443431854, + "learning_rate": 1.0141921752292496e-05, + "loss": 0.7173, + "mean_token_accuracy": 0.7909169375896454, + "num_tokens": 3920246298.0, + "step": 1885 + }, + { + "epoch": 1.8715028472394157, + "grad_norm": 0.1604100465774536, + "learning_rate": 1.0103098396140385e-05, + 
"loss": 0.7144, + "mean_token_accuracy": 0.7919532001018524, + "num_tokens": 3930666935.0, + "step": 1890 + }, + { + "epoch": 1.8764545679623668, + "grad_norm": 0.18080103397369385, + "learning_rate": 1.0064346441482886e-05, + "loss": 0.7215, + "mean_token_accuracy": 0.7898438304662705, + "num_tokens": 3941086913.0, + "step": 1895 + }, + { + "epoch": 1.881406288685318, + "grad_norm": 0.1699831187725067, + "learning_rate": 1.0025667042705098e-05, + "loss": 0.713, + "mean_token_accuracy": 0.7922929584980011, + "num_tokens": 3951506026.0, + "step": 1900 + }, + { + "epoch": 1.8863580094082693, + "grad_norm": 0.1767495721578598, + "learning_rate": 9.98706135203074e-06, + "loss": 0.7198, + "mean_token_accuracy": 0.7899861186742783, + "num_tokens": 3961934535.0, + "step": 1905 + }, + { + "epoch": 1.8913097301312205, + "grad_norm": 0.1642487794160843, + "learning_rate": 9.948530519487834e-06, + "loss": 0.7037, + "mean_token_accuracy": 0.7950653046369552, + "num_tokens": 3972338304.0, + "step": 1910 + }, + { + "epoch": 1.8962614508541717, + "grad_norm": 0.1722268909215927, + "learning_rate": 9.910075692874449e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.7902722954750061, + "num_tokens": 3982719240.0, + "step": 1915 + }, + { + "epoch": 1.901213171577123, + "grad_norm": 0.19566771388053894, + "learning_rate": 9.871698017724509e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.7892561614513397, + "num_tokens": 3993108969.0, + "step": 1920 + }, + { + "epoch": 1.9061648923000742, + "grad_norm": 0.18446719646453857, + "learning_rate": 9.833398637273662e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7935942828655242, + "num_tokens": 4003510049.0, + "step": 1925 + }, + { + "epoch": 1.9111166130230255, + "grad_norm": 0.20839688181877136, + "learning_rate": 9.795178692425235e-06, + "loss": 0.7053, + "mean_token_accuracy": 0.7952834218740463, + "num_tokens": 4013918937.0, + "step": 1930 + }, + { + "epoch": 1.9160683337459767, + "grad_norm": 0.22153820097446442, + 
"learning_rate": 9.757039321716239e-06, + "loss": 0.7209, + "mean_token_accuracy": 0.7898961067199707, + "num_tokens": 4024319886.0, + "step": 1935 + }, + { + "epoch": 1.921020054468928, + "grad_norm": 0.19979068636894226, + "learning_rate": 9.718981661283451e-06, + "loss": 0.7268, + "mean_token_accuracy": 0.7883312404155731, + "num_tokens": 4034745853.0, + "step": 1940 + }, + { + "epoch": 1.9259717751918792, + "grad_norm": 0.2004430890083313, + "learning_rate": 9.681006844829582e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7920163214206696, + "num_tokens": 4045177229.0, + "step": 1945 + }, + { + "epoch": 1.9309234959148305, + "grad_norm": 0.1783420294523239, + "learning_rate": 9.643116003589491e-06, + "loss": 0.7096, + "mean_token_accuracy": 0.7934363096952438, + "num_tokens": 4055559410.0, + "step": 1950 + }, + { + "epoch": 1.9358752166377817, + "grad_norm": 0.18284736573696136, + "learning_rate": 9.605310266296491e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.7920771360397338, + "num_tokens": 4065916546.0, + "step": 1955 + }, + { + "epoch": 1.940826937360733, + "grad_norm": 0.16877685487270355, + "learning_rate": 9.567590759148738e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.7910897344350815, + "num_tokens": 4076304579.0, + "step": 1960 + }, + { + "epoch": 1.9457786580836842, + "grad_norm": 0.17678384482860565, + "learning_rate": 9.529958605775658e-06, + "loss": 0.7065, + "mean_token_accuracy": 0.7940395891666412, + "num_tokens": 4086724782.0, + "step": 1965 + }, + { + "epoch": 1.9507303788066355, + "grad_norm": 0.1765850931406021, + "learning_rate": 9.492414927204496e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.7888664186000824, + "num_tokens": 4097094346.0, + "step": 1970 + }, + { + "epoch": 1.9556820995295865, + "grad_norm": 2.5579123497009277, + "learning_rate": 9.454960841826909e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.7934763431549072, + "num_tokens": 4107490115.0, + "step": 1975 + }, + { + "epoch": 1.9606338202525377, + 
"grad_norm": 0.20106090605258942, + "learning_rate": 9.417597465365664e-06, + "loss": 0.716, + "mean_token_accuracy": 0.7909161776304245, + "num_tokens": 4117890651.0, + "step": 1980 + }, + { + "epoch": 1.965585540975489, + "grad_norm": 0.1879979968070984, + "learning_rate": 9.38032591084138e-06, + "loss": 0.7131, + "mean_token_accuracy": 0.7916456311941147, + "num_tokens": 4128290404.0, + "step": 1985 + }, + { + "epoch": 1.9705372616984402, + "grad_norm": 0.1819096952676773, + "learning_rate": 9.343147288539396e-06, + "loss": 0.7124, + "mean_token_accuracy": 0.7923238337039947, + "num_tokens": 4138709424.0, + "step": 1990 + }, + { + "epoch": 1.9754889824213915, + "grad_norm": 0.1816423237323761, + "learning_rate": 9.306062705976678e-06, + "loss": 0.7298, + "mean_token_accuracy": 0.7894536226987838, + "num_tokens": 4149127478.0, + "step": 1995 + }, + { + "epoch": 1.9804407031443425, + "grad_norm": 0.17591971158981323, + "learning_rate": 9.26907326786884e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7897718787193299, + "num_tokens": 4159511005.0, + "step": 2000 + }, + { + "epoch": 1.9853924238672938, + "grad_norm": 0.17198185622692108, + "learning_rate": 9.232180076097231e-06, + "loss": 0.7188, + "mean_token_accuracy": 0.7906899690628052, + "num_tokens": 4169924279.0, + "step": 2005 + }, + { + "epoch": 1.990344144590245, + "grad_norm": 0.19029901921749115, + "learning_rate": 9.195384229676104e-06, + "loss": 0.7204, + "mean_token_accuracy": 0.7901058614253997, + "num_tokens": 4180294171.0, + "step": 2010 + }, + { + "epoch": 1.9952958653131962, + "grad_norm": 9.313966751098633, + "learning_rate": 9.158686824719886e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.7914641767740249, + "num_tokens": 4190674704.0, + "step": 2015 + }, + { + "epoch": 2.0, + "grad_norm": 0.20199906826019287, + "learning_rate": 9.122088954410527e-06, + "loss": 0.719, + "mean_token_accuracy": 0.7905858504144769, + "num_tokens": 4200567795.0, + "step": 2020 + }, + { + "epoch": 
2.0049517207229512, + "grad_norm": 0.2077370285987854, + "learning_rate": 9.085591708964929e-06, + "loss": 0.6948, + "mean_token_accuracy": 0.7967542797327042, + "num_tokens": 4210992609.0, + "step": 2025 + }, + { + "epoch": 2.0099034414459025, + "grad_norm": 0.20394857227802277, + "learning_rate": 9.049196175602463e-06, + "loss": 0.6952, + "mean_token_accuracy": 0.7959148943424225, + "num_tokens": 4221400809.0, + "step": 2030 + }, + { + "epoch": 2.0148551621688537, + "grad_norm": 0.1789093166589737, + "learning_rate": 9.012903438512609e-06, + "loss": 0.6889, + "mean_token_accuracy": 0.7977543532848358, + "num_tokens": 4231784819.0, + "step": 2035 + }, + { + "epoch": 2.019806882891805, + "grad_norm": 0.18632952868938446, + "learning_rate": 8.97671457882263e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.7952105075120925, + "num_tokens": 4242174103.0, + "step": 2040 + }, + { + "epoch": 2.0247586036147562, + "grad_norm": 0.1823669672012329, + "learning_rate": 8.940630674565371e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.7960813373327256, + "num_tokens": 4252592838.0, + "step": 2045 + }, + { + "epoch": 2.0297103243377075, + "grad_norm": 0.16959789395332336, + "learning_rate": 8.904652800647158e-06, + "loss": 0.6833, + "mean_token_accuracy": 0.7990966290235519, + "num_tokens": 4263004435.0, + "step": 2050 + }, + { + "epoch": 2.0346620450606587, + "grad_norm": 0.16897161304950714, + "learning_rate": 8.86878202881578e-06, + "loss": 0.688, + "mean_token_accuracy": 0.7975217163562774, + "num_tokens": 4273410737.0, + "step": 2055 + }, + { + "epoch": 2.03961376578361, + "grad_norm": 0.189055398106575, + "learning_rate": 8.833019427628531e-06, + "loss": 0.6828, + "mean_token_accuracy": 0.7994821727275848, + "num_tokens": 4283826894.0, + "step": 2060 + }, + { + "epoch": 2.0445654865065612, + "grad_norm": 0.18197213113307953, + "learning_rate": 8.797366062420416e-06, + "loss": 0.6898, + "mean_token_accuracy": 0.7973693281412124, + "num_tokens": 4294236593.0, + "step": 
2065 + }, + { + "epoch": 2.0495172072295125, + "grad_norm": 0.18738311529159546, + "learning_rate": 8.761822995272407e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.7976522415876388, + "num_tokens": 4304594243.0, + "step": 2070 + }, + { + "epoch": 2.0544689279524633, + "grad_norm": 0.19521503150463104, + "learning_rate": 8.726391284979782e-06, + "loss": 0.692, + "mean_token_accuracy": 0.7969583392143249, + "num_tokens": 4315024587.0, + "step": 2075 + }, + { + "epoch": 2.0594206486754145, + "grad_norm": 0.18976199626922607, + "learning_rate": 8.691071987020608e-06, + "loss": 0.6901, + "mean_token_accuracy": 0.7970456808805466, + "num_tokens": 4325447239.0, + "step": 2080 + }, + { + "epoch": 2.0643723693983658, + "grad_norm": 0.17422591149806976, + "learning_rate": 8.655866153524293e-06, + "loss": 0.6846, + "mean_token_accuracy": 0.7987291067838669, + "num_tokens": 4335838712.0, + "step": 2085 + }, + { + "epoch": 2.069324090121317, + "grad_norm": 0.18757224082946777, + "learning_rate": 8.620774833240242e-06, + "loss": 0.692, + "mean_token_accuracy": 0.7963369220495224, + "num_tokens": 4346236727.0, + "step": 2090 + }, + { + "epoch": 2.0742758108442683, + "grad_norm": 0.17361269891262054, + "learning_rate": 8.585799071506617e-06, + "loss": 0.6938, + "mean_token_accuracy": 0.7964360475540161, + "num_tokens": 4356601719.0, + "step": 2095 + }, + { + "epoch": 2.0792275315672195, + "grad_norm": 0.18412835896015167, + "learning_rate": 8.550939910219198e-06, + "loss": 0.6917, + "mean_token_accuracy": 0.7968982100486756, + "num_tokens": 4367011033.0, + "step": 2100 + }, + { + "epoch": 2.0841792522901708, + "grad_norm": 0.17574380338191986, + "learning_rate": 8.516198387800341e-06, + "loss": 0.6954, + "mean_token_accuracy": 0.7958818316459656, + "num_tokens": 4377372385.0, + "step": 2105 + }, + { + "epoch": 2.089130973013122, + "grad_norm": 0.1881859004497528, + "learning_rate": 8.481575539168052e-06, + "loss": 0.6886, + "mean_token_accuracy": 0.7973954766988754, + 
"num_tokens": 4387782302.0, + "step": 2110 + }, + { + "epoch": 2.0940826937360733, + "grad_norm": 0.17383810877799988, + "learning_rate": 8.447072395705154e-06, + "loss": 0.6877, + "mean_token_accuracy": 0.7977871656417846, + "num_tokens": 4398182888.0, + "step": 2115 + }, + { + "epoch": 2.0990344144590245, + "grad_norm": 0.18243476748466492, + "learning_rate": 8.412689985228561e-06, + "loss": 0.7013, + "mean_token_accuracy": 0.7934758394956589, + "num_tokens": 4408560198.0, + "step": 2120 + }, + { + "epoch": 2.1039861351819757, + "grad_norm": 0.1822386533021927, + "learning_rate": 8.378429331958663e-06, + "loss": 0.6851, + "mean_token_accuracy": 0.7983981937170028, + "num_tokens": 4418974578.0, + "step": 2125 + }, + { + "epoch": 2.108937855904927, + "grad_norm": 0.1714823991060257, + "learning_rate": 8.34429145648882e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7948953956365585, + "num_tokens": 4429365226.0, + "step": 2130 + }, + { + "epoch": 2.1138895766278782, + "grad_norm": 0.1750471293926239, + "learning_rate": 8.310277375754938e-06, + "loss": 0.6984, + "mean_token_accuracy": 0.7947075188159942, + "num_tokens": 4439768423.0, + "step": 2135 + }, + { + "epoch": 2.1188412973508295, + "grad_norm": 0.19049614667892456, + "learning_rate": 8.276388103005215e-06, + "loss": 0.6953, + "mean_token_accuracy": 0.7956634342670441, + "num_tokens": 4450165574.0, + "step": 2140 + }, + { + "epoch": 2.1237930180737807, + "grad_norm": 2.0906457901000977, + "learning_rate": 8.24262464776992e-06, + "loss": 0.6873, + "mean_token_accuracy": 0.7988680183887482, + "num_tokens": 4460582540.0, + "step": 2145 + }, + { + "epoch": 2.128744738796732, + "grad_norm": 0.1667407751083374, + "learning_rate": 8.208988015831328e-06, + "loss": 0.6935, + "mean_token_accuracy": 0.7962657779455184, + "num_tokens": 4471001266.0, + "step": 2150 + }, + { + "epoch": 2.1336964595196832, + "grad_norm": 0.17646174132823944, + "learning_rate": 8.17547920919378e-06, + "loss": 0.7014, + 
"mean_token_accuracy": 0.7941409230232239, + "num_tokens": 4481422686.0, + "step": 2155 + }, + { + "epoch": 2.1386481802426345, + "grad_norm": 0.1852479875087738, + "learning_rate": 8.14209922605381e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7935562491416931, + "num_tokens": 4491809382.0, + "step": 2160 + }, + { + "epoch": 2.1435999009655857, + "grad_norm": 0.22626833617687225, + "learning_rate": 8.108849060770418e-06, + "loss": 0.68, + "mean_token_accuracy": 0.7999586701393128, + "num_tokens": 4502218692.0, + "step": 2165 + }, + { + "epoch": 2.148551621688537, + "grad_norm": 0.1789206862449646, + "learning_rate": 8.075729703835452e-06, + "loss": 0.6891, + "mean_token_accuracy": 0.797278082370758, + "num_tokens": 4512642434.0, + "step": 2170 + }, + { + "epoch": 2.153503342411488, + "grad_norm": 0.17580756545066833, + "learning_rate": 8.042742141844102e-06, + "loss": 0.6885, + "mean_token_accuracy": 0.7973107606172561, + "num_tokens": 4523055194.0, + "step": 2175 + }, + { + "epoch": 2.158455063134439, + "grad_norm": 0.16940629482269287, + "learning_rate": 8.0098873574655e-06, + "loss": 0.7002, + "mean_token_accuracy": 0.7944120824337005, + "num_tokens": 4533448427.0, + "step": 2180 + }, + { + "epoch": 2.1634067838573903, + "grad_norm": 0.17267808318138123, + "learning_rate": 7.977166329413462e-06, + "loss": 0.7099, + "mean_token_accuracy": 0.7920106083154679, + "num_tokens": 4543838846.0, + "step": 2185 + }, + { + "epoch": 2.1683585045803415, + "grad_norm": 0.1663428246974945, + "learning_rate": 7.944580032417327e-06, + "loss": 0.6889, + "mean_token_accuracy": 0.7975029587745667, + "num_tokens": 4554246437.0, + "step": 2190 + }, + { + "epoch": 2.1733102253032928, + "grad_norm": 0.1794314682483673, + "learning_rate": 7.912129437192911e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7946099638938904, + "num_tokens": 4564611339.0, + "step": 2195 + }, + { + "epoch": 2.178261946026244, + "grad_norm": 0.17238175868988037, + "learning_rate": 7.879815510413607e-06, 
+ "loss": 0.691, + "mean_token_accuracy": 0.7968462198972702, + "num_tokens": 4575030580.0, + "step": 2200 + }, + { + "epoch": 2.1832136667491953, + "grad_norm": 0.17889876663684845, + "learning_rate": 7.847639214681575e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.7952412754297257, + "num_tokens": 4585447968.0, + "step": 2205 + }, + { + "epoch": 2.1881653874721465, + "grad_norm": 0.17377404868602753, + "learning_rate": 7.815601508499082e-06, + "loss": 0.6864, + "mean_token_accuracy": 0.7982388496398926, + "num_tokens": 4595852149.0, + "step": 2210 + }, + { + "epoch": 2.1931171081950978, + "grad_norm": 0.16488561034202576, + "learning_rate": 7.783703346239931e-06, + "loss": 0.6877, + "mean_token_accuracy": 0.7979268878698349, + "num_tokens": 4606229002.0, + "step": 2215 + }, + { + "epoch": 2.198068828918049, + "grad_norm": 0.17260773479938507, + "learning_rate": 7.75194567812104e-06, + "loss": 0.6918, + "mean_token_accuracy": 0.796611812710762, + "num_tokens": 4616635865.0, + "step": 2220 + }, + { + "epoch": 2.2030205496410002, + "grad_norm": 0.19267019629478455, + "learning_rate": 7.720329450174143e-06, + "loss": 0.6893, + "mean_token_accuracy": 0.7973758280277252, + "num_tokens": 4627053505.0, + "step": 2225 + }, + { + "epoch": 2.2079722703639515, + "grad_norm": 0.17944496870040894, + "learning_rate": 7.68885560421759e-06, + "loss": 0.693, + "mean_token_accuracy": 0.795927032828331, + "num_tokens": 4637478677.0, + "step": 2230 + }, + { + "epoch": 2.2129239910869027, + "grad_norm": 0.17361298203468323, + "learning_rate": 7.657525077828317e-06, + "loss": 0.6875, + "mean_token_accuracy": 0.7980309188365936, + "num_tokens": 4647889813.0, + "step": 2235 + }, + { + "epoch": 2.217875711809854, + "grad_norm": 0.18098437786102295, + "learning_rate": 7.626338804313888e-06, + "loss": 0.7123, + "mean_token_accuracy": 0.7962464898824692, + "num_tokens": 4658293288.0, + "step": 2240 + }, + { + "epoch": 2.2228274325328052, + "grad_norm": 0.1930907964706421, + 
"learning_rate": 7.595297712684715e-06, + "loss": 0.6855, + "mean_token_accuracy": 0.7985998451709747, + "num_tokens": 4668667335.0, + "step": 2245 + }, + { + "epoch": 2.2277791532557565, + "grad_norm": 0.17226119339466095, + "learning_rate": 7.564402727626374e-06, + "loss": 0.6877, + "mean_token_accuracy": 0.7977766156196594, + "num_tokens": 4679079824.0, + "step": 2250 + }, + { + "epoch": 2.2327308739787077, + "grad_norm": 0.17223741114139557, + "learning_rate": 7.533654769472053e-06, + "loss": 0.6945, + "mean_token_accuracy": 0.7960464268922806, + "num_tokens": 4689460943.0, + "step": 2255 + }, + { + "epoch": 2.237682594701659, + "grad_norm": 0.1818878948688507, + "learning_rate": 7.5030547541751564e-06, + "loss": 0.6962, + "mean_token_accuracy": 0.7953739076852798, + "num_tokens": 4699876523.0, + "step": 2260 + }, + { + "epoch": 2.2426343154246102, + "grad_norm": 0.1681259423494339, + "learning_rate": 7.472603593281999e-06, + "loss": 0.7003, + "mean_token_accuracy": 0.7943088680505752, + "num_tokens": 4710260413.0, + "step": 2265 + }, + { + "epoch": 2.2475860361475615, + "grad_norm": 0.16561447083950043, + "learning_rate": 7.442302193904658e-06, + "loss": 0.6824, + "mean_token_accuracy": 0.7992607623338699, + "num_tokens": 4720663420.0, + "step": 2270 + }, + { + "epoch": 2.2525377568705123, + "grad_norm": 0.16674137115478516, + "learning_rate": 7.4121514586939595e-06, + "loss": 0.6907, + "mean_token_accuracy": 0.7970253258943558, + "num_tokens": 4731065687.0, + "step": 2275 + }, + { + "epoch": 2.257489477593464, + "grad_norm": 0.18529455363750458, + "learning_rate": 7.3821522858125806e-06, + "loss": 0.6839, + "mean_token_accuracy": 0.798730805516243, + "num_tokens": 4741479922.0, + "step": 2280 + }, + { + "epoch": 2.2624411983164148, + "grad_norm": 0.17490680515766144, + "learning_rate": 7.35230556890829e-06, + "loss": 0.6796, + "mean_token_accuracy": 0.8002272933721543, + "num_tokens": 4751874078.0, + "step": 2285 + }, + { + "epoch": 2.267392919039366, + 
"grad_norm": 0.18105871975421906, + "learning_rate": 7.322612197087338e-06, + "loss": 0.7016, + "mean_token_accuracy": 0.7940381079912185, + "num_tokens": 4762257864.0, + "step": 2290 + }, + { + "epoch": 2.2723446397623173, + "grad_norm": 0.1849709004163742, + "learning_rate": 7.2930730548879705e-06, + "loss": 0.6884, + "mean_token_accuracy": 0.7976529210805893, + "num_tokens": 4772666074.0, + "step": 2295 + }, + { + "epoch": 2.2772963604852685, + "grad_norm": 0.1883031725883484, + "learning_rate": 7.263689022254065e-06, + "loss": 0.6792, + "mean_token_accuracy": 0.7999155551195145, + "num_tokens": 4783060716.0, + "step": 2300 + }, + { + "epoch": 2.2822480812082198, + "grad_norm": 0.16609027981758118, + "learning_rate": 7.234460974508933e-06, + "loss": 0.6912, + "mean_token_accuracy": 0.7969788700342179, + "num_tokens": 4793440305.0, + "step": 2305 + }, + { + "epoch": 2.287199801931171, + "grad_norm": 0.16849201917648315, + "learning_rate": 7.205389782329239e-06, + "loss": 0.6921, + "mean_token_accuracy": 0.7968606740236283, + "num_tokens": 4803826379.0, + "step": 2310 + }, + { + "epoch": 2.2921515226541223, + "grad_norm": 0.19212591648101807, + "learning_rate": 7.176476311719067e-06, + "loss": 0.6811, + "mean_token_accuracy": 0.7993761092424393, + "num_tokens": 4814231994.0, + "step": 2315 + }, + { + "epoch": 2.2971032433770735, + "grad_norm": 0.1726553738117218, + "learning_rate": 7.147721423984116e-06, + "loss": 0.6904, + "mean_token_accuracy": 0.7971784621477127, + "num_tokens": 4824582323.0, + "step": 2320 + }, + { + "epoch": 2.3020549641000247, + "grad_norm": 0.18482866883277893, + "learning_rate": 7.119125975706047e-06, + "loss": 0.6856, + "mean_token_accuracy": 0.79871164560318, + "num_tokens": 4835008318.0, + "step": 2325 + }, + { + "epoch": 2.307006684822976, + "grad_norm": 0.18125054240226746, + "learning_rate": 7.0906908187169745e-06, + "loss": 0.7065, + "mean_token_accuracy": 0.7926204591989517, + "num_tokens": 4845424240.0, + "step": 2330 + }, + { + 
"epoch": 2.3119584055459272, + "grad_norm": 0.1714540421962738, + "learning_rate": 7.06241680007407e-06, + "loss": 0.6977, + "mean_token_accuracy": 0.7951288729906082, + "num_tokens": 4855856505.0, + "step": 2335 + }, + { + "epoch": 2.3169101262688785, + "grad_norm": 0.1746184229850769, + "learning_rate": 7.034304762034352e-06, + "loss": 0.701, + "mean_token_accuracy": 0.794016820192337, + "num_tokens": 4866252261.0, + "step": 2340 + }, + { + "epoch": 2.3218618469918297, + "grad_norm": 0.16859765350818634, + "learning_rate": 7.0063555420295795e-06, + "loss": 0.6943, + "mean_token_accuracy": 0.7958215087652206, + "num_tokens": 4876680209.0, + "step": 2345 + }, + { + "epoch": 2.326813567714781, + "grad_norm": 0.18128663301467896, + "learning_rate": 6.978569972641317e-06, + "loss": 0.6926, + "mean_token_accuracy": 0.7966524481773376, + "num_tokens": 4887065937.0, + "step": 2350 + }, + { + "epoch": 2.3317652884377322, + "grad_norm": 0.1726839542388916, + "learning_rate": 6.950948881576125e-06, + "loss": 0.6989, + "mean_token_accuracy": 0.7947970867156983, + "num_tokens": 4897475666.0, + "step": 2355 + }, + { + "epoch": 2.3367170091606835, + "grad_norm": 0.16852743923664093, + "learning_rate": 6.923493091640905e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.7934527039527893, + "num_tokens": 4907847428.0, + "step": 2360 + }, + { + "epoch": 2.3416687298836347, + "grad_norm": 0.17816710472106934, + "learning_rate": 6.896203420718393e-06, + "loss": 0.6935, + "mean_token_accuracy": 0.7959372758865356, + "num_tokens": 4918224579.0, + "step": 2365 + }, + { + "epoch": 2.346620450606586, + "grad_norm": 0.1683845818042755, + "learning_rate": 6.869080681742788e-06, + "loss": 0.6862, + "mean_token_accuracy": 0.7984416544437408, + "num_tokens": 4928627391.0, + "step": 2370 + }, + { + "epoch": 2.351572171329537, + "grad_norm": 0.1718398779630661, + "learning_rate": 6.842125682675543e-06, + "loss": 0.6952, + "mean_token_accuracy": 0.795421975851059, + "num_tokens": 4939037179.0, + 
"step": 2375 + }, + { + "epoch": 2.356523892052488, + "grad_norm": 0.1837027370929718, + "learning_rate": 6.8153392264812955e-06, + "loss": 0.6866, + "mean_token_accuracy": 0.7978436768054962, + "num_tokens": 4949421774.0, + "step": 2380 + }, + { + "epoch": 2.3614756127754397, + "grad_norm": 1.9804030656814575, + "learning_rate": 6.7887221111039404e-06, + "loss": 0.7244, + "mean_token_accuracy": 0.7950782805681229, + "num_tokens": 4959814615.0, + "step": 2385 + }, + { + "epoch": 2.3664273334983905, + "grad_norm": 0.17864683270454407, + "learning_rate": 6.76227512944287e-06, + "loss": 0.6973, + "mean_token_accuracy": 0.7949712723493576, + "num_tokens": 4970222473.0, + "step": 2390 + }, + { + "epoch": 2.3713790542213418, + "grad_norm": 0.17018389701843262, + "learning_rate": 6.735999069329342e-06, + "loss": 0.6848, + "mean_token_accuracy": 0.79873506128788, + "num_tokens": 4980635075.0, + "step": 2395 + }, + { + "epoch": 2.376330774944293, + "grad_norm": 0.177368625998497, + "learning_rate": 6.709894713503031e-06, + "loss": 0.6904, + "mean_token_accuracy": 0.7968572080135345, + "num_tokens": 4991054472.0, + "step": 2400 + }, + { + "epoch": 2.3812824956672443, + "grad_norm": 0.1928243339061737, + "learning_rate": 6.68396283958869e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.7964109271764755, + "num_tokens": 5001450135.0, + "step": 2405 + }, + { + "epoch": 2.3862342163901955, + "grad_norm": 0.1850307583808899, + "learning_rate": 6.658204220072993e-06, + "loss": 0.7007, + "mean_token_accuracy": 0.7958345085382461, + "num_tokens": 5011862584.0, + "step": 2410 + }, + { + "epoch": 2.3911859371131468, + "grad_norm": 0.166819766163826, + "learning_rate": 6.63261962228153e-06, + "loss": 0.7039, + "mean_token_accuracy": 0.7932611912488937, + "num_tokens": 5022262611.0, + "step": 2415 + }, + { + "epoch": 2.396137657836098, + "grad_norm": 0.16802161931991577, + "learning_rate": 6.607209808355945e-06, + "loss": 0.6956, + "mean_token_accuracy": 0.795117124915123, + 
"num_tokens": 5032675845.0, + "step": 2420 + }, + { + "epoch": 2.4010893785590492, + "grad_norm": 0.16795934736728668, + "learning_rate": 6.5819755352312245e-06, + "loss": 0.6872, + "mean_token_accuracy": 0.7978654116392135, + "num_tokens": 5043069802.0, + "step": 2425 + }, + { + "epoch": 2.4060410992820005, + "grad_norm": 0.17665541172027588, + "learning_rate": 6.556917554613157e-06, + "loss": 0.6886, + "mean_token_accuracy": 0.7966871380805969, + "num_tokens": 5053447114.0, + "step": 2430 + }, + { + "epoch": 2.4109928200049517, + "grad_norm": 0.17774540185928345, + "learning_rate": 6.53203661295595e-06, + "loss": 0.6903, + "mean_token_accuracy": 0.7966781735420227, + "num_tokens": 5063846810.0, + "step": 2435 + }, + { + "epoch": 2.415944540727903, + "grad_norm": 0.1691160798072815, + "learning_rate": 6.50733345143997e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.7978695631027222, + "num_tokens": 5074257776.0, + "step": 2440 + }, + { + "epoch": 2.4208962614508542, + "grad_norm": 0.18364639580249786, + "learning_rate": 6.4828088059496794e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.7960184782743454, + "num_tokens": 5084664363.0, + "step": 2445 + }, + { + "epoch": 2.4258479821738055, + "grad_norm": 0.18796691298484802, + "learning_rate": 6.458463407051722e-06, + "loss": 0.692, + "mean_token_accuracy": 0.7963404774665832, + "num_tokens": 5095062991.0, + "step": 2450 + }, + { + "epoch": 2.4307997028967567, + "grad_norm": 0.200786754488945, + "learning_rate": 6.434297979973138e-06, + "loss": 0.6953, + "mean_token_accuracy": 0.7957387864589691, + "num_tokens": 5105475938.0, + "step": 2455 + }, + { + "epoch": 2.435751423619708, + "grad_norm": 0.18357941508293152, + "learning_rate": 6.41031324457978e-06, + "loss": 0.7115, + "mean_token_accuracy": 0.793591657280922, + "num_tokens": 5115888461.0, + "step": 2460 + }, + { + "epoch": 2.4407031443426592, + "grad_norm": 0.17939510941505432, + "learning_rate": 6.386509915354858e-06, + "loss": 0.7022, + 
"mean_token_accuracy": 0.79359290599823, + "num_tokens": 5126273976.0, + "step": 2465 + }, + { + "epoch": 2.4456548650656105, + "grad_norm": 0.17583215236663818, + "learning_rate": 6.362888701377661e-06, + "loss": 0.6951, + "mean_token_accuracy": 0.7956161171197891, + "num_tokens": 5136658720.0, + "step": 2470 + }, + { + "epoch": 2.4506065857885613, + "grad_norm": 0.1758909374475479, + "learning_rate": 6.3394503063024305e-06, + "loss": 0.6917, + "mean_token_accuracy": 0.7964052438735962, + "num_tokens": 5147086462.0, + "step": 2475 + }, + { + "epoch": 2.455558306511513, + "grad_norm": 0.17750118672847748, + "learning_rate": 6.3161954283374026e-06, + "loss": 0.6835, + "mean_token_accuracy": 0.7991193652153015, + "num_tokens": 5157489366.0, + "step": 2480 + }, + { + "epoch": 2.4605100272344638, + "grad_norm": 0.17545007169246674, + "learning_rate": 6.293124760224007e-06, + "loss": 0.6899, + "mean_token_accuracy": 0.7971043676137924, + "num_tokens": 5167911943.0, + "step": 2485 + }, + { + "epoch": 2.465461747957415, + "grad_norm": 0.411289781332016, + "learning_rate": 6.270238989216233e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.7979872524738312, + "num_tokens": 5178337540.0, + "step": 2490 + }, + { + "epoch": 2.4704134686803663, + "grad_norm": 0.1767181158065796, + "learning_rate": 6.247538797060153e-06, + "loss": 0.6841, + "mean_token_accuracy": 0.7989832788705826, + "num_tokens": 5188737081.0, + "step": 2495 + }, + { + "epoch": 2.4753651894033175, + "grad_norm": 0.18555793166160583, + "learning_rate": 6.225024859973614e-06, + "loss": 0.6948, + "mean_token_accuracy": 0.7958449482917785, + "num_tokens": 5199147597.0, + "step": 2500 + }, + { + "epoch": 2.4803169101262688, + "grad_norm": 0.17499588429927826, + "learning_rate": 6.202697848626098e-06, + "loss": 0.6804, + "mean_token_accuracy": 0.7998959481716156, + "num_tokens": 5209553370.0, + "step": 2505 + }, + { + "epoch": 2.48526863084922, + "grad_norm": 0.1752438247203827, + "learning_rate": 
6.180558428118748e-06, + "loss": 0.7002, + "mean_token_accuracy": 0.794254133105278, + "num_tokens": 5219967341.0, + "step": 2510 + }, + { + "epoch": 2.4902203515721713, + "grad_norm": 0.18336161971092224, + "learning_rate": 6.158607257964531e-06, + "loss": 0.695, + "mean_token_accuracy": 0.795548141002655, + "num_tokens": 5230343704.0, + "step": 2515 + }, + { + "epoch": 2.4951720722951225, + "grad_norm": 0.1759299337863922, + "learning_rate": 6.136844992068629e-06, + "loss": 0.6811, + "mean_token_accuracy": 0.7996001183986664, + "num_tokens": 5240752756.0, + "step": 2520 + }, + { + "epoch": 2.5001237930180737, + "grad_norm": 0.1809236705303192, + "learning_rate": 6.115272278708928e-06, + "loss": 0.6802, + "mean_token_accuracy": 0.7996293097734452, + "num_tokens": 5251165282.0, + "step": 2525 + }, + { + "epoch": 2.505075513741025, + "grad_norm": 0.17464205622673035, + "learning_rate": 6.093889760516718e-06, + "loss": 0.6859, + "mean_token_accuracy": 0.7979602038860321, + "num_tokens": 5261564857.0, + "step": 2530 + }, + { + "epoch": 2.5100272344639762, + "grad_norm": 0.19239003956317902, + "learning_rate": 6.072698074457556e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7947456300258636, + "num_tokens": 5271950567.0, + "step": 2535 + }, + { + "epoch": 2.5149789551869275, + "grad_norm": 0.1785266101360321, + "learning_rate": 6.051697851812283e-06, + "loss": 0.6952, + "mean_token_accuracy": 0.7954322129487992, + "num_tokens": 5282342097.0, + "step": 2540 + }, + { + "epoch": 2.5199306759098787, + "grad_norm": 0.1663612723350525, + "learning_rate": 6.030889718158217e-06, + "loss": 0.6883, + "mean_token_accuracy": 0.799082663655281, + "num_tokens": 5292769240.0, + "step": 2545 + }, + { + "epoch": 2.52488239663283, + "grad_norm": 0.17553378641605377, + "learning_rate": 6.010274293350524e-06, + "loss": 0.7082, + "mean_token_accuracy": 0.7952435702085495, + "num_tokens": 5303167246.0, + "step": 2550 + }, + { + "epoch": 2.5298341173557812, + "grad_norm": 
0.17817820608615875, + "learning_rate": 5.9898521915037575e-06, + "loss": 0.6898, + "mean_token_accuracy": 0.7968169569969177, + "num_tokens": 5313559465.0, + "step": 2555 + }, + { + "epoch": 2.5347858380787325, + "grad_norm": 0.17062614858150482, + "learning_rate": 5.969624020973543e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.7981322228908538, + "num_tokens": 5323956814.0, + "step": 2560 + }, + { + "epoch": 2.5397375588016837, + "grad_norm": 0.16806785762310028, + "learning_rate": 5.949590384338486e-06, + "loss": 0.695, + "mean_token_accuracy": 0.7956575661897659, + "num_tokens": 5334313471.0, + "step": 2565 + }, + { + "epoch": 2.544689279524635, + "grad_norm": 0.6380743980407715, + "learning_rate": 5.929751878382195e-06, + "loss": 0.6903, + "mean_token_accuracy": 0.7971752375364304, + "num_tokens": 5344703937.0, + "step": 2570 + }, + { + "epoch": 2.549641000247586, + "grad_norm": 0.18080659210681915, + "learning_rate": 5.910109094075522e-06, + "loss": 0.693, + "mean_token_accuracy": 0.7960569471120834, + "num_tokens": 5355109953.0, + "step": 2575 + }, + { + "epoch": 2.554592720970537, + "grad_norm": 0.17711760103702545, + "learning_rate": 5.890662616558946e-06, + "loss": 0.688, + "mean_token_accuracy": 0.7974663138389587, + "num_tokens": 5365491606.0, + "step": 2580 + }, + { + "epoch": 2.5595444416934887, + "grad_norm": 0.17866727709770203, + "learning_rate": 5.87141302512515e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.7948154181241989, + "num_tokens": 5375911835.0, + "step": 2585 + }, + { + "epoch": 2.5644961624164395, + "grad_norm": 0.1814994066953659, + "learning_rate": 5.852360893201762e-06, + "loss": 0.6932, + "mean_token_accuracy": 0.7965142160654068, + "num_tokens": 5386334595.0, + "step": 2590 + }, + { + "epoch": 2.569447883139391, + "grad_norm": 0.16615930199623108, + "learning_rate": 5.833506788334271e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.79442700445652, + "num_tokens": 5396742866.0, + "step": 2595 + }, + { + "epoch": 
2.574399603862342, + "grad_norm": 9.703447341918945, + "learning_rate": 5.814851272169123e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7948618471622467, + "num_tokens": 5407147677.0, + "step": 2600 + }, + { + "epoch": 2.5793513245852933, + "grad_norm": 0.16909906268119812, + "learning_rate": 5.796394900436989e-06, + "loss": 0.6898, + "mean_token_accuracy": 0.7968372017145157, + "num_tokens": 5417532745.0, + "step": 2605 + }, + { + "epoch": 2.5843030453082445, + "grad_norm": 0.17828305065631866, + "learning_rate": 5.7781382229362125e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.7956779956817627, + "num_tokens": 5427899195.0, + "step": 2610 + }, + { + "epoch": 2.5892547660311958, + "grad_norm": 0.18194948136806488, + "learning_rate": 5.760081783516425e-06, + "loss": 0.6924, + "mean_token_accuracy": 0.7965070128440856, + "num_tokens": 5438308119.0, + "step": 2615 + }, + { + "epoch": 2.594206486754147, + "grad_norm": 0.16723376512527466, + "learning_rate": 5.742226120062353e-06, + "loss": 0.6869, + "mean_token_accuracy": 0.7979160338640213, + "num_tokens": 5448715457.0, + "step": 2620 + }, + { + "epoch": 2.5991582074770982, + "grad_norm": 0.1706143170595169, + "learning_rate": 5.724571764477793e-06, + "loss": 0.7131, + "mean_token_accuracy": 0.7919691979885102, + "num_tokens": 5459086857.0, + "step": 2625 + }, + { + "epoch": 2.6041099282000495, + "grad_norm": 0.19008710980415344, + "learning_rate": 5.707119242669762e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.794280007481575, + "num_tokens": 5469483164.0, + "step": 2630 + }, + { + "epoch": 2.6090616489230007, + "grad_norm": 0.1769382357597351, + "learning_rate": 5.689869074532833e-06, + "loss": 0.6939, + "mean_token_accuracy": 0.7961185723543167, + "num_tokens": 5479864004.0, + "step": 2635 + }, + { + "epoch": 2.614013369645952, + "grad_norm": 0.2750358581542969, + "learning_rate": 5.6728217739336525e-06, + "loss": 0.69, + "mean_token_accuracy": 0.7972733825445175, + "num_tokens": 5490263383.0, + "step": 
2640 + }, + { + "epoch": 2.6189650903689032, + "grad_norm": 0.17598873376846313, + "learning_rate": 5.655977848695634e-06, + "loss": 0.6925, + "mean_token_accuracy": 0.7967590630054474, + "num_tokens": 5500650861.0, + "step": 2645 + }, + { + "epoch": 2.6239168110918545, + "grad_norm": 3.034182071685791, + "learning_rate": 5.639337800583815e-06, + "loss": 0.6894, + "mean_token_accuracy": 0.7975974589586258, + "num_tokens": 5511002438.0, + "step": 2650 + }, + { + "epoch": 2.6288685318148057, + "grad_norm": 0.17671412229537964, + "learning_rate": 5.622902125289927e-06, + "loss": 0.6939, + "mean_token_accuracy": 0.7963446021080017, + "num_tokens": 5521417165.0, + "step": 2655 + }, + { + "epoch": 2.633820252537757, + "grad_norm": 0.16627521812915802, + "learning_rate": 5.606671312417627e-06, + "loss": 0.6949, + "mean_token_accuracy": 0.7957168728113174, + "num_tokens": 5531803294.0, + "step": 2660 + }, + { + "epoch": 2.6387719732607082, + "grad_norm": 0.1676947921514511, + "learning_rate": 5.590645845467902e-06, + "loss": 0.6887, + "mean_token_accuracy": 0.7975447326898575, + "num_tokens": 5542196226.0, + "step": 2665 + }, + { + "epoch": 2.6437236939836595, + "grad_norm": 0.16450296342372894, + "learning_rate": 5.574826201824675e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7963021039962769, + "num_tokens": 5552607704.0, + "step": 2670 + }, + { + "epoch": 2.6486754147066103, + "grad_norm": 0.16522452235221863, + "learning_rate": 5.559212852740585e-06, + "loss": 0.6911, + "mean_token_accuracy": 0.7966877698898316, + "num_tokens": 5563008514.0, + "step": 2675 + }, + { + "epoch": 2.653627135429562, + "grad_norm": 0.17141982913017273, + "learning_rate": 5.543806263322942e-06, + "loss": 0.6764, + "mean_token_accuracy": 0.8007991701364517, + "num_tokens": 5573414557.0, + "step": 2680 + }, + { + "epoch": 2.6585788561525128, + "grad_norm": 0.17562955617904663, + "learning_rate": 5.5286068925198765e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.7948395520448684, + 
"num_tokens": 5583823220.0, + "step": 2685 + }, + { + "epoch": 2.6635305768754645, + "grad_norm": 0.17200249433517456, + "learning_rate": 5.513615193106667e-06, + "loss": 0.6858, + "mean_token_accuracy": 0.7980019062757492, + "num_tokens": 5594181697.0, + "step": 2690 + }, + { + "epoch": 2.6684822975984153, + "grad_norm": 0.17699338495731354, + "learning_rate": 5.4988316116722555e-06, + "loss": 0.7016, + "mean_token_accuracy": 0.7940513700246811, + "num_tokens": 5604576881.0, + "step": 2695 + }, + { + "epoch": 2.6734340183213665, + "grad_norm": 0.16659849882125854, + "learning_rate": 5.484256588605935e-06, + "loss": 0.6883, + "mean_token_accuracy": 0.797638863325119, + "num_tokens": 5614973477.0, + "step": 2700 + }, + { + "epoch": 2.6783857390443178, + "grad_norm": 0.1654004156589508, + "learning_rate": 5.469890558084242e-06, + "loss": 0.6812, + "mean_token_accuracy": 0.7997693717479706, + "num_tokens": 5625405892.0, + "step": 2705 + }, + { + "epoch": 2.683337459767269, + "grad_norm": 0.16663256287574768, + "learning_rate": 5.455733948058014e-06, + "loss": 0.6762, + "mean_token_accuracy": 0.8006632030010223, + "num_tokens": 5635805330.0, + "step": 2710 + }, + { + "epoch": 2.6882891804902203, + "grad_norm": 0.1796761006116867, + "learning_rate": 5.441787180239646e-06, + "loss": 0.6917, + "mean_token_accuracy": 0.7965221852064133, + "num_tokens": 5646169943.0, + "step": 2715 + }, + { + "epoch": 2.6932409012131715, + "grad_norm": 0.1693544238805771, + "learning_rate": 5.428050670090523e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.7954508185386657, + "num_tokens": 5656576200.0, + "step": 2720 + }, + { + "epoch": 2.6981926219361227, + "grad_norm": 0.1761597841978073, + "learning_rate": 5.414524826808656e-06, + "loss": 0.6924, + "mean_token_accuracy": 0.7965936124324798, + "num_tokens": 5666959999.0, + "step": 2725 + }, + { + "epoch": 2.703144342659074, + "grad_norm": 0.17307810485363007, + "learning_rate": 5.401210053316473e-06, + "loss": 0.6841, + 
"mean_token_accuracy": 0.7984808474779129, + "num_tokens": 5677341800.0, + "step": 2730 + }, + { + "epoch": 2.7080960633820252, + "grad_norm": 0.17702001333236694, + "learning_rate": 5.388106746248834e-06, + "loss": 0.6888, + "mean_token_accuracy": 0.7974687486886978, + "num_tokens": 5687738690.0, + "step": 2735 + }, + { + "epoch": 2.7130477841049765, + "grad_norm": 0.1849050223827362, + "learning_rate": 5.3752152959412085e-06, + "loss": 0.6864, + "mean_token_accuracy": 0.7980557024478913, + "num_tokens": 5698119448.0, + "step": 2740 + }, + { + "epoch": 2.7179995048279277, + "grad_norm": 0.18502669036388397, + "learning_rate": 5.3625360864180435e-06, + "loss": 0.692, + "mean_token_accuracy": 0.7966047704219819, + "num_tokens": 5708509953.0, + "step": 2745 + }, + { + "epoch": 2.722951225550879, + "grad_norm": 0.17855456471443176, + "learning_rate": 5.350069495381334e-06, + "loss": 0.6839, + "mean_token_accuracy": 0.7987709194421768, + "num_tokens": 5718908781.0, + "step": 2750 + }, + { + "epoch": 2.7279029462738302, + "grad_norm": 0.16434764862060547, + "learning_rate": 5.33781589419936e-06, + "loss": 0.6972, + "mean_token_accuracy": 0.7953177213668823, + "num_tokens": 5729301053.0, + "step": 2755 + }, + { + "epoch": 2.7328546669967815, + "grad_norm": 0.18237145245075226, + "learning_rate": 5.325775647895633e-06, + "loss": 0.6962, + "mean_token_accuracy": 0.7954672813415528, + "num_tokens": 5739704125.0, + "step": 2760 + }, + { + "epoch": 2.7378063877197327, + "grad_norm": 0.1780317723751068, + "learning_rate": 5.313949115138022e-06, + "loss": 0.6883, + "mean_token_accuracy": 0.7975771814584732, + "num_tokens": 5750099908.0, + "step": 2765 + }, + { + "epoch": 2.742758108442684, + "grad_norm": 0.17902548611164093, + "learning_rate": 5.302336648228057e-06, + "loss": 0.6927, + "mean_token_accuracy": 0.7963612645864486, + "num_tokens": 5760493836.0, + "step": 2770 + }, + { + "epoch": 2.747709829165635, + "grad_norm": 0.17271076142787933, + "learning_rate": 
5.290938593090451e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.7966414868831635, + "num_tokens": 5770888761.0, + "step": 2775 + }, + { + "epoch": 2.752661549888586, + "grad_norm": 0.18384882807731628, + "learning_rate": 5.279755289262787e-06, + "loss": 0.6833, + "mean_token_accuracy": 0.7987092226743698, + "num_tokens": 5781269276.0, + "step": 2780 + }, + { + "epoch": 2.7576132706115377, + "grad_norm": 0.1778169870376587, + "learning_rate": 5.268787069885398e-06, + "loss": 0.6823, + "mean_token_accuracy": 0.7990261852741242, + "num_tokens": 5791688485.0, + "step": 2785 + }, + { + "epoch": 2.7625649913344885, + "grad_norm": 0.16034454107284546, + "learning_rate": 5.258034261691453e-06, + "loss": 0.6815, + "mean_token_accuracy": 0.7992457032203675, + "num_tokens": 5802114157.0, + "step": 2790 + }, + { + "epoch": 2.76751671205744, + "grad_norm": 0.16529497504234314, + "learning_rate": 5.2474971849972255e-06, + "loss": 0.6847, + "mean_token_accuracy": 0.7985988527536392, + "num_tokens": 5812530733.0, + "step": 2795 + }, + { + "epoch": 2.772468432780391, + "grad_norm": 0.16325139999389648, + "learning_rate": 5.237176153692536e-06, + "loss": 0.6901, + "mean_token_accuracy": 0.796726056933403, + "num_tokens": 5822937339.0, + "step": 2800 + }, + { + "epoch": 2.7774201535033423, + "grad_norm": 0.16805361211299896, + "learning_rate": 5.2270714752314165e-06, + "loss": 0.6956, + "mean_token_accuracy": 0.7955878287553787, + "num_tokens": 5833351943.0, + "step": 2805 + }, + { + "epoch": 2.7823718742262935, + "grad_norm": 0.1664312332868576, + "learning_rate": 5.217183450622946e-06, + "loss": 0.6949, + "mean_token_accuracy": 0.7956264704465866, + "num_tokens": 5843761929.0, + "step": 2810 + }, + { + "epoch": 2.7873235949492448, + "grad_norm": 0.17172367870807648, + "learning_rate": 5.207512374422286e-06, + "loss": 0.6951, + "mean_token_accuracy": 0.7955891370773316, + "num_tokens": 5854173653.0, + "step": 2815 + }, + { + "epoch": 2.792275315672196, + "grad_norm": 
0.1718735694885254, + "learning_rate": 5.198058534721901e-06, + "loss": 0.6852, + "mean_token_accuracy": 0.7985197186470032, + "num_tokens": 5864593712.0, + "step": 2820 + }, + { + "epoch": 2.7972270363951472, + "grad_norm": 0.16640879213809967, + "learning_rate": 5.188822213142984e-06, + "loss": 0.7016, + "mean_token_accuracy": 0.7935741305351257, + "num_tokens": 5875001155.0, + "step": 2825 + }, + { + "epoch": 2.8021787571180985, + "grad_norm": 0.17045748233795166, + "learning_rate": 5.179803684827059e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.795042735338211, + "num_tokens": 5885336862.0, + "step": 2830 + }, + { + "epoch": 2.8071304778410497, + "grad_norm": 0.16793885827064514, + "learning_rate": 5.1710032184277946e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.7980520844459533, + "num_tokens": 5895740091.0, + "step": 2835 + }, + { + "epoch": 2.812082198564001, + "grad_norm": 0.1670074313879013, + "learning_rate": 5.1624210761029904e-06, + "loss": 0.6815, + "mean_token_accuracy": 0.7995127648115158, + "num_tokens": 5906130721.0, + "step": 2840 + }, + { + "epoch": 2.8170339192869522, + "grad_norm": 1.144707441329956, + "learning_rate": 5.154057513506772e-06, + "loss": 0.6905, + "mean_token_accuracy": 0.7972240179777146, + "num_tokens": 5916514276.0, + "step": 2845 + }, + { + "epoch": 2.8219856400099035, + "grad_norm": 0.16775400936603546, + "learning_rate": 5.145912779781981e-06, + "loss": 0.688, + "mean_token_accuracy": 0.7972176611423493, + "num_tokens": 5926925572.0, + "step": 2850 + }, + { + "epoch": 2.8269373607328547, + "grad_norm": 0.16344526410102844, + "learning_rate": 5.137987117552745e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.7939132809638977, + "num_tokens": 5937352792.0, + "step": 2855 + }, + { + "epoch": 2.831889081455806, + "grad_norm": 0.16780716180801392, + "learning_rate": 5.130280762917252e-06, + "loss": 0.7046, + "mean_token_accuracy": 0.7929495930671692, + "num_tokens": 5947785733.0, + "step": 2860 + }, + { + "epoch": 
2.8368408021787572, + "grad_norm": 0.16962361335754395, + "learning_rate": 5.12279394544072e-06, + "loss": 0.6894, + "mean_token_accuracy": 0.7973616749048233, + "num_tokens": 5958207857.0, + "step": 2865 + }, + { + "epoch": 2.8417925229017085, + "grad_norm": 0.17526820302009583, + "learning_rate": 5.11552688814856e-06, + "loss": 0.6918, + "mean_token_accuracy": 0.7963843405246734, + "num_tokens": 5968621626.0, + "step": 2870 + }, + { + "epoch": 2.8467442436246597, + "grad_norm": 0.17461900413036346, + "learning_rate": 5.10847980751972e-06, + "loss": 0.6854, + "mean_token_accuracy": 0.7982411950826644, + "num_tokens": 5979010182.0, + "step": 2875 + }, + { + "epoch": 2.851695964347611, + "grad_norm": 0.17215755581855774, + "learning_rate": 5.101652913480259e-06, + "loss": 0.694, + "mean_token_accuracy": 0.7959240406751633, + "num_tokens": 5989447180.0, + "step": 2880 + }, + { + "epoch": 2.8566476850705618, + "grad_norm": 0.17169933021068573, + "learning_rate": 5.095046409397067e-06, + "loss": 0.6948, + "mean_token_accuracy": 0.7955931961536408, + "num_tokens": 5999857552.0, + "step": 2885 + }, + { + "epoch": 2.8615994057935135, + "grad_norm": 0.17529842257499695, + "learning_rate": 5.088660492071825e-06, + "loss": 0.6841, + "mean_token_accuracy": 0.798897260427475, + "num_tokens": 6010272989.0, + "step": 2890 + }, + { + "epoch": 2.8665511265164643, + "grad_norm": 0.1800571084022522, + "learning_rate": 5.0824953517351405e-06, + "loss": 0.6984, + "mean_token_accuracy": 0.7946085125207901, + "num_tokens": 6020661047.0, + "step": 2895 + }, + { + "epoch": 2.871502847239416, + "grad_norm": 0.17649812996387482, + "learning_rate": 5.076551172040875e-06, + "loss": 0.6959, + "mean_token_accuracy": 0.7954695582389831, + "num_tokens": 6031045781.0, + "step": 2900 + }, + { + "epoch": 2.8764545679623668, + "grad_norm": 0.16285355389118195, + "learning_rate": 5.070828130060673e-06, + "loss": 0.6855, + "mean_token_accuracy": 0.798177295923233, + "num_tokens": 6041474006.0, + 
"step": 2905 + }, + { + "epoch": 2.881406288685318, + "grad_norm": 0.16973961889743805, + "learning_rate": 5.065326396278693e-06, + "loss": 0.6925, + "mean_token_accuracy": 0.796346840262413, + "num_tokens": 6051837040.0, + "step": 2910 + }, + { + "epoch": 2.8863580094082693, + "grad_norm": 0.16173960268497467, + "learning_rate": 5.0600461345865285e-06, + "loss": 0.6896, + "mean_token_accuracy": 0.7969261765480041, + "num_tokens": 6062203773.0, + "step": 2915 + }, + { + "epoch": 2.8913097301312205, + "grad_norm": 0.1653313934803009, + "learning_rate": 5.054987502278314e-06, + "loss": 0.6951, + "mean_token_accuracy": 0.7956225961446762, + "num_tokens": 6072598868.0, + "step": 2920 + }, + { + "epoch": 2.8962614508541717, + "grad_norm": 0.17829228937625885, + "learning_rate": 5.050150650046057e-06, + "loss": 0.6903, + "mean_token_accuracy": 0.7967943876981736, + "num_tokens": 6082977828.0, + "step": 2925 + }, + { + "epoch": 2.901213171577123, + "grad_norm": 0.17155008018016815, + "learning_rate": 5.045535721975139e-06, + "loss": 0.6914, + "mean_token_accuracy": 0.7963936567306519, + "num_tokens": 6093369909.0, + "step": 2930 + }, + { + "epoch": 2.9061648923000742, + "grad_norm": 0.17488369345664978, + "learning_rate": 5.04114285554002e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.7955116271972656, + "num_tokens": 6103768699.0, + "step": 2935 + }, + { + "epoch": 2.9111166130230255, + "grad_norm": 0.16874125599861145, + "learning_rate": 5.036972181600154e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.7975889980793, + "num_tokens": 6114197901.0, + "step": 2940 + }, + { + "epoch": 2.9160683337459767, + "grad_norm": 0.1702135056257248, + "learning_rate": 5.033023824396082e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.7945420503616333, + "num_tokens": 6124566479.0, + "step": 2945 + }, + { + "epoch": 2.921020054468928, + "grad_norm": 0.16955436766147614, + "learning_rate": 5.0292979015457325e-06, + "loss": 0.6916, + "mean_token_accuracy": 0.7963980317115784, + 
"num_tokens": 6134987535.0, + "step": 2950 + }, + { + "epoch": 2.9259717751918792, + "grad_norm": 0.1785793900489807, + "learning_rate": 5.025794524040922e-06, + "loss": 0.6845, + "mean_token_accuracy": 0.7987745195627213, + "num_tokens": 6145413537.0, + "step": 2955 + }, + { + "epoch": 2.9309234959148305, + "grad_norm": 0.17144745588302612, + "learning_rate": 5.0225137962440465e-06, + "loss": 0.697, + "mean_token_accuracy": 0.7954545766115189, + "num_tokens": 6155794207.0, + "step": 2960 + }, + { + "epoch": 2.9358752166377817, + "grad_norm": 0.18841952085494995, + "learning_rate": 5.019455815884967e-06, + "loss": 0.6879, + "mean_token_accuracy": 0.7973819732666015, + "num_tokens": 6166202934.0, + "step": 2965 + }, + { + "epoch": 2.940826937360733, + "grad_norm": 0.17883515357971191, + "learning_rate": 5.016620674058109e-06, + "loss": 0.6978, + "mean_token_accuracy": 0.7950917452573776, + "num_tokens": 6176607746.0, + "step": 2970 + }, + { + "epoch": 2.945778658083684, + "grad_norm": 0.17126841843128204, + "learning_rate": 5.014008455219739e-06, + "loss": 0.694, + "mean_token_accuracy": 0.795988991856575, + "num_tokens": 6187015848.0, + "step": 2975 + }, + { + "epoch": 2.9507303788066355, + "grad_norm": 0.17282164096832275, + "learning_rate": 5.011619237185454e-06, + "loss": 0.7228, + "mean_token_accuracy": 0.7966022878885269, + "num_tokens": 6197382990.0, + "step": 2980 + }, + { + "epoch": 2.9556820995295867, + "grad_norm": 0.1636972278356552, + "learning_rate": 5.0094530911278624e-06, + "loss": 0.6867, + "mean_token_accuracy": 0.7977098762989044, + "num_tokens": 6207775709.0, + "step": 2985 + }, + { + "epoch": 2.9606338202525375, + "grad_norm": 0.1713937222957611, + "learning_rate": 5.007510081574464e-06, + "loss": 0.6793, + "mean_token_accuracy": 0.7998199731111526, + "num_tokens": 6218181373.0, + "step": 2990 + }, + { + "epoch": 2.965585540975489, + "grad_norm": 0.17157921195030212, + "learning_rate": 5.005790266405726e-06, + "loss": 0.6885, + 
"mean_token_accuracy": 0.7975385636091232, + "num_tokens": 6228604799.0, + "step": 2995 + }, + { + "epoch": 2.97053726169844, + "grad_norm": 0.17099162936210632, + "learning_rate": 5.004293696853359e-06, + "loss": 0.6769, + "mean_token_accuracy": 0.8003452003002167, + "num_tokens": 6238963168.0, + "step": 3000 + }, + { + "epoch": 2.9754889824213917, + "grad_norm": 0.16633567214012146, + "learning_rate": 5.003020417498797e-06, + "loss": 0.6817, + "mean_token_accuracy": 0.7993758976459503, + "num_tokens": 6249324175.0, + "step": 3005 + }, + { + "epoch": 2.9804407031443425, + "grad_norm": 0.1645272821187973, + "learning_rate": 5.001970466271859e-06, + "loss": 0.6825, + "mean_token_accuracy": 0.7992853671312332, + "num_tokens": 6259719042.0, + "step": 3010 + }, + { + "epoch": 2.9853924238672938, + "grad_norm": 0.16467167437076569, + "learning_rate": 5.001143874449626e-06, + "loss": 0.6878, + "mean_token_accuracy": 0.7971955716609955, + "num_tokens": 6270117034.0, + "step": 3015 + }, + { + "epoch": 2.990344144590245, + "grad_norm": 0.1758957952260971, + "learning_rate": 5.000540666655511e-06, + "loss": 0.6922, + "mean_token_accuracy": 0.7962426990270615, + "num_tokens": 6280527402.0, + "step": 3020 + }, + { + "epoch": 2.9952958653131962, + "grad_norm": 0.16724340617656708, + "learning_rate": 5.000160860858517e-06, + "loss": 0.6957, + "mean_token_accuracy": 0.7955468863248825, + "num_tokens": 6290951944.0, + "step": 3025 + }, + { + "epoch": 3.0, + "grad_norm": 0.1897646188735962, + "learning_rate": 5.00000446837271e-06, + "loss": 0.6911, + "mean_token_accuracy": 0.7958593180305079, + "num_tokens": 6300851025.0, + "step": 3030 + } + ], + "logging_steps": 5, + "max_steps": 3030, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 450, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } 
+ }, + "total_flos": 1.3290528460679152e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..f15d907 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71160e82654feeadf7cf9fc628b07cb2ada8c311cee0b0394bfdad3c074b23ce +size 6417