commit ead9e3c533f0c27d1916a8fcc1b1acad583d0181 Author: ModelHub XC Date: Fri May 22 09:40:17 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: BSC-LT/salamandra-7b-instruct-tools-16k Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..ac7b0fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,41 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text 
+model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +model-00004-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +model-00001-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +salamandra-7b-tools-16k-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text diff --git a/GGUF_README.md b/GGUF_README.md new file mode 100644 index 0000000..d507e98 --- /dev/null +++ b/GGUF_README.md @@ -0,0 +1,210 @@ +--- +metrics: + format: gguf + method: gguf + quantization_type: Q4_K_M + context_length: 2048 +tags: +- quantization +name: GGUF +description: GGUF quantization using llama.cpp for efficient CPU and GPU inference +intended_use: Efficient inference on CPU and GPU with llama.cpp +limitations: Requires llama.cpp conversion tools and specific model architectures +citations: +- https://github.com/ggml-org/llama.cpp +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. 
+ +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..8a8744d --- /dev/null +++ b/README.md @@ -0,0 +1,148 @@ +--- +license: apache-2.0 +library_name: transformers +pipeline_tag: text-generation +--- + +> [!WARNING] +> **WARNING:** This is a language model that has undergone instruction tuning for conversational settings that exploit function calling capabilities. It has not been aligned with human preferences. As a result, it may generate outputs that are inappropriate, misleading, biased, or unsafe. These risks can be mitigated through additional post-training stages, which is strongly recommended before deployment in any production system, especially for high-stakes applications. +> +### How to use +``` +from datetime import datetime +from transformers import AutoTokenizer, AutoModelForCausalLM +import transformers +import torch + +model_id = "BSC-LT/salamandra-7b-instruct" + +text = "What is the weather like in Paris today?" 
+ +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained( + model_id, + device_map="auto", + torch_dtype=torch.bfloat16 + ) + +message = [ { "role": "user", "content": text } ] + +tools = [{ + "type": "function", + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + } +}] + + +prompt = tokenizer.apply_chat_template( + message, + tokenize=False, + add_generation_prompt=True, + tools=tools +) + +inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt") +outputs = model.generate(input_ids=inputs.to(model.device), max_new_tokens=1000) + +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` + +#### Output: +```text + +{"name": "get_weather", "arguments": {"location": "Paris, France"}} + +``` + + +### Deploy with vllm +**Deploy the model using vllm docker image.** +``` +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 80:80 \ + vllm/vllm-openai:latest \ + --model BSC-LT/salamandra-7b-instruct-tools \ + --enable-auto-tool-choice \ + --tool-call-parser hermes \ + --max_model_len 8196 \ + --port 80 +``` + +**Then use it with openai api** +``` +pip install openai +``` +``` +from openai import OpenAI +client = OpenAI( + base_url="http://localhost:80/v1/", + api_key="hf_xxxx" + ) + +models = client.models.list() +model = models.data[0].id + +system_message = "" +messages = [{ "role": "system", "content": system_message}] if system_message else [] +messages.append( {"role":"user", "content": "What is the weather like in Paris today?"}) +print(messages) +chat_completion = client.chat.completions.create( + model=model, + tools=tools, + messages=messages, + 
stream=False, + max_tokens=1000, + temperature=0.1, + frequency_penalty=0.2, +) + +msg = chat_completion.choices[0].message + +# --- HANDLE TOOL CALL OR NORMAL CONTENT --- + +if not getattr(msg, "tool_calls", None): + # Normal assistant message + print(msg.content) + + messages.append({ + "role": "assistant", + "content": msg.content + }) + +else: + # Assistant tool call message + print(msg.tool_calls) + + messages.append({"role": "assistant", "tool_calls": msg.tool_calls}) + + # --- Fake tool execution example --- + tool_call = msg.tool_calls[0] + # Example: handle the get_weather tool + if tool_call.function.name == "get_weather": + # Fake tool result (this would come from your actual backend) + fake_tool_result = '{"temperature": 18, "unit": "C", "description": "Partly cloudy in Paris"}' + + # Append the tool result message so the model can use it in the next turn + messages.append({ + "role": "tool", + "tool_call_id": tool_call.id, + "name": tool_call.function.name, + "content": fake_tool_result, + }) +``` + diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..fd5141b --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,8 @@ +{ + "": 256003, + "": 256004, + "": 256005, + "": 256002, + "": 256000, + "": 256001 +} diff --git a/config.json b/config.json new file mode 100644 index 0000000..789ca56 --- /dev/null +++ b/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "/gpfs/projects/bsc88/hf-models/Salamandra-7b_pre-1.3-160k_sft-2.0_openlicenses_ankush", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 16384, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + 
"rope_scaling": { + "factor": 20.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.44.0", + "use_cache": true, + "vocab_size": 256006 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..9f95297 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 5, + "max_length": 16384, + "pad_token_id": 0, + "transformers_version": "4.44.0" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..47c87ed --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:561ad0097b93136b099d53e2a0ba31c3ef006afebb09e3cdf63d905bb84aab94 +size 4983022200 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..80f180d --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1016f9d7df3f7ed2452104a95ed9bd6740bfeec1fba426f8c006b359612067d +size 4995660232 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..222e478 --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0919249808dda81f2b00c70e42d712c7e539252c6b949925c046b9614270cd02 +size 3460482936 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..3dc4982 --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:571202f8f1c4a33bf34c7889b79f90bb1e7a795bfb9dc204ac2f1df0d32dc1a0 +size 2097201280 diff --git 
a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..024dd8d --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 15536332800 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": 
"model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + 
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00003-of-00004.safetensors" + } +} diff --git a/salamandra-7b-tools-16k-Q8_0.gguf b/salamandra-7b-tools-16k-Q8_0.gguf new file mode 100644 index 0000000..3e21525 --- /dev/null +++ b/salamandra-7b-tools-16k-Q8_0.gguf @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:148444f25f45ee2143c30bfc728a7f18bbc59eb65d60ab89ece13572ac62b1eb +size 8260914048 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..c909875 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|im_end|>", + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..721cf17 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:decfbbc4fc2152155f20ec8d8dad6971e3da68a6b53268c744ed9f950b34685f +size 19093074 diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000..3307f8d --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa490e57cebce5cb1a0a5b1a5d3fa4de05aee53dc3a44791f1c3401db44d802d +size 4813274 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..3f28e96 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,1152 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "5": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "6": { + "content": "<|reserved_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "7": { + "content": "<|reserved_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "8": { + "content": "<|reserved_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "9": { + "content": "<|reserved_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "10": { + "content": "<|reserved_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "11": { + "content": "<|reserved_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "12": { + "content": "<|reserved_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "13": { + "content": "<|reserved_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "14": { + "content": "<|reserved_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "15": { + "content": "<|reserved_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "16": { + "content": "<|reserved_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "17": { + "content": "<|reserved_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "18": { + "content": "<|reserved_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "19": { + "content": "<|reserved_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "20": { + "content": "<|reserved_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "21": { + "content": "<|reserved_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "22": { + "content": "<|reserved_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "23": { + "content": "<|reserved_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "24": { + "content": "<|reserved_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "25": { + "content": "<|reserved_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "26": { + "content": "<|reserved_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "27": { + "content": "<|reserved_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "28": { + "content": "<|reserved_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "29": { + "content": "<|reserved_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "30": { + "content": "<|reserved_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "31": { + "content": "<|reserved_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32": { + "content": "<|reserved_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "33": { + "content": "<|reserved_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "34": { + "content": "<|reserved_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "35": { + "content": "<|reserved_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "36": { + "content": "<|reserved_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "37": { + "content": "<|reserved_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "38": { + "content": "<|reserved_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "39": { + "content": "<|reserved_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "40": { + "content": "<|reserved_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "41": { + "content": "<|reserved_token_36|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "42": { + "content": "<|reserved_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "43": { + "content": "<|reserved_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "44": { + "content": "<|reserved_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "45": { + "content": "<|reserved_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "46": { + "content": "<|reserved_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "47": { + "content": "<|reserved_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "48": { + "content": "<|reserved_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49": { + "content": "<|reserved_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "50": { + "content": "<|reserved_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "51": { + "content": "<|reserved_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "52": { + "content": "<|reserved_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "53": { + "content": "<|reserved_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "54": { + "content": "<|reserved_token_49|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "55": { + "content": "<|reserved_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "56": { + "content": "<|reserved_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "57": { + "content": "<|reserved_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "58": { + "content": "<|reserved_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "59": { + "content": "<|reserved_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "60": { + "content": "<|reserved_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "61": { + "content": "<|reserved_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "62": { + "content": "<|reserved_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "63": { + "content": "<|reserved_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64": { + "content": "<|reserved_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65": { + "content": "<|reserved_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "66": { + "content": "<|reserved_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "67": { + 
"content": "<|reserved_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "68": { + "content": "<|reserved_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "69": { + "content": "<|reserved_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "70": { + "content": "<|reserved_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "71": { + "content": "<|reserved_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "72": { + "content": "<|reserved_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "73": { + "content": "<|reserved_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "74": { + "content": "<|reserved_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "75": { + "content": "<|reserved_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "76": { + "content": "<|reserved_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "77": { + "content": "<|reserved_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "78": { + "content": "<|reserved_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "79": { + "content": "<|reserved_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "80": { + "content": "<|reserved_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "81": { + "content": "<|reserved_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "82": { + "content": "<|reserved_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "83": { + "content": "<|reserved_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "84": { + "content": "<|reserved_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "85": { + "content": "<|reserved_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "86": { + "content": "<|reserved_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "87": { + "content": "<|reserved_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "88": { + "content": "<|reserved_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "89": { + "content": "<|reserved_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "90": { + "content": "<|reserved_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "91": { + "content": "<|reserved_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92": { + "content": "<|reserved_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "93": { + "content": "<|reserved_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "94": { + "content": "<|reserved_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "95": { + "content": "<|reserved_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "96": { + "content": "<|reserved_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "97": { + "content": "<|reserved_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "98": { + "content": "<|reserved_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "99": { + "content": "<|reserved_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "100": { + "content": "<|reserved_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "101": { + "content": "<|reserved_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "102": { + "content": "<|reserved_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "103": { + "content": "<|reserved_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "104": { + "content": "\\r", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": false + }, + "106": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "107": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "108": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\t\t\t\t\t\t", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "256001": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "256002": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "256003": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "256004": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "256005": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>" + ], + "bos_token": "", + "chat_template": "{%- set tools = tools if tools is defined else None -%}\n{%- set date_string = date_string if date_string is defined else \"1 Sep 2024\" -%}\n\n{%- set system_message = messages[0].content if messages[0].role == \"system\" else \"\" -%}\n{%- if messages[0].role == \"system\" -%}\n {%- set messages = messages[1:] -%}\n{%- endif -%}\n\n{%- if not tool_prompt -%}\n {%- set tool_prompt = \"For each function call return a json object with function name and arguments within tags with the following schema:\n\n{\\\"name\\\": , \\\"arguments\\\": }\n\" -%}\n{%- endif -%}\n\n{%- if system_message or tools -%}\n {{- '<|im_start|>system\n'}}\n{%- endif -%}\n\n{%- if 
system_message %}\n {{- system_message + \"\n\"}}\n{%- endif -%}\n\n{%- if tools -%}\n {{- \"You have function-calling capabilities. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.\n\" }}\n {{- \"\n\" }}\n {{- tools }}\n {{- \"\n\n\" }}\n {{- tool_prompt -}}\n{%- endif -%}\n\n{%- if system_message or tools -%}\n {{- '<|im_end|>\n'}}\n{%- endif -%}\n\n{# Main message loop #}\n{%- for message in messages -%}\n {%- if message.role == \"user\" or message.role == \"assistant\" or message.role == \"tool\" -%}\n {%- if loop.first and message.role != \"user\" -%}\n {{ raise_exception(\"Invalid sequence: The first message role must be 'user' after 'system' if provided .\") }}\n {%- endif -%}\n\n {%- if not loop.first and message.role in [\"user\", \"assistant\"] and message.role == loop.previtem.role -%}\n {{ raise_exception(\"Invalid sequence: Consecutive messages cannot have the same role ('user' or 'assistant').\") }}\n {%- endif -%}\n\n {%- if message.role == \"user\" and not loop.first and loop.previtem.role != \"assistant\" -%}\n {{ raise_exception(\"Invalid sequence: A 'user' message must be preceded by an 'assistant' message.\") }}\n {%- endif -%}\n\n {%- if message.role == \"tool\" and not loop.first and loop.previtem.role not in [\"assistant\", \"tool\"] -%}\n {{ raise_exception(\"Invalid sequence: A 'tool' message must be preceded by 'assistant' or 'tool'.\") }}\n {%- endif -%}\n {%- else -%}\n {{- raise_exception(\"Invalid role detected: only 'user', 'assistant', or 'tool' roles are accepted.\") }}\n {%- endif -%}\n {%- if message.role == \"user\" or (message.role == \"assistant\" and message.tool_calls is not defined) -%}\n {{- '<|im_start|>' + message.role + '\n' + message.content | trim + '<|im_end|>\n'}}\n {%- elif message.role == \"assistant\" -%}\n {{- '<|im_start|>' + message.role }}\n {%- for tool_call in 
message.tool_calls -%}\n {{ '\n\n' }}\n {%- if tool_call.function -%}\n {\"name\": \"{{ tool_call.function.name }}\", \"arguments\": {{ tool_call.function.arguments | tojson }} }\n {%- else -%}\n {\"name\": \"{{ tool_call.name }}\", \"arguments\": {{ tool_call.arguments | tojson }} }\n {%- endif -%}\n {{ '\n' }}\n {%- endfor -%}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" -%}\n {%- if loop.previtem and loop.previtem.role != \"tool\" -%}\n {{- '<|im_start|>tool\n' }}\n {%- endif -%}\n {{- '\n' }} \n {{- message.content }}\n {{- '\n\n' }}\n {%- if loop.last or loop.nextitem.role != \"tool\" -%}\n {{- '<|im_end|>\n'}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n\n{# Prompt for assistant generation if needed #}\n{%- if add_generation_prompt -%}\n {{- '<|im_start|>assistant\n' }}\n{%- endif -%}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "legacy": true, + "model_max_length": 16384, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..cbdddf4 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,13281 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9996635488863468, + "eval_steps": 60, + "global_step": 1857, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005383217818450979, + "grad_norm": 109.41815851913972, + "learning_rate": 1.7857142857142858e-07, + "loss": 1.8223, + "step": 1 + }, + { + "epoch": 0.0010766435636901958, + "grad_norm": 6.467874390653583, + "learning_rate": 3.5714285714285716e-07, + "loss": 0.995, + "step": 2 + }, + { + "epoch": 0.0016149653455352936, + "grad_norm": 4802.638302957245, + "learning_rate": 5.357142857142857e-07, + "loss": 1.8859, + 
"step": 3 + }, + { + "epoch": 0.0021532871273803916, + "grad_norm": 16.08026403474417, + "learning_rate": 7.142857142857143e-07, + "loss": 0.953, + "step": 4 + }, + { + "epoch": 0.0026916089092254895, + "grad_norm": 57.36353411210698, + "learning_rate": 8.928571428571429e-07, + "loss": 0.9429, + "step": 5 + }, + { + "epoch": 0.0032299306910705873, + "grad_norm": 19675.34780096024, + "learning_rate": 1.0714285714285714e-06, + "loss": 2.6006, + "step": 6 + }, + { + "epoch": 0.0037682524729156855, + "grad_norm": 818.3276445922152, + "learning_rate": 1.25e-06, + "loss": 0.9918, + "step": 7 + }, + { + "epoch": 0.004306574254760783, + "grad_norm": 104.38551556669266, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.9549, + "step": 8 + }, + { + "epoch": 0.004844896036605881, + "grad_norm": 367.74940177761715, + "learning_rate": 1.6071428571428574e-06, + "loss": 2.68, + "step": 9 + }, + { + "epoch": 0.005383217818450979, + "grad_norm": 2496.9575083108416, + "learning_rate": 1.7857142857142859e-06, + "loss": 1.0041, + "step": 10 + }, + { + "epoch": 0.005921539600296077, + "grad_norm": 8.38602908383877, + "learning_rate": 1.9642857142857144e-06, + "loss": 0.9457, + "step": 11 + }, + { + "epoch": 0.0064598613821411745, + "grad_norm": 8.592616439259098, + "learning_rate": 2.1428571428571427e-06, + "loss": 0.8897, + "step": 12 + }, + { + "epoch": 0.006998183163986273, + "grad_norm": 438.07218134142477, + "learning_rate": 2.321428571428572e-06, + "loss": 0.9019, + "step": 13 + }, + { + "epoch": 0.007536504945831371, + "grad_norm": 9.994776961708206, + "learning_rate": 2.5e-06, + "loss": 0.825, + "step": 14 + }, + { + "epoch": 0.008074826727676468, + "grad_norm": 10.743498710130337, + "learning_rate": 2.6785714285714285e-06, + "loss": 0.849, + "step": 15 + }, + { + "epoch": 0.008613148509521567, + "grad_norm": 15.967903678723232, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.8012, + "step": 16 + }, + { + "epoch": 0.009151470291366665, + "grad_norm": 
4.1668693518220055, + "learning_rate": 3.0357142857142856e-06, + "loss": 0.7744, + "step": 17 + }, + { + "epoch": 0.009689792073211762, + "grad_norm": 6.926360685293587, + "learning_rate": 3.2142857142857147e-06, + "loss": 0.692, + "step": 18 + }, + { + "epoch": 0.010228113855056861, + "grad_norm": 3.2346473797515074, + "learning_rate": 3.3928571428571435e-06, + "loss": 0.6831, + "step": 19 + }, + { + "epoch": 0.010766435636901958, + "grad_norm": 3.7571978637997177, + "learning_rate": 3.5714285714285718e-06, + "loss": 0.6537, + "step": 20 + }, + { + "epoch": 0.011304757418747056, + "grad_norm": 7.559105627558757, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.6141, + "step": 21 + }, + { + "epoch": 0.011843079200592153, + "grad_norm": 2.387397312463083, + "learning_rate": 3.928571428571429e-06, + "loss": 0.695, + "step": 22 + }, + { + "epoch": 0.012381400982437252, + "grad_norm": 4.391512430632287, + "learning_rate": 4.107142857142857e-06, + "loss": 0.6185, + "step": 23 + }, + { + "epoch": 0.012919722764282349, + "grad_norm": 4.15230181408785, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.5936, + "step": 24 + }, + { + "epoch": 0.013458044546127448, + "grad_norm": 2.0653396060051605, + "learning_rate": 4.464285714285715e-06, + "loss": 0.5898, + "step": 25 + }, + { + "epoch": 0.013996366327972546, + "grad_norm": 17.979519982673633, + "learning_rate": 4.642857142857144e-06, + "loss": 0.5905, + "step": 26 + }, + { + "epoch": 0.014534688109817643, + "grad_norm": 8.151818005754823, + "learning_rate": 4.821428571428572e-06, + "loss": 0.5533, + "step": 27 + }, + { + "epoch": 0.015073009891662742, + "grad_norm": 5.987974608299902, + "learning_rate": 5e-06, + "loss": 0.6107, + "step": 28 + }, + { + "epoch": 0.015611331673507839, + "grad_norm": 2.330194497601682, + "learning_rate": 5.1785714285714296e-06, + "loss": 0.5506, + "step": 29 + }, + { + "epoch": 0.016149653455352936, + "grad_norm": 12.794302959608173, + "learning_rate": 5.357142857142857e-06, + 
"loss": 0.5766, + "step": 30 + }, + { + "epoch": 0.016687975237198036, + "grad_norm": 1.9007212740143298, + "learning_rate": 5.535714285714286e-06, + "loss": 0.5627, + "step": 31 + }, + { + "epoch": 0.017226297019043133, + "grad_norm": 1.9146904127163438, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.5883, + "step": 32 + }, + { + "epoch": 0.01776461880088823, + "grad_norm": 2.187675355948441, + "learning_rate": 5.892857142857144e-06, + "loss": 0.5035, + "step": 33 + }, + { + "epoch": 0.01830294058273333, + "grad_norm": 2.6806082565798124, + "learning_rate": 6.071428571428571e-06, + "loss": 0.5542, + "step": 34 + }, + { + "epoch": 0.018841262364578427, + "grad_norm": 3.3951990046554323, + "learning_rate": 6.25e-06, + "loss": 0.538, + "step": 35 + }, + { + "epoch": 0.019379584146423524, + "grad_norm": 2.5099129162892853, + "learning_rate": 6.4285714285714295e-06, + "loss": 0.5251, + "step": 36 + }, + { + "epoch": 0.01991790592826862, + "grad_norm": 2.0412677432451627, + "learning_rate": 6.607142857142858e-06, + "loss": 0.5121, + "step": 37 + }, + { + "epoch": 0.020456227710113722, + "grad_norm": 2.781712184570042, + "learning_rate": 6.785714285714287e-06, + "loss": 0.5254, + "step": 38 + }, + { + "epoch": 0.02099454949195882, + "grad_norm": 6.805805985669443, + "learning_rate": 6.964285714285714e-06, + "loss": 0.5831, + "step": 39 + }, + { + "epoch": 0.021532871273803916, + "grad_norm": 2.0490434598423652, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.5393, + "step": 40 + }, + { + "epoch": 0.022071193055649013, + "grad_norm": 1.9372367290098516, + "learning_rate": 7.321428571428572e-06, + "loss": 0.5972, + "step": 41 + }, + { + "epoch": 0.022609514837494113, + "grad_norm": 2.7227356306942165, + "learning_rate": 7.500000000000001e-06, + "loss": 0.5369, + "step": 42 + }, + { + "epoch": 0.02314783661933921, + "grad_norm": 2.011860407760454, + "learning_rate": 7.67857142857143e-06, + "loss": 0.5164, + "step": 43 + }, + { + "epoch": 0.023686158401184307, 
+ "grad_norm": 2.380752182458038, + "learning_rate": 7.857142857142858e-06, + "loss": 0.4715, + "step": 44 + }, + { + "epoch": 0.024224480183029407, + "grad_norm": 2.0112153484283537, + "learning_rate": 8.035714285714286e-06, + "loss": 0.4943, + "step": 45 + }, + { + "epoch": 0.024762801964874504, + "grad_norm": 1.7657871236862508, + "learning_rate": 8.214285714285714e-06, + "loss": 0.5792, + "step": 46 + }, + { + "epoch": 0.0253011237467196, + "grad_norm": 2.012306508738324, + "learning_rate": 8.392857142857144e-06, + "loss": 0.5704, + "step": 47 + }, + { + "epoch": 0.025839445528564698, + "grad_norm": 2.0657223159326743, + "learning_rate": 8.571428571428571e-06, + "loss": 0.5145, + "step": 48 + }, + { + "epoch": 0.0263777673104098, + "grad_norm": 2.137310846323582, + "learning_rate": 8.750000000000001e-06, + "loss": 0.5067, + "step": 49 + }, + { + "epoch": 0.026916089092254895, + "grad_norm": 2.2166052489861534, + "learning_rate": 8.92857142857143e-06, + "loss": 0.5799, + "step": 50 + }, + { + "epoch": 0.027454410874099992, + "grad_norm": 2.029493952864758, + "learning_rate": 9.107142857142858e-06, + "loss": 0.5817, + "step": 51 + }, + { + "epoch": 0.027992732655945093, + "grad_norm": 1.5628607433382145, + "learning_rate": 9.285714285714288e-06, + "loss": 0.4722, + "step": 52 + }, + { + "epoch": 0.02853105443779019, + "grad_norm": 1.686683459837313, + "learning_rate": 9.464285714285714e-06, + "loss": 0.5233, + "step": 53 + }, + { + "epoch": 0.029069376219635287, + "grad_norm": 1.7287851726495882, + "learning_rate": 9.642857142857144e-06, + "loss": 0.5744, + "step": 54 + }, + { + "epoch": 0.029607698001480384, + "grad_norm": 2.246853321344625, + "learning_rate": 9.821428571428573e-06, + "loss": 0.4972, + "step": 55 + }, + { + "epoch": 0.030146019783325484, + "grad_norm": 1.9175548738162544, + "learning_rate": 1e-05, + "loss": 0.535, + "step": 56 + }, + { + "epoch": 0.03068434156517058, + "grad_norm": 2.169109901402676, + "learning_rate": 9.999992393020984e-06, + 
"loss": 0.5429, + "step": 57 + }, + { + "epoch": 0.031222663347015678, + "grad_norm": 2.260825281362616, + "learning_rate": 9.99996957210708e-06, + "loss": 0.521, + "step": 58 + }, + { + "epoch": 0.031760985128860775, + "grad_norm": 1.660309077201794, + "learning_rate": 9.999931537327727e-06, + "loss": 0.531, + "step": 59 + }, + { + "epoch": 0.03229930691070587, + "grad_norm": 2.069841458563405, + "learning_rate": 9.999878288798659e-06, + "loss": 0.5661, + "step": 60 + }, + { + "epoch": 0.03229930691070587, + "eval_loss": 0.5262647271156311, + "eval_runtime": 1569.0341, + "eval_samples_per_second": 15.94, + "eval_steps_per_second": 0.498, + "step": 60 + }, + { + "epoch": 0.032837628692550976, + "grad_norm": 2.6347591222570577, + "learning_rate": 9.999809826681898e-06, + "loss": 0.544, + "step": 61 + }, + { + "epoch": 0.03337595047439607, + "grad_norm": 2.286499156997404, + "learning_rate": 9.999726151185762e-06, + "loss": 0.5387, + "step": 62 + }, + { + "epoch": 0.03391427225624117, + "grad_norm": 1.8415858956026085, + "learning_rate": 9.999627262564856e-06, + "loss": 0.5148, + "step": 63 + }, + { + "epoch": 0.034452594038086266, + "grad_norm": 1.6900844200859937, + "learning_rate": 9.999513161120078e-06, + "loss": 0.5291, + "step": 64 + }, + { + "epoch": 0.03499091581993136, + "grad_norm": 1.7125448582732223, + "learning_rate": 9.999383847198618e-06, + "loss": 0.5535, + "step": 65 + }, + { + "epoch": 0.03552923760177646, + "grad_norm": 1.9111631206584763, + "learning_rate": 9.999239321193946e-06, + "loss": 0.5146, + "step": 66 + }, + { + "epoch": 0.03606755938362156, + "grad_norm": 1.5772484080951499, + "learning_rate": 9.999079583545829e-06, + "loss": 0.4713, + "step": 67 + }, + { + "epoch": 0.03660588116546666, + "grad_norm": 1.8895632782472054, + "learning_rate": 9.998904634740313e-06, + "loss": 0.5802, + "step": 68 + }, + { + "epoch": 0.03714420294731176, + "grad_norm": 1.7764047564754841, + "learning_rate": 9.998714475309733e-06, + "loss": 0.4893, + "step": 
69 + }, + { + "epoch": 0.037682524729156855, + "grad_norm": 1.6552020383306354, + "learning_rate": 9.9985091058327e-06, + "loss": 0.5265, + "step": 70 + }, + { + "epoch": 0.03822084651100195, + "grad_norm": 1.6488442266603467, + "learning_rate": 9.998288526934115e-06, + "loss": 0.5231, + "step": 71 + }, + { + "epoch": 0.03875916829284705, + "grad_norm": 2.563488205094923, + "learning_rate": 9.998052739285151e-06, + "loss": 0.5305, + "step": 72 + }, + { + "epoch": 0.039297490074692146, + "grad_norm": 1.7898615543554037, + "learning_rate": 9.997801743603264e-06, + "loss": 0.5237, + "step": 73 + }, + { + "epoch": 0.03983581185653724, + "grad_norm": 1.7633259864675677, + "learning_rate": 9.997535540652177e-06, + "loss": 0.5502, + "step": 74 + }, + { + "epoch": 0.04037413363838234, + "grad_norm": 1.8121416043404328, + "learning_rate": 9.997254131241893e-06, + "loss": 0.4952, + "step": 75 + }, + { + "epoch": 0.040912455420227443, + "grad_norm": 1.5652647418073986, + "learning_rate": 9.996957516228682e-06, + "loss": 0.4945, + "step": 76 + }, + { + "epoch": 0.04145077720207254, + "grad_norm": 2.048844737679617, + "learning_rate": 9.996645696515082e-06, + "loss": 0.5123, + "step": 77 + }, + { + "epoch": 0.04198909898391764, + "grad_norm": 1.6687520157181732, + "learning_rate": 9.996318673049893e-06, + "loss": 0.5443, + "step": 78 + }, + { + "epoch": 0.042527420765762734, + "grad_norm": 1.66167477759581, + "learning_rate": 9.995976446828182e-06, + "loss": 0.5029, + "step": 79 + }, + { + "epoch": 0.04306574254760783, + "grad_norm": 1.5077402156848434, + "learning_rate": 9.99561901889127e-06, + "loss": 0.5197, + "step": 80 + }, + { + "epoch": 0.04360406432945293, + "grad_norm": 1.8622381731018631, + "learning_rate": 9.995246390326739e-06, + "loss": 0.5048, + "step": 81 + }, + { + "epoch": 0.044142386111298025, + "grad_norm": 1.6038417564132132, + "learning_rate": 9.994858562268415e-06, + "loss": 0.5779, + "step": 82 + }, + { + "epoch": 0.04468070789314313, + "grad_norm": 
2.2450492036773126, + "learning_rate": 9.994455535896383e-06, + "loss": 0.5407, + "step": 83 + }, + { + "epoch": 0.045219029674988226, + "grad_norm": 1.7319893085330837, + "learning_rate": 9.994037312436963e-06, + "loss": 0.4857, + "step": 84 + }, + { + "epoch": 0.04575735145683332, + "grad_norm": 1.6718459312817726, + "learning_rate": 9.99360389316273e-06, + "loss": 0.4815, + "step": 85 + }, + { + "epoch": 0.04629567323867842, + "grad_norm": 2.7232264171397276, + "learning_rate": 9.993155279392479e-06, + "loss": 0.5877, + "step": 86 + }, + { + "epoch": 0.04683399502052352, + "grad_norm": 1.9404135244552454, + "learning_rate": 9.992691472491253e-06, + "loss": 0.5062, + "step": 87 + }, + { + "epoch": 0.047372316802368614, + "grad_norm": 1.9213426547558368, + "learning_rate": 9.99221247387032e-06, + "loss": 0.5188, + "step": 88 + }, + { + "epoch": 0.04791063858421371, + "grad_norm": 1.5451598644824311, + "learning_rate": 9.991718284987173e-06, + "loss": 0.5397, + "step": 89 + }, + { + "epoch": 0.048448960366058814, + "grad_norm": 2.5679521016629385, + "learning_rate": 9.991208907345524e-06, + "loss": 0.541, + "step": 90 + }, + { + "epoch": 0.04898728214790391, + "grad_norm": 2.98985646242629, + "learning_rate": 9.990684342495304e-06, + "loss": 0.4854, + "step": 91 + }, + { + "epoch": 0.04952560392974901, + "grad_norm": 1.9886055940456542, + "learning_rate": 9.990144592032657e-06, + "loss": 0.5256, + "step": 92 + }, + { + "epoch": 0.050063925711594105, + "grad_norm": 2.083677922083048, + "learning_rate": 9.989589657599927e-06, + "loss": 0.4859, + "step": 93 + }, + { + "epoch": 0.0506022474934392, + "grad_norm": 1.5145771411744222, + "learning_rate": 9.989019540885664e-06, + "loss": 0.4744, + "step": 94 + }, + { + "epoch": 0.0511405692752843, + "grad_norm": 1.655565898472542, + "learning_rate": 9.98843424362462e-06, + "loss": 0.4615, + "step": 95 + }, + { + "epoch": 0.051678891057129396, + "grad_norm": 1.9814143121579568, + "learning_rate": 9.987833767597726e-06, + 
"loss": 0.4806, + "step": 96 + }, + { + "epoch": 0.0522172128389745, + "grad_norm": 1.5166169599719224, + "learning_rate": 9.987218114632109e-06, + "loss": 0.5279, + "step": 97 + }, + { + "epoch": 0.0527555346208196, + "grad_norm": 1.7338166251896456, + "learning_rate": 9.98658728660107e-06, + "loss": 0.4885, + "step": 98 + }, + { + "epoch": 0.053293856402664694, + "grad_norm": 2.059909188253357, + "learning_rate": 9.98594128542409e-06, + "loss": 0.4878, + "step": 99 + }, + { + "epoch": 0.05383217818450979, + "grad_norm": 1.946469408161261, + "learning_rate": 9.985280113066816e-06, + "loss": 0.5423, + "step": 100 + }, + { + "epoch": 0.05437049996635489, + "grad_norm": 2.2782083747319333, + "learning_rate": 9.984603771541055e-06, + "loss": 0.5132, + "step": 101 + }, + { + "epoch": 0.054908821748199985, + "grad_norm": 2.057010956887204, + "learning_rate": 9.983912262904775e-06, + "loss": 0.5092, + "step": 102 + }, + { + "epoch": 0.05544714353004508, + "grad_norm": 1.7498707830077607, + "learning_rate": 9.983205589262093e-06, + "loss": 0.4711, + "step": 103 + }, + { + "epoch": 0.055985465311890185, + "grad_norm": 2.08857966446578, + "learning_rate": 9.98248375276327e-06, + "loss": 0.5405, + "step": 104 + }, + { + "epoch": 0.05652378709373528, + "grad_norm": 1.6492587393982439, + "learning_rate": 9.981746755604703e-06, + "loss": 0.5346, + "step": 105 + }, + { + "epoch": 0.05706210887558038, + "grad_norm": 2.4884932019084203, + "learning_rate": 9.980994600028919e-06, + "loss": 0.4979, + "step": 106 + }, + { + "epoch": 0.057600430657425476, + "grad_norm": 2.357643749019895, + "learning_rate": 9.980227288324576e-06, + "loss": 0.547, + "step": 107 + }, + { + "epoch": 0.05813875243927057, + "grad_norm": 1.7013608238808469, + "learning_rate": 9.979444822826438e-06, + "loss": 0.4984, + "step": 108 + }, + { + "epoch": 0.05867707422111567, + "grad_norm": 1.6424667181868076, + "learning_rate": 9.978647205915386e-06, + "loss": 0.5501, + "step": 109 + }, + { + "epoch": 
0.05921539600296077, + "grad_norm": 1.8701509501400961, + "learning_rate": 9.977834440018406e-06, + "loss": 0.5478, + "step": 110 + }, + { + "epoch": 0.05975371778480587, + "grad_norm": 1.8496243899167086, + "learning_rate": 9.977006527608569e-06, + "loss": 0.4782, + "step": 111 + }, + { + "epoch": 0.06029203956665097, + "grad_norm": 1.6878413932010692, + "learning_rate": 9.976163471205045e-06, + "loss": 0.4832, + "step": 112 + }, + { + "epoch": 0.060830361348496065, + "grad_norm": 1.9099800850936837, + "learning_rate": 9.975305273373075e-06, + "loss": 0.515, + "step": 113 + }, + { + "epoch": 0.06136868313034116, + "grad_norm": 1.5649119566569916, + "learning_rate": 9.974431936723979e-06, + "loss": 0.4561, + "step": 114 + }, + { + "epoch": 0.06190700491218626, + "grad_norm": 1.7341754469580601, + "learning_rate": 9.973543463915139e-06, + "loss": 0.5348, + "step": 115 + }, + { + "epoch": 0.062445326694031356, + "grad_norm": 1.7476560123562952, + "learning_rate": 9.972639857649989e-06, + "loss": 0.5287, + "step": 116 + }, + { + "epoch": 0.06298364847587645, + "grad_norm": 2.0434137346621624, + "learning_rate": 9.971721120678018e-06, + "loss": 0.5932, + "step": 117 + }, + { + "epoch": 0.06352197025772155, + "grad_norm": 1.62299849715006, + "learning_rate": 9.97078725579475e-06, + "loss": 0.5077, + "step": 118 + }, + { + "epoch": 0.06406029203956665, + "grad_norm": 1.7228929187523507, + "learning_rate": 9.969838265841739e-06, + "loss": 0.5859, + "step": 119 + }, + { + "epoch": 0.06459861382141174, + "grad_norm": 1.6625474372880666, + "learning_rate": 9.968874153706567e-06, + "loss": 0.4655, + "step": 120 + }, + { + "epoch": 0.06459861382141174, + "eval_loss": 0.5072533488273621, + "eval_runtime": 1577.1777, + "eval_samples_per_second": 15.857, + "eval_steps_per_second": 0.496, + "step": 120 + }, + { + "epoch": 0.06513693560325684, + "grad_norm": 2.0716206061611486, + "learning_rate": 9.967894922322824e-06, + "loss": 0.539, + "step": 121 + }, + { + "epoch": 
0.06567525738510195, + "grad_norm": 1.6205145916384769, + "learning_rate": 9.96690057467011e-06, + "loss": 0.5478, + "step": 122 + }, + { + "epoch": 0.06621357916694705, + "grad_norm": 1.587372514164151, + "learning_rate": 9.965891113774015e-06, + "loss": 0.538, + "step": 123 + }, + { + "epoch": 0.06675190094879214, + "grad_norm": 1.4772510136765666, + "learning_rate": 9.964866542706119e-06, + "loss": 0.5349, + "step": 124 + }, + { + "epoch": 0.06729022273063724, + "grad_norm": 1.7801746551956565, + "learning_rate": 9.963826864583979e-06, + "loss": 0.4909, + "step": 125 + }, + { + "epoch": 0.06782854451248234, + "grad_norm": 5.729919312521928, + "learning_rate": 9.962772082571115e-06, + "loss": 0.6005, + "step": 126 + }, + { + "epoch": 0.06836686629432744, + "grad_norm": 1.6619105967880943, + "learning_rate": 9.961702199877014e-06, + "loss": 0.4715, + "step": 127 + }, + { + "epoch": 0.06890518807617253, + "grad_norm": 1.5987631874828743, + "learning_rate": 9.960617219757105e-06, + "loss": 0.4807, + "step": 128 + }, + { + "epoch": 0.06944350985801763, + "grad_norm": 1.625681174655454, + "learning_rate": 9.959517145512754e-06, + "loss": 0.535, + "step": 129 + }, + { + "epoch": 0.06998183163986273, + "grad_norm": 2.100345459551234, + "learning_rate": 9.958401980491259e-06, + "loss": 0.5264, + "step": 130 + }, + { + "epoch": 0.07052015342170782, + "grad_norm": 1.7787800977162425, + "learning_rate": 9.957271728085836e-06, + "loss": 0.5171, + "step": 131 + }, + { + "epoch": 0.07105847520355292, + "grad_norm": 1.6985346393670706, + "learning_rate": 9.956126391735605e-06, + "loss": 0.5016, + "step": 132 + }, + { + "epoch": 0.07159679698539802, + "grad_norm": 1.3787117088478043, + "learning_rate": 9.954965974925586e-06, + "loss": 0.502, + "step": 133 + }, + { + "epoch": 0.07213511876724311, + "grad_norm": 1.547259961768447, + "learning_rate": 9.953790481186689e-06, + "loss": 0.5046, + "step": 134 + }, + { + "epoch": 0.07267344054908821, + "grad_norm": 1.7755359789986371, + 
"learning_rate": 9.952599914095692e-06, + "loss": 0.5385, + "step": 135 + }, + { + "epoch": 0.07321176233093332, + "grad_norm": 1.5896819627160363, + "learning_rate": 9.951394277275247e-06, + "loss": 0.4749, + "step": 136 + }, + { + "epoch": 0.07375008411277842, + "grad_norm": 1.6875256792153286, + "learning_rate": 9.950173574393853e-06, + "loss": 0.4763, + "step": 137 + }, + { + "epoch": 0.07428840589462352, + "grad_norm": 1.437266797535168, + "learning_rate": 9.948937809165853e-06, + "loss": 0.4833, + "step": 138 + }, + { + "epoch": 0.07482672767646861, + "grad_norm": 1.7282025114929471, + "learning_rate": 9.947686985351427e-06, + "loss": 0.4767, + "step": 139 + }, + { + "epoch": 0.07536504945831371, + "grad_norm": 1.8616012721247828, + "learning_rate": 9.946421106756568e-06, + "loss": 0.5093, + "step": 140 + }, + { + "epoch": 0.0759033712401588, + "grad_norm": 1.8460263465465812, + "learning_rate": 9.94514017723308e-06, + "loss": 0.517, + "step": 141 + }, + { + "epoch": 0.0764416930220039, + "grad_norm": 2.0057873955643823, + "learning_rate": 9.94384420067857e-06, + "loss": 0.5154, + "step": 142 + }, + { + "epoch": 0.076980014803849, + "grad_norm": 1.65882505385735, + "learning_rate": 9.94253318103642e-06, + "loss": 0.4701, + "step": 143 + }, + { + "epoch": 0.0775183365856941, + "grad_norm": 2.3628830084290806, + "learning_rate": 9.941207122295789e-06, + "loss": 0.5405, + "step": 144 + }, + { + "epoch": 0.0780566583675392, + "grad_norm": 1.6577450103892044, + "learning_rate": 9.9398660284916e-06, + "loss": 0.4927, + "step": 145 + }, + { + "epoch": 0.07859498014938429, + "grad_norm": 1.4186036899765784, + "learning_rate": 9.938509903704521e-06, + "loss": 0.4898, + "step": 146 + }, + { + "epoch": 0.07913330193122939, + "grad_norm": 1.544561300695159, + "learning_rate": 9.937138752060958e-06, + "loss": 0.4893, + "step": 147 + }, + { + "epoch": 0.07967162371307449, + "grad_norm": 2.396784154476515, + "learning_rate": 9.935752577733038e-06, + "loss": 0.5326, + 
"step": 148 + }, + { + "epoch": 0.08020994549491958, + "grad_norm": 1.6617814624124967, + "learning_rate": 9.9343513849386e-06, + "loss": 0.5131, + "step": 149 + }, + { + "epoch": 0.08074826727676468, + "grad_norm": 1.7862849588167096, + "learning_rate": 9.932935177941185e-06, + "loss": 0.571, + "step": 150 + }, + { + "epoch": 0.08128658905860979, + "grad_norm": 1.4319233814203582, + "learning_rate": 9.931503961050012e-06, + "loss": 0.5017, + "step": 151 + }, + { + "epoch": 0.08182491084045489, + "grad_norm": 4.306871831666418, + "learning_rate": 9.93005773861998e-06, + "loss": 0.4935, + "step": 152 + }, + { + "epoch": 0.08236323262229998, + "grad_norm": 2.160758045969246, + "learning_rate": 9.928596515051639e-06, + "loss": 0.4985, + "step": 153 + }, + { + "epoch": 0.08290155440414508, + "grad_norm": 1.5540015811422117, + "learning_rate": 9.927120294791188e-06, + "loss": 0.4575, + "step": 154 + }, + { + "epoch": 0.08343987618599018, + "grad_norm": 1.5794711992375656, + "learning_rate": 9.92562908233046e-06, + "loss": 0.5031, + "step": 155 + }, + { + "epoch": 0.08397819796783527, + "grad_norm": 2.034943473794147, + "learning_rate": 9.9241228822069e-06, + "loss": 0.4829, + "step": 156 + }, + { + "epoch": 0.08451651974968037, + "grad_norm": 1.878275757652009, + "learning_rate": 9.922601699003567e-06, + "loss": 0.5468, + "step": 157 + }, + { + "epoch": 0.08505484153152547, + "grad_norm": 1.8197718876914466, + "learning_rate": 9.921065537349097e-06, + "loss": 0.5228, + "step": 158 + }, + { + "epoch": 0.08559316331337057, + "grad_norm": 1.850901219005824, + "learning_rate": 9.919514401917717e-06, + "loss": 0.4894, + "step": 159 + }, + { + "epoch": 0.08613148509521566, + "grad_norm": 1.6912529326600465, + "learning_rate": 9.917948297429202e-06, + "loss": 0.4783, + "step": 160 + }, + { + "epoch": 0.08666980687706076, + "grad_norm": 1.9572290713193328, + "learning_rate": 9.916367228648887e-06, + "loss": 0.4889, + "step": 161 + }, + { + "epoch": 0.08720812865890586, + 
"grad_norm": 2.2412763350776497, + "learning_rate": 9.914771200387634e-06, + "loss": 0.5196, + "step": 162 + }, + { + "epoch": 0.08774645044075095, + "grad_norm": 2.0096075056146527, + "learning_rate": 9.913160217501822e-06, + "loss": 0.5098, + "step": 163 + }, + { + "epoch": 0.08828477222259605, + "grad_norm": 1.561955725348752, + "learning_rate": 9.911534284893336e-06, + "loss": 0.4993, + "step": 164 + }, + { + "epoch": 0.08882309400444116, + "grad_norm": 2.2239745440823113, + "learning_rate": 9.909893407509554e-06, + "loss": 0.5189, + "step": 165 + }, + { + "epoch": 0.08936141578628626, + "grad_norm": 2.1956593936333606, + "learning_rate": 9.90823759034332e-06, + "loss": 0.4956, + "step": 166 + }, + { + "epoch": 0.08989973756813135, + "grad_norm": 1.7245617400478288, + "learning_rate": 9.906566838432943e-06, + "loss": 0.5076, + "step": 167 + }, + { + "epoch": 0.09043805934997645, + "grad_norm": 1.6846599680454537, + "learning_rate": 9.904881156862172e-06, + "loss": 0.4546, + "step": 168 + }, + { + "epoch": 0.09097638113182155, + "grad_norm": 1.713604562000994, + "learning_rate": 9.903180550760184e-06, + "loss": 0.5622, + "step": 169 + }, + { + "epoch": 0.09151470291366665, + "grad_norm": 1.4559714724478827, + "learning_rate": 9.901465025301571e-06, + "loss": 0.499, + "step": 170 + }, + { + "epoch": 0.09205302469551174, + "grad_norm": 1.748975091207079, + "learning_rate": 9.899734585706316e-06, + "loss": 0.4823, + "step": 171 + }, + { + "epoch": 0.09259134647735684, + "grad_norm": 1.6268147978199312, + "learning_rate": 9.89798923723979e-06, + "loss": 0.5452, + "step": 172 + }, + { + "epoch": 0.09312966825920194, + "grad_norm": 1.7343158101478648, + "learning_rate": 9.896228985212722e-06, + "loss": 0.4359, + "step": 173 + }, + { + "epoch": 0.09366799004104703, + "grad_norm": 2.07042169826696, + "learning_rate": 9.894453834981194e-06, + "loss": 0.511, + "step": 174 + }, + { + "epoch": 0.09420631182289213, + "grad_norm": 1.791222622400255, + "learning_rate": 
9.892663791946617e-06, + "loss": 0.5451, + "step": 175 + }, + { + "epoch": 0.09474463360473723, + "grad_norm": 2.20105621306618, + "learning_rate": 9.890858861555719e-06, + "loss": 0.5144, + "step": 176 + }, + { + "epoch": 0.09528295538658232, + "grad_norm": 1.6902715423027703, + "learning_rate": 9.889039049300526e-06, + "loss": 0.5445, + "step": 177 + }, + { + "epoch": 0.09582127716842742, + "grad_norm": 1.6384822244675972, + "learning_rate": 9.88720436071835e-06, + "loss": 0.5164, + "step": 178 + }, + { + "epoch": 0.09635959895027253, + "grad_norm": 1.486764051130488, + "learning_rate": 9.885354801391764e-06, + "loss": 0.478, + "step": 179 + }, + { + "epoch": 0.09689792073211763, + "grad_norm": 1.701132133672937, + "learning_rate": 9.883490376948593e-06, + "loss": 0.5027, + "step": 180 + }, + { + "epoch": 0.09689792073211763, + "eval_loss": 0.49806535243988037, + "eval_runtime": 1515.9148, + "eval_samples_per_second": 16.498, + "eval_steps_per_second": 0.516, + "step": 180 + }, + { + "epoch": 0.09743624251396273, + "grad_norm": 1.9402448136247314, + "learning_rate": 9.881611093061891e-06, + "loss": 0.5127, + "step": 181 + }, + { + "epoch": 0.09797456429580782, + "grad_norm": 1.7830082860168288, + "learning_rate": 9.879716955449927e-06, + "loss": 0.4977, + "step": 182 + }, + { + "epoch": 0.09851288607765292, + "grad_norm": 1.8728338162339362, + "learning_rate": 9.877807969876167e-06, + "loss": 0.5303, + "step": 183 + }, + { + "epoch": 0.09905120785949802, + "grad_norm": 1.9418905923773875, + "learning_rate": 9.875884142149258e-06, + "loss": 0.4924, + "step": 184 + }, + { + "epoch": 0.09958952964134311, + "grad_norm": 1.7198468996934395, + "learning_rate": 9.873945478123006e-06, + "loss": 0.4753, + "step": 185 + }, + { + "epoch": 0.10012785142318821, + "grad_norm": 1.9960103116925314, + "learning_rate": 9.87199198369636e-06, + "loss": 0.5277, + "step": 186 + }, + { + "epoch": 0.10066617320503331, + "grad_norm": 1.627744057918891, + "learning_rate": 
9.870023664813399e-06, + "loss": 0.46, + "step": 187 + }, + { + "epoch": 0.1012044949868784, + "grad_norm": 1.689952574264165, + "learning_rate": 9.868040527463305e-06, + "loss": 0.4994, + "step": 188 + }, + { + "epoch": 0.1017428167687235, + "grad_norm": 1.5603624594142342, + "learning_rate": 9.866042577680354e-06, + "loss": 0.5304, + "step": 189 + }, + { + "epoch": 0.1022811385505686, + "grad_norm": 1.748472496778829, + "learning_rate": 9.86402982154389e-06, + "loss": 0.4964, + "step": 190 + }, + { + "epoch": 0.1028194603324137, + "grad_norm": 1.7431819106596798, + "learning_rate": 9.862002265178308e-06, + "loss": 0.4783, + "step": 191 + }, + { + "epoch": 0.10335778211425879, + "grad_norm": 1.837418537016329, + "learning_rate": 9.859959914753042e-06, + "loss": 0.4862, + "step": 192 + }, + { + "epoch": 0.1038961038961039, + "grad_norm": 2.596761998177084, + "learning_rate": 9.857902776482538e-06, + "loss": 0.5261, + "step": 193 + }, + { + "epoch": 0.104434425677949, + "grad_norm": 1.893467433056967, + "learning_rate": 9.85583085662624e-06, + "loss": 0.5324, + "step": 194 + }, + { + "epoch": 0.1049727474597941, + "grad_norm": 1.5311561663354358, + "learning_rate": 9.853744161488568e-06, + "loss": 0.4934, + "step": 195 + }, + { + "epoch": 0.1055110692416392, + "grad_norm": 1.573948338119931, + "learning_rate": 9.851642697418898e-06, + "loss": 0.5137, + "step": 196 + }, + { + "epoch": 0.10604939102348429, + "grad_norm": 1.7486390517463863, + "learning_rate": 9.84952647081155e-06, + "loss": 0.535, + "step": 197 + }, + { + "epoch": 0.10658771280532939, + "grad_norm": 1.589021194069147, + "learning_rate": 9.847395488105761e-06, + "loss": 0.443, + "step": 198 + }, + { + "epoch": 0.10712603458717448, + "grad_norm": 1.9185393015026924, + "learning_rate": 9.845249755785665e-06, + "loss": 0.5281, + "step": 199 + }, + { + "epoch": 0.10766435636901958, + "grad_norm": 2.3792026849321704, + "learning_rate": 9.84308928038028e-06, + "loss": 0.5031, + "step": 200 + }, + { + 
"epoch": 0.10820267815086468, + "grad_norm": 1.9165328926467609, + "learning_rate": 9.840914068463482e-06, + "loss": 0.5557, + "step": 201 + }, + { + "epoch": 0.10874099993270978, + "grad_norm": 2.5946215311840315, + "learning_rate": 9.838724126653987e-06, + "loss": 0.4922, + "step": 202 + }, + { + "epoch": 0.10927932171455487, + "grad_norm": 2.13076319151747, + "learning_rate": 9.836519461615331e-06, + "loss": 0.5781, + "step": 203 + }, + { + "epoch": 0.10981764349639997, + "grad_norm": 1.663228941320188, + "learning_rate": 9.834300080055854e-06, + "loss": 0.484, + "step": 204 + }, + { + "epoch": 0.11035596527824507, + "grad_norm": 2.225077581890442, + "learning_rate": 9.832065988728667e-06, + "loss": 0.4869, + "step": 205 + }, + { + "epoch": 0.11089428706009016, + "grad_norm": 1.4816502494413102, + "learning_rate": 9.829817194431646e-06, + "loss": 0.4782, + "step": 206 + }, + { + "epoch": 0.11143260884193526, + "grad_norm": 1.9584675295393534, + "learning_rate": 9.827553704007403e-06, + "loss": 0.4572, + "step": 207 + }, + { + "epoch": 0.11197093062378037, + "grad_norm": 1.4348786359320973, + "learning_rate": 9.82527552434327e-06, + "loss": 0.4682, + "step": 208 + }, + { + "epoch": 0.11250925240562547, + "grad_norm": 1.836643464151516, + "learning_rate": 9.82298266237127e-06, + "loss": 0.475, + "step": 209 + }, + { + "epoch": 0.11304757418747056, + "grad_norm": 1.6780795457698512, + "learning_rate": 9.820675125068105e-06, + "loss": 0.4903, + "step": 210 + }, + { + "epoch": 0.11358589596931566, + "grad_norm": 2.0824594091852124, + "learning_rate": 9.818352919455133e-06, + "loss": 0.5396, + "step": 211 + }, + { + "epoch": 0.11412421775116076, + "grad_norm": 1.7381485522277624, + "learning_rate": 9.816016052598336e-06, + "loss": 0.536, + "step": 212 + }, + { + "epoch": 0.11466253953300586, + "grad_norm": 1.7730039428627105, + "learning_rate": 9.813664531608319e-06, + "loss": 0.5344, + "step": 213 + }, + { + "epoch": 0.11520086131485095, + "grad_norm": 
1.726577182888005, + "learning_rate": 9.811298363640265e-06, + "loss": 0.4686, + "step": 214 + }, + { + "epoch": 0.11573918309669605, + "grad_norm": 1.4284226913661735, + "learning_rate": 9.808917555893934e-06, + "loss": 0.417, + "step": 215 + }, + { + "epoch": 0.11627750487854115, + "grad_norm": 1.8490676859358208, + "learning_rate": 9.806522115613624e-06, + "loss": 0.4734, + "step": 216 + }, + { + "epoch": 0.11681582666038624, + "grad_norm": 1.9252320315263673, + "learning_rate": 9.804112050088164e-06, + "loss": 0.5216, + "step": 217 + }, + { + "epoch": 0.11735414844223134, + "grad_norm": 2.039324491259981, + "learning_rate": 9.801687366650882e-06, + "loss": 0.5209, + "step": 218 + }, + { + "epoch": 0.11789247022407644, + "grad_norm": 2.9773699463269168, + "learning_rate": 9.799248072679581e-06, + "loss": 0.5341, + "step": 219 + }, + { + "epoch": 0.11843079200592153, + "grad_norm": 2.742476530553411, + "learning_rate": 9.796794175596526e-06, + "loss": 0.5013, + "step": 220 + }, + { + "epoch": 0.11896911378776663, + "grad_norm": 1.7756468554357536, + "learning_rate": 9.794325682868413e-06, + "loss": 0.4789, + "step": 221 + }, + { + "epoch": 0.11950743556961174, + "grad_norm": 1.6809704903695406, + "learning_rate": 9.791842602006355e-06, + "loss": 0.4661, + "step": 222 + }, + { + "epoch": 0.12004575735145684, + "grad_norm": 1.5983552620095136, + "learning_rate": 9.789344940565844e-06, + "loss": 0.4525, + "step": 223 + }, + { + "epoch": 0.12058407913330194, + "grad_norm": 1.6785718872740183, + "learning_rate": 9.786832706146745e-06, + "loss": 0.5614, + "step": 224 + }, + { + "epoch": 0.12112240091514703, + "grad_norm": 1.8472396669798028, + "learning_rate": 9.784305906393266e-06, + "loss": 0.5442, + "step": 225 + }, + { + "epoch": 0.12166072269699213, + "grad_norm": 2.233728320756155, + "learning_rate": 9.781764548993932e-06, + "loss": 0.5065, + "step": 226 + }, + { + "epoch": 0.12219904447883723, + "grad_norm": 1.7583669595786098, + "learning_rate": 
9.77920864168156e-06, + "loss": 0.5031, + "step": 227 + }, + { + "epoch": 0.12273736626068232, + "grad_norm": 1.856107901761449, + "learning_rate": 9.77663819223325e-06, + "loss": 0.5218, + "step": 228 + }, + { + "epoch": 0.12327568804252742, + "grad_norm": 1.5999284716572806, + "learning_rate": 9.774053208470338e-06, + "loss": 0.447, + "step": 229 + }, + { + "epoch": 0.12381400982437252, + "grad_norm": 3.170181526472491, + "learning_rate": 9.771453698258392e-06, + "loss": 0.4549, + "step": 230 + }, + { + "epoch": 0.12435233160621761, + "grad_norm": 1.7567006972999655, + "learning_rate": 9.768839669507185e-06, + "loss": 0.5203, + "step": 231 + }, + { + "epoch": 0.12489065338806271, + "grad_norm": 1.6024823185860628, + "learning_rate": 9.766211130170653e-06, + "loss": 0.5035, + "step": 232 + }, + { + "epoch": 0.1254289751699078, + "grad_norm": 1.9234982966827474, + "learning_rate": 9.7635680882469e-06, + "loss": 0.5742, + "step": 233 + }, + { + "epoch": 0.1259672969517529, + "grad_norm": 1.526400617412084, + "learning_rate": 9.760910551778149e-06, + "loss": 0.4953, + "step": 234 + }, + { + "epoch": 0.126505618733598, + "grad_norm": 1.7460568880199783, + "learning_rate": 9.758238528850733e-06, + "loss": 0.4705, + "step": 235 + }, + { + "epoch": 0.1270439405154431, + "grad_norm": 5.681983754980635, + "learning_rate": 9.755552027595055e-06, + "loss": 0.5499, + "step": 236 + }, + { + "epoch": 0.1275822622972882, + "grad_norm": 1.9059517301514561, + "learning_rate": 9.752851056185583e-06, + "loss": 0.5016, + "step": 237 + }, + { + "epoch": 0.1281205840791333, + "grad_norm": 2.032081768465102, + "learning_rate": 9.750135622840811e-06, + "loss": 0.4761, + "step": 238 + }, + { + "epoch": 0.1286589058609784, + "grad_norm": 2.044888486278771, + "learning_rate": 9.747405735823232e-06, + "loss": 0.535, + "step": 239 + }, + { + "epoch": 0.1291972276428235, + "grad_norm": 1.7814262228625417, + "learning_rate": 9.744661403439328e-06, + "loss": 0.5524, + "step": 240 + }, + { + 
"epoch": 0.1291972276428235, + "eval_loss": 0.4923091232776642, + "eval_runtime": 1516.8995, + "eval_samples_per_second": 16.488, + "eval_steps_per_second": 0.516, + "step": 240 + }, + { + "epoch": 0.12973554942466858, + "grad_norm": 3.1298270206538, + "learning_rate": 9.74190263403953e-06, + "loss": 0.4938, + "step": 241 + }, + { + "epoch": 0.13027387120651368, + "grad_norm": 1.4984946811035116, + "learning_rate": 9.739129436018193e-06, + "loss": 0.4417, + "step": 242 + }, + { + "epoch": 0.1308121929883588, + "grad_norm": 1.364613667269671, + "learning_rate": 9.736341817813586e-06, + "loss": 0.4698, + "step": 243 + }, + { + "epoch": 0.1313505147702039, + "grad_norm": 1.4558332152005662, + "learning_rate": 9.733539787907851e-06, + "loss": 0.51, + "step": 244 + }, + { + "epoch": 0.131888836552049, + "grad_norm": 1.605378069117634, + "learning_rate": 9.730723354826978e-06, + "loss": 0.4502, + "step": 245 + }, + { + "epoch": 0.1324271583338941, + "grad_norm": 1.6741314580897366, + "learning_rate": 9.727892527140787e-06, + "loss": 0.4445, + "step": 246 + }, + { + "epoch": 0.1329654801157392, + "grad_norm": 2.306950410094544, + "learning_rate": 9.725047313462897e-06, + "loss": 0.541, + "step": 247 + }, + { + "epoch": 0.1335038018975843, + "grad_norm": 2.110791301537649, + "learning_rate": 9.722187722450699e-06, + "loss": 0.5105, + "step": 248 + }, + { + "epoch": 0.1340421236794294, + "grad_norm": 1.8250944708952, + "learning_rate": 9.719313762805334e-06, + "loss": 0.5233, + "step": 249 + }, + { + "epoch": 0.13458044546127448, + "grad_norm": 1.5279014760068415, + "learning_rate": 9.716425443271663e-06, + "loss": 0.4978, + "step": 250 + }, + { + "epoch": 0.13511876724311958, + "grad_norm": 1.6155139379634116, + "learning_rate": 9.713522772638238e-06, + "loss": 0.489, + "step": 251 + }, + { + "epoch": 0.13565708902496468, + "grad_norm": 1.7541916143762504, + "learning_rate": 9.710605759737281e-06, + "loss": 0.5058, + "step": 252 + }, + { + "epoch": 0.13619541080680977, + 
"grad_norm": 2.0770411769433914, + "learning_rate": 9.707674413444658e-06, + "loss": 0.4765, + "step": 253 + }, + { + "epoch": 0.13673373258865487, + "grad_norm": 2.20017292136363, + "learning_rate": 9.70472874267984e-06, + "loss": 0.5073, + "step": 254 + }, + { + "epoch": 0.13727205437049997, + "grad_norm": 2.5155355882755495, + "learning_rate": 9.701768756405894e-06, + "loss": 0.5271, + "step": 255 + }, + { + "epoch": 0.13781037615234507, + "grad_norm": 1.6203966463313373, + "learning_rate": 9.698794463629438e-06, + "loss": 0.5328, + "step": 256 + }, + { + "epoch": 0.13834869793419016, + "grad_norm": 1.776204296227151, + "learning_rate": 9.695805873400627e-06, + "loss": 0.4975, + "step": 257 + }, + { + "epoch": 0.13888701971603526, + "grad_norm": 1.817996887986963, + "learning_rate": 9.692802994813117e-06, + "loss": 0.5076, + "step": 258 + }, + { + "epoch": 0.13942534149788036, + "grad_norm": 1.5387316388819356, + "learning_rate": 9.68978583700404e-06, + "loss": 0.4783, + "step": 259 + }, + { + "epoch": 0.13996366327972545, + "grad_norm": 1.4525191587799346, + "learning_rate": 9.686754409153984e-06, + "loss": 0.4541, + "step": 260 + }, + { + "epoch": 0.14050198506157055, + "grad_norm": 2.5072786042500286, + "learning_rate": 9.683708720486947e-06, + "loss": 0.4321, + "step": 261 + }, + { + "epoch": 0.14104030684341565, + "grad_norm": 1.928234336171056, + "learning_rate": 9.680648780270327e-06, + "loss": 0.5026, + "step": 262 + }, + { + "epoch": 0.14157862862526074, + "grad_norm": 1.9095002820990152, + "learning_rate": 9.677574597814884e-06, + "loss": 0.5048, + "step": 263 + }, + { + "epoch": 0.14211695040710584, + "grad_norm": 2.7537047870453777, + "learning_rate": 9.674486182474716e-06, + "loss": 0.5202, + "step": 264 + }, + { + "epoch": 0.14265527218895094, + "grad_norm": 1.5411698281683408, + "learning_rate": 9.671383543647225e-06, + "loss": 0.473, + "step": 265 + }, + { + "epoch": 0.14319359397079603, + "grad_norm": 1.6351867542673815, + "learning_rate": 
9.668266690773094e-06, + "loss": 0.4734, + "step": 266 + }, + { + "epoch": 0.14373191575264113, + "grad_norm": 1.8884810300636565, + "learning_rate": 9.66513563333626e-06, + "loss": 0.5014, + "step": 267 + }, + { + "epoch": 0.14427023753448623, + "grad_norm": 1.6743904016832571, + "learning_rate": 9.661990380863876e-06, + "loss": 0.4782, + "step": 268 + }, + { + "epoch": 0.14480855931633133, + "grad_norm": 1.9090758165263444, + "learning_rate": 9.658830942926291e-06, + "loss": 0.5003, + "step": 269 + }, + { + "epoch": 0.14534688109817642, + "grad_norm": 1.4937405913115736, + "learning_rate": 9.655657329137015e-06, + "loss": 0.4432, + "step": 270 + }, + { + "epoch": 0.14588520288002152, + "grad_norm": 1.9026943182309153, + "learning_rate": 9.652469549152695e-06, + "loss": 0.529, + "step": 271 + }, + { + "epoch": 0.14642352466186664, + "grad_norm": 1.8186943886881364, + "learning_rate": 9.649267612673079e-06, + "loss": 0.4737, + "step": 272 + }, + { + "epoch": 0.14696184644371174, + "grad_norm": 1.8259823260308685, + "learning_rate": 9.646051529440993e-06, + "loss": 0.4985, + "step": 273 + }, + { + "epoch": 0.14750016822555684, + "grad_norm": 1.9385932273349529, + "learning_rate": 9.64282130924231e-06, + "loss": 0.4838, + "step": 274 + }, + { + "epoch": 0.14803849000740193, + "grad_norm": 2.04013899262351, + "learning_rate": 9.639576961905915e-06, + "loss": 0.5434, + "step": 275 + }, + { + "epoch": 0.14857681178924703, + "grad_norm": 1.4822512590060632, + "learning_rate": 9.636318497303679e-06, + "loss": 0.5105, + "step": 276 + }, + { + "epoch": 0.14911513357109213, + "grad_norm": 1.580055299090581, + "learning_rate": 9.633045925350436e-06, + "loss": 0.5236, + "step": 277 + }, + { + "epoch": 0.14965345535293723, + "grad_norm": 1.947058506268201, + "learning_rate": 9.629759256003936e-06, + "loss": 0.517, + "step": 278 + }, + { + "epoch": 0.15019177713478232, + "grad_norm": 2.09097300966892, + "learning_rate": 9.626458499264833e-06, + "loss": 0.4795, + "step": 279 + }, 
+ { + "epoch": 0.15073009891662742, + "grad_norm": 1.9281815370039999, + "learning_rate": 9.623143665176636e-06, + "loss": 0.5091, + "step": 280 + }, + { + "epoch": 0.15126842069847252, + "grad_norm": 1.8942765435710498, + "learning_rate": 9.6198147638257e-06, + "loss": 0.486, + "step": 281 + }, + { + "epoch": 0.1518067424803176, + "grad_norm": 1.5680877122601742, + "learning_rate": 9.616471805341175e-06, + "loss": 0.5756, + "step": 282 + }, + { + "epoch": 0.1523450642621627, + "grad_norm": 1.8187589637332664, + "learning_rate": 9.613114799894989e-06, + "loss": 0.4848, + "step": 283 + }, + { + "epoch": 0.1528833860440078, + "grad_norm": 2.845269186548161, + "learning_rate": 9.609743757701806e-06, + "loss": 0.5196, + "step": 284 + }, + { + "epoch": 0.1534217078258529, + "grad_norm": 1.6573799451128552, + "learning_rate": 9.60635868901901e-06, + "loss": 0.5256, + "step": 285 + }, + { + "epoch": 0.153960029607698, + "grad_norm": 1.403409672767778, + "learning_rate": 9.602959604146658e-06, + "loss": 0.4591, + "step": 286 + }, + { + "epoch": 0.1544983513895431, + "grad_norm": 1.5756224710697608, + "learning_rate": 9.599546513427455e-06, + "loss": 0.4499, + "step": 287 + }, + { + "epoch": 0.1550366731713882, + "grad_norm": 1.8561161081867996, + "learning_rate": 9.596119427246727e-06, + "loss": 0.514, + "step": 288 + }, + { + "epoch": 0.1555749949532333, + "grad_norm": 1.6430886050709819, + "learning_rate": 9.592678356032382e-06, + "loss": 0.4916, + "step": 289 + }, + { + "epoch": 0.1561133167350784, + "grad_norm": 1.5608831001537813, + "learning_rate": 9.589223310254881e-06, + "loss": 0.4845, + "step": 290 + }, + { + "epoch": 0.15665163851692349, + "grad_norm": 2.041472319934021, + "learning_rate": 9.58575430042721e-06, + "loss": 0.5105, + "step": 291 + }, + { + "epoch": 0.15718996029876858, + "grad_norm": 1.879252835980779, + "learning_rate": 9.582271337104844e-06, + "loss": 0.5254, + "step": 292 + }, + { + "epoch": 0.15772828208061368, + "grad_norm": 
1.7353738362985391, + "learning_rate": 9.578774430885714e-06, + "loss": 0.545, + "step": 293 + }, + { + "epoch": 0.15826660386245878, + "grad_norm": 1.6167983704567415, + "learning_rate": 9.575263592410176e-06, + "loss": 0.484, + "step": 294 + }, + { + "epoch": 0.15880492564430387, + "grad_norm": 1.6983057165346465, + "learning_rate": 9.571738832360979e-06, + "loss": 0.5001, + "step": 295 + }, + { + "epoch": 0.15934324742614897, + "grad_norm": 2.081190213763369, + "learning_rate": 9.568200161463237e-06, + "loss": 0.4722, + "step": 296 + }, + { + "epoch": 0.15988156920799407, + "grad_norm": 2.246655796617688, + "learning_rate": 9.564647590484384e-06, + "loss": 0.5171, + "step": 297 + }, + { + "epoch": 0.16041989098983916, + "grad_norm": 1.4481263563444773, + "learning_rate": 9.561081130234155e-06, + "loss": 0.471, + "step": 298 + }, + { + "epoch": 0.16095821277168426, + "grad_norm": 1.6254902571476582, + "learning_rate": 9.557500791564545e-06, + "loss": 0.4709, + "step": 299 + }, + { + "epoch": 0.16149653455352936, + "grad_norm": 1.6522030181707457, + "learning_rate": 9.55390658536978e-06, + "loss": 0.4314, + "step": 300 + }, + { + "epoch": 0.16149653455352936, + "eval_loss": 0.48600396513938904, + "eval_runtime": 1525.5556, + "eval_samples_per_second": 16.394, + "eval_steps_per_second": 0.513, + "step": 300 + }, + { + "epoch": 0.16203485633537448, + "grad_norm": 1.6735119675316397, + "learning_rate": 9.550298522586277e-06, + "loss": 0.4981, + "step": 301 + }, + { + "epoch": 0.16257317811721958, + "grad_norm": 1.7492206784400102, + "learning_rate": 9.546676614192623e-06, + "loss": 0.5166, + "step": 302 + }, + { + "epoch": 0.16311149989906468, + "grad_norm": 1.8716369675908593, + "learning_rate": 9.543040871209528e-06, + "loss": 0.4587, + "step": 303 + }, + { + "epoch": 0.16364982168090977, + "grad_norm": 1.5260344735318792, + "learning_rate": 9.5393913046998e-06, + "loss": 0.4637, + "step": 304 + }, + { + "epoch": 0.16418814346275487, + "grad_norm": 
1.9514934425079693, + "learning_rate": 9.535727925768312e-06, + "loss": 0.5018, + "step": 305 + }, + { + "epoch": 0.16472646524459997, + "grad_norm": 1.9239888955973004, + "learning_rate": 9.53205074556196e-06, + "loss": 0.5156, + "step": 306 + }, + { + "epoch": 0.16526478702644506, + "grad_norm": 1.4397611201745624, + "learning_rate": 9.528359775269637e-06, + "loss": 0.4876, + "step": 307 + }, + { + "epoch": 0.16580310880829016, + "grad_norm": 1.6314792528136741, + "learning_rate": 9.524655026122199e-06, + "loss": 0.4466, + "step": 308 + }, + { + "epoch": 0.16634143059013526, + "grad_norm": 1.7046994741333183, + "learning_rate": 9.520936509392425e-06, + "loss": 0.5137, + "step": 309 + }, + { + "epoch": 0.16687975237198036, + "grad_norm": 1.6773498230286716, + "learning_rate": 9.517204236394983e-06, + "loss": 0.4857, + "step": 310 + }, + { + "epoch": 0.16741807415382545, + "grad_norm": 1.9407453364887826, + "learning_rate": 9.513458218486404e-06, + "loss": 0.569, + "step": 311 + }, + { + "epoch": 0.16795639593567055, + "grad_norm": 2.3596815310352355, + "learning_rate": 9.509698467065042e-06, + "loss": 0.4823, + "step": 312 + }, + { + "epoch": 0.16849471771751565, + "grad_norm": 1.491461623274511, + "learning_rate": 9.505924993571037e-06, + "loss": 0.4814, + "step": 313 + }, + { + "epoch": 0.16903303949936074, + "grad_norm": 1.755984194501031, + "learning_rate": 9.502137809486277e-06, + "loss": 0.4953, + "step": 314 + }, + { + "epoch": 0.16957136128120584, + "grad_norm": 1.4330639099631888, + "learning_rate": 9.49833692633438e-06, + "loss": 0.4566, + "step": 315 + }, + { + "epoch": 0.17010968306305094, + "grad_norm": 2.8224430252996413, + "learning_rate": 9.49452235568064e-06, + "loss": 0.5356, + "step": 316 + }, + { + "epoch": 0.17064800484489603, + "grad_norm": 1.6038158256481398, + "learning_rate": 9.490694109131997e-06, + "loss": 0.4667, + "step": 317 + }, + { + "epoch": 0.17118632662674113, + "grad_norm": 1.5264996881581228, + "learning_rate": 
9.486852198337013e-06, + "loss": 0.5066, + "step": 318 + }, + { + "epoch": 0.17172464840858623, + "grad_norm": 2.1960133726792987, + "learning_rate": 9.482996634985818e-06, + "loss": 0.51, + "step": 319 + }, + { + "epoch": 0.17226297019043132, + "grad_norm": 1.8025162435130595, + "learning_rate": 9.479127430810087e-06, + "loss": 0.4542, + "step": 320 + }, + { + "epoch": 0.17280129197227642, + "grad_norm": 1.573351382907097, + "learning_rate": 9.475244597583007e-06, + "loss": 0.4932, + "step": 321 + }, + { + "epoch": 0.17333961375412152, + "grad_norm": 1.8667569419712537, + "learning_rate": 9.471348147119226e-06, + "loss": 0.5095, + "step": 322 + }, + { + "epoch": 0.17387793553596662, + "grad_norm": 1.7668055772396445, + "learning_rate": 9.467438091274831e-06, + "loss": 0.5407, + "step": 323 + }, + { + "epoch": 0.1744162573178117, + "grad_norm": 1.8953472452582216, + "learning_rate": 9.46351444194731e-06, + "loss": 0.5128, + "step": 324 + }, + { + "epoch": 0.1749545790996568, + "grad_norm": 1.4178882398027213, + "learning_rate": 9.459577211075505e-06, + "loss": 0.4783, + "step": 325 + }, + { + "epoch": 0.1754929008815019, + "grad_norm": 2.0556054399757833, + "learning_rate": 9.455626410639595e-06, + "loss": 0.4883, + "step": 326 + }, + { + "epoch": 0.176031222663347, + "grad_norm": 1.7326020245251583, + "learning_rate": 9.451662052661042e-06, + "loss": 0.5118, + "step": 327 + }, + { + "epoch": 0.1765695444451921, + "grad_norm": 4.171939008569256, + "learning_rate": 9.447684149202555e-06, + "loss": 0.5034, + "step": 328 + }, + { + "epoch": 0.17710786622703723, + "grad_norm": 1.4094510294695572, + "learning_rate": 9.44369271236807e-06, + "loss": 0.485, + "step": 329 + }, + { + "epoch": 0.17764618800888232, + "grad_norm": 1.7412556596004685, + "learning_rate": 9.4396877543027e-06, + "loss": 0.5202, + "step": 330 + }, + { + "epoch": 0.17818450979072742, + "grad_norm": 2.605859372043168, + "learning_rate": 9.435669287192691e-06, + "loss": 0.4685, + "step": 331 + }, + { + 
"epoch": 0.17872283157257252, + "grad_norm": 1.751047130574041, + "learning_rate": 9.431637323265406e-06, + "loss": 0.5435, + "step": 332 + }, + { + "epoch": 0.1792611533544176, + "grad_norm": 1.6979113314955865, + "learning_rate": 9.42759187478927e-06, + "loss": 0.5082, + "step": 333 + }, + { + "epoch": 0.1797994751362627, + "grad_norm": 1.655193667961951, + "learning_rate": 9.423532954073737e-06, + "loss": 0.52, + "step": 334 + }, + { + "epoch": 0.1803377969181078, + "grad_norm": 1.715183078111553, + "learning_rate": 9.419460573469262e-06, + "loss": 0.4876, + "step": 335 + }, + { + "epoch": 0.1808761186999529, + "grad_norm": 1.755206515543788, + "learning_rate": 9.415374745367245e-06, + "loss": 0.4826, + "step": 336 + }, + { + "epoch": 0.181414440481798, + "grad_norm": 1.530238277234238, + "learning_rate": 9.411275482200015e-06, + "loss": 0.5227, + "step": 337 + }, + { + "epoch": 0.1819527622636431, + "grad_norm": 1.4873212835334444, + "learning_rate": 9.40716279644077e-06, + "loss": 0.4784, + "step": 338 + }, + { + "epoch": 0.1824910840454882, + "grad_norm": 1.4713841358562554, + "learning_rate": 9.403036700603561e-06, + "loss": 0.4872, + "step": 339 + }, + { + "epoch": 0.1830294058273333, + "grad_norm": 1.5551919063027968, + "learning_rate": 9.398897207243232e-06, + "loss": 0.4817, + "step": 340 + }, + { + "epoch": 0.1835677276091784, + "grad_norm": 1.8717050820441055, + "learning_rate": 9.394744328955403e-06, + "loss": 0.5002, + "step": 341 + }, + { + "epoch": 0.18410604939102349, + "grad_norm": 1.9843100820794195, + "learning_rate": 9.390578078376417e-06, + "loss": 0.4799, + "step": 342 + }, + { + "epoch": 0.18464437117286858, + "grad_norm": 2.156998251608843, + "learning_rate": 9.386398468183304e-06, + "loss": 0.4469, + "step": 343 + }, + { + "epoch": 0.18518269295471368, + "grad_norm": 1.7123477834586953, + "learning_rate": 9.38220551109375e-06, + "loss": 0.5312, + "step": 344 + }, + { + "epoch": 0.18572101473655878, + "grad_norm": 1.862901860663747, + 
"learning_rate": 9.377999219866046e-06, + "loss": 0.5146, + "step": 345 + }, + { + "epoch": 0.18625933651840387, + "grad_norm": 1.8400145206055536, + "learning_rate": 9.373779607299061e-06, + "loss": 0.498, + "step": 346 + }, + { + "epoch": 0.18679765830024897, + "grad_norm": 1.4419967374301528, + "learning_rate": 9.369546686232199e-06, + "loss": 0.491, + "step": 347 + }, + { + "epoch": 0.18733598008209407, + "grad_norm": 1.6800971553110484, + "learning_rate": 9.365300469545352e-06, + "loss": 0.453, + "step": 348 + }, + { + "epoch": 0.18787430186393916, + "grad_norm": 1.4414646625492236, + "learning_rate": 9.361040970158876e-06, + "loss": 0.4844, + "step": 349 + }, + { + "epoch": 0.18841262364578426, + "grad_norm": 1.4693828151901231, + "learning_rate": 9.356768201033542e-06, + "loss": 0.4846, + "step": 350 + }, + { + "epoch": 0.18895094542762936, + "grad_norm": 1.6213301090422854, + "learning_rate": 9.35248217517049e-06, + "loss": 0.4528, + "step": 351 + }, + { + "epoch": 0.18948926720947445, + "grad_norm": 1.3998204036117714, + "learning_rate": 9.348182905611209e-06, + "loss": 0.4677, + "step": 352 + }, + { + "epoch": 0.19002758899131955, + "grad_norm": 1.4713366703366633, + "learning_rate": 9.343870405437477e-06, + "loss": 0.4292, + "step": 353 + }, + { + "epoch": 0.19056591077316465, + "grad_norm": 1.941068700941172, + "learning_rate": 9.339544687771334e-06, + "loss": 0.5102, + "step": 354 + }, + { + "epoch": 0.19110423255500975, + "grad_norm": 1.828849112653357, + "learning_rate": 9.335205765775039e-06, + "loss": 0.4638, + "step": 355 + }, + { + "epoch": 0.19164255433685484, + "grad_norm": 1.6885129161638754, + "learning_rate": 9.330853652651026e-06, + "loss": 0.4391, + "step": 356 + }, + { + "epoch": 0.19218087611869994, + "grad_norm": 1.7268115477491062, + "learning_rate": 9.326488361641867e-06, + "loss": 0.4557, + "step": 357 + }, + { + "epoch": 0.19271919790054506, + "grad_norm": 1.369390489248521, + "learning_rate": 9.322109906030237e-06, + "loss": 
0.4451, + "step": 358 + }, + { + "epoch": 0.19325751968239016, + "grad_norm": 1.653269795096283, + "learning_rate": 9.31771829913886e-06, + "loss": 0.4466, + "step": 359 + }, + { + "epoch": 0.19379584146423526, + "grad_norm": 1.6015504141518857, + "learning_rate": 9.313313554330484e-06, + "loss": 0.4977, + "step": 360 + }, + { + "epoch": 0.19379584146423526, + "eval_loss": 0.4812440574169159, + "eval_runtime": 1528.9254, + "eval_samples_per_second": 16.358, + "eval_steps_per_second": 0.511, + "step": 360 + }, + { + "epoch": 0.19433416324608035, + "grad_norm": 1.6899547102686612, + "learning_rate": 9.308895685007824e-06, + "loss": 0.5404, + "step": 361 + }, + { + "epoch": 0.19487248502792545, + "grad_norm": 1.8153441873291498, + "learning_rate": 9.304464704613541e-06, + "loss": 0.5128, + "step": 362 + }, + { + "epoch": 0.19541080680977055, + "grad_norm": 1.6094259149494354, + "learning_rate": 9.300020626630184e-06, + "loss": 0.4854, + "step": 363 + }, + { + "epoch": 0.19594912859161565, + "grad_norm": 1.726004590201776, + "learning_rate": 9.295563464580153e-06, + "loss": 0.4827, + "step": 364 + }, + { + "epoch": 0.19648745037346074, + "grad_norm": 1.7917006550897865, + "learning_rate": 9.29109323202567e-06, + "loss": 0.4689, + "step": 365 + }, + { + "epoch": 0.19702577215530584, + "grad_norm": 2.067420755566304, + "learning_rate": 9.286609942568712e-06, + "loss": 0.4411, + "step": 366 + }, + { + "epoch": 0.19756409393715094, + "grad_norm": 1.9439738397276571, + "learning_rate": 9.282113609851002e-06, + "loss": 0.4748, + "step": 367 + }, + { + "epoch": 0.19810241571899603, + "grad_norm": 1.6206588657538272, + "learning_rate": 9.277604247553939e-06, + "loss": 0.5215, + "step": 368 + }, + { + "epoch": 0.19864073750084113, + "grad_norm": 2.0968303117516136, + "learning_rate": 9.273081869398577e-06, + "loss": 0.4466, + "step": 369 + }, + { + "epoch": 0.19917905928268623, + "grad_norm": 1.5483077144548956, + "learning_rate": 9.268546489145566e-06, + "loss": 0.5042, + 
"step": 370 + }, + { + "epoch": 0.19971738106453132, + "grad_norm": 1.6430391903483688, + "learning_rate": 9.263998120595124e-06, + "loss": 0.4798, + "step": 371 + }, + { + "epoch": 0.20025570284637642, + "grad_norm": 1.451263876582638, + "learning_rate": 9.259436777586991e-06, + "loss": 0.4498, + "step": 372 + }, + { + "epoch": 0.20079402462822152, + "grad_norm": 1.924895097651951, + "learning_rate": 9.25486247400038e-06, + "loss": 0.4971, + "step": 373 + }, + { + "epoch": 0.20133234641006661, + "grad_norm": 1.5044716731151997, + "learning_rate": 9.250275223753948e-06, + "loss": 0.4761, + "step": 374 + }, + { + "epoch": 0.2018706681919117, + "grad_norm": 1.8105401635317677, + "learning_rate": 9.245675040805738e-06, + "loss": 0.4645, + "step": 375 + }, + { + "epoch": 0.2024089899737568, + "grad_norm": 1.4400001043179194, + "learning_rate": 9.241061939153146e-06, + "loss": 0.5052, + "step": 376 + }, + { + "epoch": 0.2029473117556019, + "grad_norm": 2.1898160128283046, + "learning_rate": 9.236435932832883e-06, + "loss": 0.4571, + "step": 377 + }, + { + "epoch": 0.203485633537447, + "grad_norm": 1.728102995146478, + "learning_rate": 9.231797035920921e-06, + "loss": 0.459, + "step": 378 + }, + { + "epoch": 0.2040239553192921, + "grad_norm": 1.5484346370702677, + "learning_rate": 9.227145262532458e-06, + "loss": 0.5106, + "step": 379 + }, + { + "epoch": 0.2045622771011372, + "grad_norm": 1.5623742217769747, + "learning_rate": 9.222480626821868e-06, + "loss": 0.444, + "step": 380 + }, + { + "epoch": 0.2051005988829823, + "grad_norm": 1.7091436440987169, + "learning_rate": 9.217803142982668e-06, + "loss": 0.4732, + "step": 381 + }, + { + "epoch": 0.2056389206648274, + "grad_norm": 1.4196906974845203, + "learning_rate": 9.213112825247466e-06, + "loss": 0.4779, + "step": 382 + }, + { + "epoch": 0.2061772424466725, + "grad_norm": 1.5167704426292719, + "learning_rate": 9.20840968788792e-06, + "loss": 0.4967, + "step": 383 + }, + { + "epoch": 0.20671556422851758, + 
"grad_norm": 1.4170871947038493, + "learning_rate": 9.203693745214698e-06, + "loss": 0.491, + "step": 384 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 1.5152939794668674, + "learning_rate": 9.19896501157743e-06, + "loss": 0.4541, + "step": 385 + }, + { + "epoch": 0.2077922077922078, + "grad_norm": 1.9536536833455793, + "learning_rate": 9.19422350136467e-06, + "loss": 0.4799, + "step": 386 + }, + { + "epoch": 0.2083305295740529, + "grad_norm": 2.316326510948496, + "learning_rate": 9.18946922900384e-06, + "loss": 0.4658, + "step": 387 + }, + { + "epoch": 0.208868851355898, + "grad_norm": 1.2922243986398827, + "learning_rate": 9.184702208961204e-06, + "loss": 0.4057, + "step": 388 + }, + { + "epoch": 0.2094071731377431, + "grad_norm": 1.8303479595554093, + "learning_rate": 9.179922455741812e-06, + "loss": 0.4427, + "step": 389 + }, + { + "epoch": 0.2099454949195882, + "grad_norm": 1.541720900007236, + "learning_rate": 9.175129983889452e-06, + "loss": 0.516, + "step": 390 + }, + { + "epoch": 0.2104838167014333, + "grad_norm": 1.9307101459341938, + "learning_rate": 9.17032480798662e-06, + "loss": 0.4349, + "step": 391 + }, + { + "epoch": 0.2110221384832784, + "grad_norm": 1.3922182421272982, + "learning_rate": 9.165506942654468e-06, + "loss": 0.4816, + "step": 392 + }, + { + "epoch": 0.21156046026512348, + "grad_norm": 1.6974151932118977, + "learning_rate": 9.16067640255275e-06, + "loss": 0.4812, + "step": 393 + }, + { + "epoch": 0.21209878204696858, + "grad_norm": 1.4726854167474133, + "learning_rate": 9.155833202379798e-06, + "loss": 0.4717, + "step": 394 + }, + { + "epoch": 0.21263710382881368, + "grad_norm": 1.8790922445419658, + "learning_rate": 9.150977356872456e-06, + "loss": 0.4885, + "step": 395 + }, + { + "epoch": 0.21317542561065878, + "grad_norm": 1.9084443087840661, + "learning_rate": 9.146108880806056e-06, + "loss": 0.4633, + "step": 396 + }, + { + "epoch": 0.21371374739250387, + "grad_norm": 1.6996601490386696, + "learning_rate": 
9.141227788994348e-06, + "loss": 0.4453, + "step": 397 + }, + { + "epoch": 0.21425206917434897, + "grad_norm": 1.7127514086857762, + "learning_rate": 9.136334096289485e-06, + "loss": 0.5144, + "step": 398 + }, + { + "epoch": 0.21479039095619407, + "grad_norm": 1.4183339048304517, + "learning_rate": 9.131427817581953e-06, + "loss": 0.476, + "step": 399 + }, + { + "epoch": 0.21532871273803916, + "grad_norm": 1.5688801517253075, + "learning_rate": 9.12650896780053e-06, + "loss": 0.4657, + "step": 400 + }, + { + "epoch": 0.21586703451988426, + "grad_norm": 1.391080609496865, + "learning_rate": 9.121577561912256e-06, + "loss": 0.5043, + "step": 401 + }, + { + "epoch": 0.21640535630172936, + "grad_norm": 3.302547702490585, + "learning_rate": 9.11663361492237e-06, + "loss": 0.497, + "step": 402 + }, + { + "epoch": 0.21694367808357445, + "grad_norm": 1.7874988296563226, + "learning_rate": 9.111677141874273e-06, + "loss": 0.4465, + "step": 403 + }, + { + "epoch": 0.21748199986541955, + "grad_norm": 1.830004021479594, + "learning_rate": 9.106708157849478e-06, + "loss": 0.5088, + "step": 404 + }, + { + "epoch": 0.21802032164726465, + "grad_norm": 2.4236747379642267, + "learning_rate": 9.101726677967569e-06, + "loss": 0.4922, + "step": 405 + }, + { + "epoch": 0.21855864342910974, + "grad_norm": 1.5488577176317244, + "learning_rate": 9.096732717386152e-06, + "loss": 0.497, + "step": 406 + }, + { + "epoch": 0.21909696521095484, + "grad_norm": 2.3263014189367306, + "learning_rate": 9.091726291300806e-06, + "loss": 0.4791, + "step": 407 + }, + { + "epoch": 0.21963528699279994, + "grad_norm": 1.7243223143837634, + "learning_rate": 9.086707414945044e-06, + "loss": 0.5192, + "step": 408 + }, + { + "epoch": 0.22017360877464504, + "grad_norm": 1.3667216442420331, + "learning_rate": 9.08167610359026e-06, + "loss": 0.4816, + "step": 409 + }, + { + "epoch": 0.22071193055649013, + "grad_norm": 1.4675898960533509, + "learning_rate": 9.076632372545688e-06, + "loss": 0.4694, + "step": 410 + 
}, + { + "epoch": 0.22125025233833523, + "grad_norm": 1.725309532729321, + "learning_rate": 9.071576237158348e-06, + "loss": 0.5097, + "step": 411 + }, + { + "epoch": 0.22178857412018033, + "grad_norm": 1.48659542538949, + "learning_rate": 9.066507712813009e-06, + "loss": 0.445, + "step": 412 + }, + { + "epoch": 0.22232689590202542, + "grad_norm": 1.6287270540094485, + "learning_rate": 9.06142681493213e-06, + "loss": 0.4948, + "step": 413 + }, + { + "epoch": 0.22286521768387052, + "grad_norm": 1.5275233090165254, + "learning_rate": 9.056333558975828e-06, + "loss": 0.4556, + "step": 414 + }, + { + "epoch": 0.22340353946571564, + "grad_norm": 1.6620168630066545, + "learning_rate": 9.051227960441819e-06, + "loss": 0.4652, + "step": 415 + }, + { + "epoch": 0.22394186124756074, + "grad_norm": 2.059601149156459, + "learning_rate": 9.046110034865374e-06, + "loss": 0.5085, + "step": 416 + }, + { + "epoch": 0.22448018302940584, + "grad_norm": 1.762324556385875, + "learning_rate": 9.040979797819275e-06, + "loss": 0.4461, + "step": 417 + }, + { + "epoch": 0.22501850481125094, + "grad_norm": 1.7567357923246754, + "learning_rate": 9.035837264913764e-06, + "loss": 0.4732, + "step": 418 + }, + { + "epoch": 0.22555682659309603, + "grad_norm": 1.6696886078675257, + "learning_rate": 9.030682451796497e-06, + "loss": 0.4642, + "step": 419 + }, + { + "epoch": 0.22609514837494113, + "grad_norm": 1.8175306322549967, + "learning_rate": 9.025515374152498e-06, + "loss": 0.4613, + "step": 420 + }, + { + "epoch": 0.22609514837494113, + "eval_loss": 0.4776149392127991, + "eval_runtime": 1533.2316, + "eval_samples_per_second": 16.312, + "eval_steps_per_second": 0.51, + "step": 420 + }, + { + "epoch": 0.22663347015678623, + "grad_norm": 1.7934239843519915, + "learning_rate": 9.020336047704105e-06, + "loss": 0.516, + "step": 421 + }, + { + "epoch": 0.22717179193863132, + "grad_norm": 1.5310720805604554, + "learning_rate": 9.015144488210927e-06, + "loss": 0.489, + "step": 422 + }, + { + "epoch": 
0.22771011372047642, + "grad_norm": 1.48774951332565, + "learning_rate": 9.009940711469804e-06, + "loss": 0.5009, + "step": 423 + }, + { + "epoch": 0.22824843550232152, + "grad_norm": 2.4756529462562145, + "learning_rate": 9.004724733314738e-06, + "loss": 0.4406, + "step": 424 + }, + { + "epoch": 0.22878675728416661, + "grad_norm": 1.4505668733407078, + "learning_rate": 8.999496569616867e-06, + "loss": 0.4554, + "step": 425 + }, + { + "epoch": 0.2293250790660117, + "grad_norm": 1.7945762191089136, + "learning_rate": 8.994256236284402e-06, + "loss": 0.4632, + "step": 426 + }, + { + "epoch": 0.2298634008478568, + "grad_norm": 1.6376843185311614, + "learning_rate": 8.989003749262587e-06, + "loss": 0.4885, + "step": 427 + }, + { + "epoch": 0.2304017226297019, + "grad_norm": 1.8830741232863908, + "learning_rate": 8.983739124533644e-06, + "loss": 0.5075, + "step": 428 + }, + { + "epoch": 0.230940044411547, + "grad_norm": 1.3195150579928587, + "learning_rate": 8.978462378116729e-06, + "loss": 0.4708, + "step": 429 + }, + { + "epoch": 0.2314783661933921, + "grad_norm": 3.7495214134368977, + "learning_rate": 8.973173526067883e-06, + "loss": 0.4286, + "step": 430 + }, + { + "epoch": 0.2320166879752372, + "grad_norm": 2.359888838059791, + "learning_rate": 8.967872584479977e-06, + "loss": 0.5009, + "step": 431 + }, + { + "epoch": 0.2325550097570823, + "grad_norm": 2.307039087438763, + "learning_rate": 8.962559569482677e-06, + "loss": 0.5676, + "step": 432 + }, + { + "epoch": 0.2330933315389274, + "grad_norm": 1.6816015759212095, + "learning_rate": 8.957234497242378e-06, + "loss": 0.4741, + "step": 433 + }, + { + "epoch": 0.2336316533207725, + "grad_norm": 1.322921614998224, + "learning_rate": 8.951897383962163e-06, + "loss": 0.4688, + "step": 434 + }, + { + "epoch": 0.23416997510261758, + "grad_norm": 1.4430047272258668, + "learning_rate": 8.946548245881758e-06, + "loss": 0.4711, + "step": 435 + }, + { + "epoch": 0.23470829688446268, + "grad_norm": 1.5731159349637571, + 
"learning_rate": 8.941187099277475e-06, + "loss": 0.5128, + "step": 436 + }, + { + "epoch": 0.23524661866630778, + "grad_norm": 1.7731819377906834, + "learning_rate": 8.935813960462166e-06, + "loss": 0.4669, + "step": 437 + }, + { + "epoch": 0.23578494044815287, + "grad_norm": 1.5736170200351274, + "learning_rate": 8.930428845785171e-06, + "loss": 0.5151, + "step": 438 + }, + { + "epoch": 0.23632326222999797, + "grad_norm": 1.9488876650276103, + "learning_rate": 8.925031771632273e-06, + "loss": 0.449, + "step": 439 + }, + { + "epoch": 0.23686158401184307, + "grad_norm": 1.8677275264654012, + "learning_rate": 8.919622754425645e-06, + "loss": 0.4758, + "step": 440 + }, + { + "epoch": 0.23739990579368817, + "grad_norm": 1.6185523790901868, + "learning_rate": 8.914201810623796e-06, + "loss": 0.4539, + "step": 441 + }, + { + "epoch": 0.23793822757553326, + "grad_norm": 1.7808483857096469, + "learning_rate": 8.908768956721535e-06, + "loss": 0.5022, + "step": 442 + }, + { + "epoch": 0.2384765493573784, + "grad_norm": 1.5766134824810658, + "learning_rate": 8.903324209249895e-06, + "loss": 0.448, + "step": 443 + }, + { + "epoch": 0.23901487113922348, + "grad_norm": 1.734675342226781, + "learning_rate": 8.897867584776114e-06, + "loss": 0.4646, + "step": 444 + }, + { + "epoch": 0.23955319292106858, + "grad_norm": 1.5790149541067802, + "learning_rate": 8.892399099903564e-06, + "loss": 0.4786, + "step": 445 + }, + { + "epoch": 0.24009151470291368, + "grad_norm": 1.4746994503206987, + "learning_rate": 8.8869187712717e-06, + "loss": 0.5055, + "step": 446 + }, + { + "epoch": 0.24062983648475877, + "grad_norm": 1.629202002564735, + "learning_rate": 8.881426615556023e-06, + "loss": 0.4572, + "step": 447 + }, + { + "epoch": 0.24116815826660387, + "grad_norm": 2.060742412650639, + "learning_rate": 8.875922649468019e-06, + "loss": 0.5032, + "step": 448 + }, + { + "epoch": 0.24170648004844897, + "grad_norm": 1.5621749237333817, + "learning_rate": 8.87040688975511e-06, + "loss": 0.4654, 
+ "step": 449 + }, + { + "epoch": 0.24224480183029407, + "grad_norm": 1.4674899116105513, + "learning_rate": 8.864879353200599e-06, + "loss": 0.4747, + "step": 450 + }, + { + "epoch": 0.24278312361213916, + "grad_norm": 1.5183875651941505, + "learning_rate": 8.859340056623632e-06, + "loss": 0.4982, + "step": 451 + }, + { + "epoch": 0.24332144539398426, + "grad_norm": 1.5706370531453442, + "learning_rate": 8.853789016879134e-06, + "loss": 0.4667, + "step": 452 + }, + { + "epoch": 0.24385976717582936, + "grad_norm": 1.6305623278282155, + "learning_rate": 8.84822625085776e-06, + "loss": 0.456, + "step": 453 + }, + { + "epoch": 0.24439808895767445, + "grad_norm": 1.6523301690172285, + "learning_rate": 8.842651775485848e-06, + "loss": 0.5383, + "step": 454 + }, + { + "epoch": 0.24493641073951955, + "grad_norm": 1.5998220743266833, + "learning_rate": 8.837065607725368e-06, + "loss": 0.4829, + "step": 455 + }, + { + "epoch": 0.24547473252136465, + "grad_norm": 1.7862569885991761, + "learning_rate": 8.831467764573863e-06, + "loss": 0.5101, + "step": 456 + }, + { + "epoch": 0.24601305430320974, + "grad_norm": 1.704691179868801, + "learning_rate": 8.8258582630644e-06, + "loss": 0.4627, + "step": 457 + }, + { + "epoch": 0.24655137608505484, + "grad_norm": 1.7756811764982563, + "learning_rate": 8.820237120265526e-06, + "loss": 0.5079, + "step": 458 + }, + { + "epoch": 0.24708969786689994, + "grad_norm": 1.3696742776597963, + "learning_rate": 8.814604353281206e-06, + "loss": 0.4393, + "step": 459 + }, + { + "epoch": 0.24762801964874503, + "grad_norm": 2.7637461827933083, + "learning_rate": 8.80895997925078e-06, + "loss": 0.4548, + "step": 460 + }, + { + "epoch": 0.24816634143059013, + "grad_norm": 1.9115795242982947, + "learning_rate": 8.803304015348894e-06, + "loss": 0.4805, + "step": 461 + }, + { + "epoch": 0.24870466321243523, + "grad_norm": 1.6805506691737162, + "learning_rate": 8.797636478785475e-06, + "loss": 0.4786, + "step": 462 + }, + { + "epoch": 0.24924298499428033, 
+ "grad_norm": 1.865661091263274, + "learning_rate": 8.791957386805651e-06, + "loss": 0.4722, + "step": 463 + }, + { + "epoch": 0.24978130677612542, + "grad_norm": 1.9405317358586787, + "learning_rate": 8.78626675668972e-06, + "loss": 0.4705, + "step": 464 + }, + { + "epoch": 0.2503196285579705, + "grad_norm": 1.4415009315383829, + "learning_rate": 8.78056460575308e-06, + "loss": 0.4301, + "step": 465 + }, + { + "epoch": 0.2508579503398156, + "grad_norm": 1.6060330602526178, + "learning_rate": 8.774850951346188e-06, + "loss": 0.4114, + "step": 466 + }, + { + "epoch": 0.2513962721216607, + "grad_norm": 1.7567677906852937, + "learning_rate": 8.769125810854504e-06, + "loss": 0.4922, + "step": 467 + }, + { + "epoch": 0.2519345939035058, + "grad_norm": 1.4281502602519498, + "learning_rate": 8.763389201698438e-06, + "loss": 0.4426, + "step": 468 + }, + { + "epoch": 0.2524729156853509, + "grad_norm": 1.787920776798679, + "learning_rate": 8.757641141333296e-06, + "loss": 0.4451, + "step": 469 + }, + { + "epoch": 0.253011237467196, + "grad_norm": 1.4246034781799948, + "learning_rate": 8.751881647249228e-06, + "loss": 0.4353, + "step": 470 + }, + { + "epoch": 0.2535495592490411, + "grad_norm": 1.6679185342871934, + "learning_rate": 8.746110736971175e-06, + "loss": 0.4573, + "step": 471 + }, + { + "epoch": 0.2540878810308862, + "grad_norm": 1.6765594656197593, + "learning_rate": 8.740328428058813e-06, + "loss": 0.4797, + "step": 472 + }, + { + "epoch": 0.2546262028127313, + "grad_norm": 1.7826390062476167, + "learning_rate": 8.734534738106503e-06, + "loss": 0.473, + "step": 473 + }, + { + "epoch": 0.2551645245945764, + "grad_norm": 2.195730177211015, + "learning_rate": 8.728729684743238e-06, + "loss": 0.4648, + "step": 474 + }, + { + "epoch": 0.2557028463764215, + "grad_norm": 1.475566632306908, + "learning_rate": 8.722913285632584e-06, + "loss": 0.4845, + "step": 475 + }, + { + "epoch": 0.2562411681582666, + "grad_norm": 1.7347583810505152, + "learning_rate": 
8.717085558472631e-06, + "loss": 0.4708, + "step": 476 + }, + { + "epoch": 0.2567794899401117, + "grad_norm": 1.6902146229456119, + "learning_rate": 8.71124652099594e-06, + "loss": 0.4817, + "step": 477 + }, + { + "epoch": 0.2573178117219568, + "grad_norm": 1.7071042054828858, + "learning_rate": 8.705396190969484e-06, + "loss": 0.4712, + "step": 478 + }, + { + "epoch": 0.2578561335038019, + "grad_norm": 1.729348975756144, + "learning_rate": 8.699534586194598e-06, + "loss": 0.4881, + "step": 479 + }, + { + "epoch": 0.258394455285647, + "grad_norm": 1.4614872127177663, + "learning_rate": 8.693661724506924e-06, + "loss": 0.457, + "step": 480 + }, + { + "epoch": 0.258394455285647, + "eval_loss": 0.4751787483692169, + "eval_runtime": 1539.7899, + "eval_samples_per_second": 16.242, + "eval_steps_per_second": 0.508, + "step": 480 + }, + { + "epoch": 0.25893277706749207, + "grad_norm": 2.1154756500873977, + "learning_rate": 8.687777623776357e-06, + "loss": 0.4842, + "step": 481 + }, + { + "epoch": 0.25947109884933717, + "grad_norm": 1.5862460419373354, + "learning_rate": 8.681882301906988e-06, + "loss": 0.4432, + "step": 482 + }, + { + "epoch": 0.26000942063118226, + "grad_norm": 1.796404843665338, + "learning_rate": 8.675975776837053e-06, + "loss": 0.4759, + "step": 483 + }, + { + "epoch": 0.26054774241302736, + "grad_norm": 1.5555927859924092, + "learning_rate": 8.67005806653888e-06, + "loss": 0.509, + "step": 484 + }, + { + "epoch": 0.26108606419487246, + "grad_norm": 2.1699720622194354, + "learning_rate": 8.664129189018826e-06, + "loss": 0.5334, + "step": 485 + }, + { + "epoch": 0.2616243859767176, + "grad_norm": 1.690073634180223, + "learning_rate": 8.658189162317226e-06, + "loss": 0.4356, + "step": 486 + }, + { + "epoch": 0.2621627077585627, + "grad_norm": 1.8294975401345657, + "learning_rate": 8.65223800450835e-06, + "loss": 0.4387, + "step": 487 + }, + { + "epoch": 0.2627010295404078, + "grad_norm": 2.5288130694594337, + "learning_rate": 8.646275733700327e-06, + 
"loss": 0.4567, + "step": 488 + }, + { + "epoch": 0.2632393513222529, + "grad_norm": 1.957861459161194, + "learning_rate": 8.640302368035105e-06, + "loss": 0.4614, + "step": 489 + }, + { + "epoch": 0.263777673104098, + "grad_norm": 1.5304950580333017, + "learning_rate": 8.634317925688392e-06, + "loss": 0.4655, + "step": 490 + }, + { + "epoch": 0.2643159948859431, + "grad_norm": 1.667011172421826, + "learning_rate": 8.628322424869599e-06, + "loss": 0.4834, + "step": 491 + }, + { + "epoch": 0.2648543166677882, + "grad_norm": 2.1636641173694464, + "learning_rate": 8.622315883821783e-06, + "loss": 0.4776, + "step": 492 + }, + { + "epoch": 0.2653926384496333, + "grad_norm": 1.46798046973594, + "learning_rate": 8.616298320821601e-06, + "loss": 0.4272, + "step": 493 + }, + { + "epoch": 0.2659309602314784, + "grad_norm": 1.861178177564276, + "learning_rate": 8.61026975417924e-06, + "loss": 0.4784, + "step": 494 + }, + { + "epoch": 0.2664692820133235, + "grad_norm": 1.6268110739530368, + "learning_rate": 8.604230202238373e-06, + "loss": 0.5029, + "step": 495 + }, + { + "epoch": 0.2670076037951686, + "grad_norm": 1.5680263307618678, + "learning_rate": 8.598179683376098e-06, + "loss": 0.4225, + "step": 496 + }, + { + "epoch": 0.2675459255770137, + "grad_norm": 1.5774347517397593, + "learning_rate": 8.592118216002883e-06, + "loss": 0.4879, + "step": 497 + }, + { + "epoch": 0.2680842473588588, + "grad_norm": 2.670832440569625, + "learning_rate": 8.586045818562508e-06, + "loss": 0.4667, + "step": 498 + }, + { + "epoch": 0.26862256914070387, + "grad_norm": 2.2055704035459787, + "learning_rate": 8.579962509532016e-06, + "loss": 0.4331, + "step": 499 + }, + { + "epoch": 0.26916089092254897, + "grad_norm": 1.4435727148058994, + "learning_rate": 8.573868307421648e-06, + "loss": 0.4894, + "step": 500 + }, + { + "epoch": 0.26969921270439406, + "grad_norm": 1.6814136996880347, + "learning_rate": 8.567763230774789e-06, + "loss": 0.4697, + "step": 501 + }, + { + "epoch": 
0.27023753448623916, + "grad_norm": 1.5774141123551826, + "learning_rate": 8.561647298167918e-06, + "loss": 0.503, + "step": 502 + }, + { + "epoch": 0.27077585626808426, + "grad_norm": 1.5778826165083357, + "learning_rate": 8.555520528210541e-06, + "loss": 0.4535, + "step": 503 + }, + { + "epoch": 0.27131417804992936, + "grad_norm": 1.7129721491097367, + "learning_rate": 8.549382939545143e-06, + "loss": 0.4494, + "step": 504 + }, + { + "epoch": 0.27185249983177445, + "grad_norm": 1.8943346844828264, + "learning_rate": 8.543234550847128e-06, + "loss": 0.5063, + "step": 505 + }, + { + "epoch": 0.27239082161361955, + "grad_norm": 1.5886936361058726, + "learning_rate": 8.537075380824761e-06, + "loss": 0.4652, + "step": 506 + }, + { + "epoch": 0.27292914339546465, + "grad_norm": 1.4831172032030655, + "learning_rate": 8.530905448219112e-06, + "loss": 0.4243, + "step": 507 + }, + { + "epoch": 0.27346746517730974, + "grad_norm": 1.7919686995453996, + "learning_rate": 8.524724771804001e-06, + "loss": 0.5049, + "step": 508 + }, + { + "epoch": 0.27400578695915484, + "grad_norm": 1.7505822684442558, + "learning_rate": 8.518533370385939e-06, + "loss": 0.4423, + "step": 509 + }, + { + "epoch": 0.27454410874099994, + "grad_norm": 1.5798026347891434, + "learning_rate": 8.512331262804069e-06, + "loss": 0.4866, + "step": 510 + }, + { + "epoch": 0.27508243052284503, + "grad_norm": 1.8464155171834333, + "learning_rate": 8.506118467930112e-06, + "loss": 0.4708, + "step": 511 + }, + { + "epoch": 0.27562075230469013, + "grad_norm": 1.6897436623195476, + "learning_rate": 8.499895004668308e-06, + "loss": 0.4903, + "step": 512 + }, + { + "epoch": 0.27615907408653523, + "grad_norm": 1.7863457448170967, + "learning_rate": 8.49366089195536e-06, + "loss": 0.5092, + "step": 513 + }, + { + "epoch": 0.2766973958683803, + "grad_norm": 1.7320740104134424, + "learning_rate": 8.487416148760375e-06, + "loss": 0.48, + "step": 514 + }, + { + "epoch": 0.2772357176502254, + "grad_norm": 1.7064456081649735, 
+ "learning_rate": 8.481160794084799e-06, + "loss": 0.4754, + "step": 515 + }, + { + "epoch": 0.2777740394320705, + "grad_norm": 1.7525756365837095, + "learning_rate": 8.47489484696238e-06, + "loss": 0.427, + "step": 516 + }, + { + "epoch": 0.2783123612139156, + "grad_norm": 2.058946941055886, + "learning_rate": 8.468618326459086e-06, + "loss": 0.4847, + "step": 517 + }, + { + "epoch": 0.2788506829957607, + "grad_norm": 2.0477477556261467, + "learning_rate": 8.46233125167306e-06, + "loss": 0.4579, + "step": 518 + }, + { + "epoch": 0.2793890047776058, + "grad_norm": 1.783616738245662, + "learning_rate": 8.456033641734562e-06, + "loss": 0.4858, + "step": 519 + }, + { + "epoch": 0.2799273265594509, + "grad_norm": 2.0513841896237444, + "learning_rate": 8.449725515805907e-06, + "loss": 0.5352, + "step": 520 + }, + { + "epoch": 0.280465648341296, + "grad_norm": 1.6372025528727123, + "learning_rate": 8.443406893081406e-06, + "loss": 0.4618, + "step": 521 + }, + { + "epoch": 0.2810039701231411, + "grad_norm": 1.5571805104955587, + "learning_rate": 8.437077792787314e-06, + "loss": 0.4038, + "step": 522 + }, + { + "epoch": 0.2815422919049862, + "grad_norm": 1.75233105631481, + "learning_rate": 8.43073823418176e-06, + "loss": 0.4845, + "step": 523 + }, + { + "epoch": 0.2820806136868313, + "grad_norm": 1.6881033261753147, + "learning_rate": 8.424388236554704e-06, + "loss": 0.4865, + "step": 524 + }, + { + "epoch": 0.2826189354686764, + "grad_norm": 1.796069079351986, + "learning_rate": 8.418027819227861e-06, + "loss": 0.4538, + "step": 525 + }, + { + "epoch": 0.2831572572505215, + "grad_norm": 1.24349614978993, + "learning_rate": 8.41165700155466e-06, + "loss": 0.4166, + "step": 526 + }, + { + "epoch": 0.2836955790323666, + "grad_norm": 1.932274887854439, + "learning_rate": 8.405275802920168e-06, + "loss": 0.5061, + "step": 527 + }, + { + "epoch": 0.2842339008142117, + "grad_norm": 1.5593268393001998, + "learning_rate": 8.398884242741045e-06, + "loss": 0.4894, + "step": 528 + 
}, + { + "epoch": 0.2847722225960568, + "grad_norm": 1.7069043502360113, + "learning_rate": 8.392482340465475e-06, + "loss": 0.4485, + "step": 529 + }, + { + "epoch": 0.2853105443779019, + "grad_norm": 1.5063144141336193, + "learning_rate": 8.386070115573115e-06, + "loss": 0.4175, + "step": 530 + }, + { + "epoch": 0.285848866159747, + "grad_norm": 1.4364305869165457, + "learning_rate": 8.379647587575026e-06, + "loss": 0.4766, + "step": 531 + }, + { + "epoch": 0.28638718794159207, + "grad_norm": 1.3932649525614649, + "learning_rate": 8.373214776013625e-06, + "loss": 0.406, + "step": 532 + }, + { + "epoch": 0.28692550972343717, + "grad_norm": 1.5523357464392091, + "learning_rate": 8.366771700462615e-06, + "loss": 0.508, + "step": 533 + }, + { + "epoch": 0.28746383150528226, + "grad_norm": 2.1213305217928613, + "learning_rate": 8.360318380526932e-06, + "loss": 0.4985, + "step": 534 + }, + { + "epoch": 0.28800215328712736, + "grad_norm": 1.5873480547904262, + "learning_rate": 8.353854835842685e-06, + "loss": 0.4919, + "step": 535 + }, + { + "epoch": 0.28854047506897246, + "grad_norm": 1.5670280821673355, + "learning_rate": 8.347381086077095e-06, + "loss": 0.4708, + "step": 536 + }, + { + "epoch": 0.28907879685081755, + "grad_norm": 1.6763746949820768, + "learning_rate": 8.34089715092843e-06, + "loss": 0.4165, + "step": 537 + }, + { + "epoch": 0.28961711863266265, + "grad_norm": 1.5717106133141925, + "learning_rate": 8.334403050125956e-06, + "loss": 0.4554, + "step": 538 + }, + { + "epoch": 0.29015544041450775, + "grad_norm": 1.9743994746638458, + "learning_rate": 8.327898803429866e-06, + "loss": 0.4695, + "step": 539 + }, + { + "epoch": 0.29069376219635285, + "grad_norm": 1.5473676266482859, + "learning_rate": 8.32138443063123e-06, + "loss": 0.4712, + "step": 540 + }, + { + "epoch": 0.29069376219635285, + "eval_loss": 0.47182729840278625, + "eval_runtime": 1553.992, + "eval_samples_per_second": 16.094, + "eval_steps_per_second": 0.503, + "step": 540 + }, + { + "epoch": 
0.29123208397819794, + "grad_norm": 1.4425882953477511, + "learning_rate": 8.314859951551926e-06, + "loss": 0.4837, + "step": 541 + }, + { + "epoch": 0.29177040576004304, + "grad_norm": 1.3326493426074462, + "learning_rate": 8.308325386044583e-06, + "loss": 0.4814, + "step": 542 + }, + { + "epoch": 0.2923087275418882, + "grad_norm": 1.6128638362772016, + "learning_rate": 8.301780753992523e-06, + "loss": 0.4575, + "step": 543 + }, + { + "epoch": 0.2928470493237333, + "grad_norm": 1.4423693981211698, + "learning_rate": 8.295226075309697e-06, + "loss": 0.4633, + "step": 544 + }, + { + "epoch": 0.2933853711055784, + "grad_norm": 1.6198600771922913, + "learning_rate": 8.288661369940627e-06, + "loss": 0.4463, + "step": 545 + }, + { + "epoch": 0.2939236928874235, + "grad_norm": 1.5249628074643904, + "learning_rate": 8.282086657860342e-06, + "loss": 0.4668, + "step": 546 + }, + { + "epoch": 0.2944620146692686, + "grad_norm": 1.8125904384120293, + "learning_rate": 8.275501959074325e-06, + "loss": 0.4825, + "step": 547 + }, + { + "epoch": 0.2950003364511137, + "grad_norm": 1.9606743516276068, + "learning_rate": 8.268907293618437e-06, + "loss": 0.4684, + "step": 548 + }, + { + "epoch": 0.2955386582329588, + "grad_norm": 1.494990763192773, + "learning_rate": 8.262302681558872e-06, + "loss": 0.4664, + "step": 549 + }, + { + "epoch": 0.29607698001480387, + "grad_norm": 1.8337579001893594, + "learning_rate": 8.255688142992089e-06, + "loss": 0.4699, + "step": 550 + }, + { + "epoch": 0.29661530179664897, + "grad_norm": 1.779841389754219, + "learning_rate": 8.24906369804475e-06, + "loss": 0.4857, + "step": 551 + }, + { + "epoch": 0.29715362357849406, + "grad_norm": 1.6593925240524081, + "learning_rate": 8.242429366873663e-06, + "loss": 0.5038, + "step": 552 + }, + { + "epoch": 0.29769194536033916, + "grad_norm": 1.9956877344800352, + "learning_rate": 8.235785169665711e-06, + "loss": 0.4911, + "step": 553 + }, + { + "epoch": 0.29823026714218426, + "grad_norm": 1.579568204329291, + 
"learning_rate": 8.229131126637804e-06, + "loss": 0.4552, + "step": 554 + }, + { + "epoch": 0.29876858892402935, + "grad_norm": 1.5989428055850947, + "learning_rate": 8.222467258036808e-06, + "loss": 0.5177, + "step": 555 + }, + { + "epoch": 0.29930691070587445, + "grad_norm": 2.349536199541145, + "learning_rate": 8.215793584139485e-06, + "loss": 0.4911, + "step": 556 + }, + { + "epoch": 0.29984523248771955, + "grad_norm": 1.9403593317863332, + "learning_rate": 8.209110125252435e-06, + "loss": 0.5061, + "step": 557 + }, + { + "epoch": 0.30038355426956465, + "grad_norm": 1.7346564666609186, + "learning_rate": 8.202416901712033e-06, + "loss": 0.4357, + "step": 558 + }, + { + "epoch": 0.30092187605140974, + "grad_norm": 1.710471255918245, + "learning_rate": 8.195713933884359e-06, + "loss": 0.5015, + "step": 559 + }, + { + "epoch": 0.30146019783325484, + "grad_norm": 2.207816727293276, + "learning_rate": 8.189001242165151e-06, + "loss": 0.527, + "step": 560 + }, + { + "epoch": 0.30199851961509994, + "grad_norm": 1.428363458277829, + "learning_rate": 8.182278846979728e-06, + "loss": 0.4983, + "step": 561 + }, + { + "epoch": 0.30253684139694503, + "grad_norm": 1.77069966551508, + "learning_rate": 8.175546768782938e-06, + "loss": 0.4996, + "step": 562 + }, + { + "epoch": 0.30307516317879013, + "grad_norm": 1.631420375855133, + "learning_rate": 8.168805028059095e-06, + "loss": 0.4899, + "step": 563 + }, + { + "epoch": 0.3036134849606352, + "grad_norm": 1.6234744365340297, + "learning_rate": 8.162053645321908e-06, + "loss": 0.4275, + "step": 564 + }, + { + "epoch": 0.3041518067424803, + "grad_norm": 1.7151129037835051, + "learning_rate": 8.15529264111443e-06, + "loss": 0.4628, + "step": 565 + }, + { + "epoch": 0.3046901285243254, + "grad_norm": 1.6757537025608307, + "learning_rate": 8.148522036008985e-06, + "loss": 0.4636, + "step": 566 + }, + { + "epoch": 0.3052284503061705, + "grad_norm": 1.157809434742461, + "learning_rate": 8.141741850607117e-06, + "loss": 0.3868, + 
"step": 567 + }, + { + "epoch": 0.3057667720880156, + "grad_norm": 1.4360027236144732, + "learning_rate": 8.134952105539515e-06, + "loss": 0.4725, + "step": 568 + }, + { + "epoch": 0.3063050938698607, + "grad_norm": 1.6762158717929798, + "learning_rate": 8.128152821465957e-06, + "loss": 0.4818, + "step": 569 + }, + { + "epoch": 0.3068434156517058, + "grad_norm": 1.6736535469921034, + "learning_rate": 8.121344019075253e-06, + "loss": 0.4805, + "step": 570 + }, + { + "epoch": 0.3073817374335509, + "grad_norm": 1.5918931966460608, + "learning_rate": 8.114525719085163e-06, + "loss": 0.5152, + "step": 571 + }, + { + "epoch": 0.307920059215396, + "grad_norm": 1.4169517878992852, + "learning_rate": 8.107697942242356e-06, + "loss": 0.4731, + "step": 572 + }, + { + "epoch": 0.3084583809972411, + "grad_norm": 1.5959353428431666, + "learning_rate": 8.100860709322334e-06, + "loss": 0.4463, + "step": 573 + }, + { + "epoch": 0.3089967027790862, + "grad_norm": 1.4569323564340282, + "learning_rate": 8.094014041129373e-06, + "loss": 0.4046, + "step": 574 + }, + { + "epoch": 0.3095350245609313, + "grad_norm": 1.5558748525412556, + "learning_rate": 8.087157958496456e-06, + "loss": 0.4644, + "step": 575 + }, + { + "epoch": 0.3100733463427764, + "grad_norm": 1.6641076139574378, + "learning_rate": 8.080292482285213e-06, + "loss": 0.5064, + "step": 576 + }, + { + "epoch": 0.3106116681246215, + "grad_norm": 1.5793644667521578, + "learning_rate": 8.07341763338586e-06, + "loss": 0.515, + "step": 577 + }, + { + "epoch": 0.3111499899064666, + "grad_norm": 1.895774618714942, + "learning_rate": 8.066533432717127e-06, + "loss": 0.4763, + "step": 578 + }, + { + "epoch": 0.3116883116883117, + "grad_norm": 1.6689610869771314, + "learning_rate": 8.059639901226203e-06, + "loss": 0.4487, + "step": 579 + }, + { + "epoch": 0.3122266334701568, + "grad_norm": 1.4289516860868958, + "learning_rate": 8.05273705988867e-06, + "loss": 0.426, + "step": 580 + }, + { + "epoch": 0.3127649552520019, + "grad_norm": 
1.448460429863824, + "learning_rate": 8.04582492970843e-06, + "loss": 0.4622, + "step": 581 + }, + { + "epoch": 0.31330327703384697, + "grad_norm": 1.562340995796949, + "learning_rate": 8.038903531717662e-06, + "loss": 0.4644, + "step": 582 + }, + { + "epoch": 0.31384159881569207, + "grad_norm": 1.4837986133941243, + "learning_rate": 8.031972886976731e-06, + "loss": 0.4845, + "step": 583 + }, + { + "epoch": 0.31437992059753717, + "grad_norm": 1.696043847539263, + "learning_rate": 8.025033016574148e-06, + "loss": 0.4631, + "step": 584 + }, + { + "epoch": 0.31491824237938226, + "grad_norm": 1.8636443570370922, + "learning_rate": 8.018083941626494e-06, + "loss": 0.4582, + "step": 585 + }, + { + "epoch": 0.31545656416122736, + "grad_norm": 1.6588060343624296, + "learning_rate": 8.011125683278351e-06, + "loss": 0.4118, + "step": 586 + }, + { + "epoch": 0.31599488594307246, + "grad_norm": 2.064927405044272, + "learning_rate": 8.004158262702253e-06, + "loss": 0.5307, + "step": 587 + }, + { + "epoch": 0.31653320772491755, + "grad_norm": 1.7599540523459494, + "learning_rate": 7.997181701098608e-06, + "loss": 0.4542, + "step": 588 + }, + { + "epoch": 0.31707152950676265, + "grad_norm": 1.679120614548226, + "learning_rate": 7.99019601969564e-06, + "loss": 0.4462, + "step": 589 + }, + { + "epoch": 0.31760985128860775, + "grad_norm": 1.6748781594901945, + "learning_rate": 7.983201239749321e-06, + "loss": 0.4435, + "step": 590 + }, + { + "epoch": 0.31814817307045284, + "grad_norm": 1.6895768411385892, + "learning_rate": 7.976197382543306e-06, + "loss": 0.5043, + "step": 591 + }, + { + "epoch": 0.31868649485229794, + "grad_norm": 1.4551705590923076, + "learning_rate": 7.969184469388877e-06, + "loss": 0.4992, + "step": 592 + }, + { + "epoch": 0.31922481663414304, + "grad_norm": 1.8224446520059305, + "learning_rate": 7.962162521624865e-06, + "loss": 0.5242, + "step": 593 + }, + { + "epoch": 0.31976313841598814, + "grad_norm": 1.5471915857747345, + "learning_rate": 
7.955131560617595e-06, + "loss": 0.4672, + "step": 594 + }, + { + "epoch": 0.32030146019783323, + "grad_norm": 1.943277469873626, + "learning_rate": 7.948091607760815e-06, + "loss": 0.4817, + "step": 595 + }, + { + "epoch": 0.32083978197967833, + "grad_norm": 1.361762394527565, + "learning_rate": 7.941042684475635e-06, + "loss": 0.4341, + "step": 596 + }, + { + "epoch": 0.3213781037615234, + "grad_norm": 1.578768861245864, + "learning_rate": 7.933984812210459e-06, + "loss": 0.452, + "step": 597 + }, + { + "epoch": 0.3219164255433685, + "grad_norm": 1.3732353872225034, + "learning_rate": 7.926918012440923e-06, + "loss": 0.4349, + "step": 598 + }, + { + "epoch": 0.3224547473252136, + "grad_norm": 1.8064334973816905, + "learning_rate": 7.919842306669825e-06, + "loss": 0.4499, + "step": 599 + }, + { + "epoch": 0.3229930691070587, + "grad_norm": 1.582853458222087, + "learning_rate": 7.912757716427062e-06, + "loss": 0.4865, + "step": 600 + }, + { + "epoch": 0.3229930691070587, + "eval_loss": 0.4672350585460663, + "eval_runtime": 1563.3319, + "eval_samples_per_second": 15.998, + "eval_steps_per_second": 0.5, + "step": 600 + }, + { + "epoch": 0.32353139088890387, + "grad_norm": 1.6009402167895466, + "learning_rate": 7.905664263269567e-06, + "loss": 0.4576, + "step": 601 + }, + { + "epoch": 0.32406971267074897, + "grad_norm": 1.6832973254975117, + "learning_rate": 7.898561968781242e-06, + "loss": 0.457, + "step": 602 + }, + { + "epoch": 0.32460803445259406, + "grad_norm": 4.046599916473538, + "learning_rate": 7.891450854572884e-06, + "loss": 0.49, + "step": 603 + }, + { + "epoch": 0.32514635623443916, + "grad_norm": 1.5254137578843718, + "learning_rate": 7.884330942282136e-06, + "loss": 0.4533, + "step": 604 + }, + { + "epoch": 0.32568467801628426, + "grad_norm": 1.5392402810831298, + "learning_rate": 7.877202253573404e-06, + "loss": 0.4566, + "step": 605 + }, + { + "epoch": 0.32622299979812935, + "grad_norm": 1.5838863815714255, + "learning_rate": 7.870064810137806e-06, + 
"loss": 0.4224, + "step": 606 + }, + { + "epoch": 0.32676132157997445, + "grad_norm": 1.5112598539099842, + "learning_rate": 7.862918633693091e-06, + "loss": 0.4537, + "step": 607 + }, + { + "epoch": 0.32729964336181955, + "grad_norm": 1.7380984306062113, + "learning_rate": 7.855763745983588e-06, + "loss": 0.5168, + "step": 608 + }, + { + "epoch": 0.32783796514366464, + "grad_norm": 1.3686616623355445, + "learning_rate": 7.848600168780127e-06, + "loss": 0.4774, + "step": 609 + }, + { + "epoch": 0.32837628692550974, + "grad_norm": 1.8037345014596735, + "learning_rate": 7.841427923879982e-06, + "loss": 0.4841, + "step": 610 + }, + { + "epoch": 0.32891460870735484, + "grad_norm": 1.5578093278723995, + "learning_rate": 7.834247033106798e-06, + "loss": 0.4494, + "step": 611 + }, + { + "epoch": 0.32945293048919994, + "grad_norm": 1.7470526074648303, + "learning_rate": 7.827057518310532e-06, + "loss": 0.4316, + "step": 612 + }, + { + "epoch": 0.32999125227104503, + "grad_norm": 1.344635684714144, + "learning_rate": 7.819859401367376e-06, + "loss": 0.4277, + "step": 613 + }, + { + "epoch": 0.33052957405289013, + "grad_norm": 1.6142148463610868, + "learning_rate": 7.8126527041797e-06, + "loss": 0.4732, + "step": 614 + }, + { + "epoch": 0.3310678958347352, + "grad_norm": 1.4894686294102883, + "learning_rate": 7.805437448675986e-06, + "loss": 0.4804, + "step": 615 + }, + { + "epoch": 0.3316062176165803, + "grad_norm": 1.959553525810308, + "learning_rate": 7.798213656810747e-06, + "loss": 0.5052, + "step": 616 + }, + { + "epoch": 0.3321445393984254, + "grad_norm": 1.5799236754205312, + "learning_rate": 7.790981350564482e-06, + "loss": 0.432, + "step": 617 + }, + { + "epoch": 0.3326828611802705, + "grad_norm": 1.82490515289263, + "learning_rate": 7.783740551943586e-06, + "loss": 0.4394, + "step": 618 + }, + { + "epoch": 0.3332211829621156, + "grad_norm": 1.5031228288941465, + "learning_rate": 7.776491282980305e-06, + "loss": 0.5064, + "step": 619 + }, + { + "epoch": 
0.3337595047439607, + "grad_norm": 1.4329349118783261, + "learning_rate": 7.76923356573265e-06, + "loss": 0.489, + "step": 620 + }, + { + "epoch": 0.3342978265258058, + "grad_norm": 1.4961946186338742, + "learning_rate": 7.761967422284347e-06, + "loss": 0.4704, + "step": 621 + }, + { + "epoch": 0.3348361483076509, + "grad_norm": 1.7319823672043928, + "learning_rate": 7.754692874744752e-06, + "loss": 0.4621, + "step": 622 + }, + { + "epoch": 0.335374470089496, + "grad_norm": 2.0507693298974035, + "learning_rate": 7.747409945248797e-06, + "loss": 0.502, + "step": 623 + }, + { + "epoch": 0.3359127918713411, + "grad_norm": 1.4817353671174234, + "learning_rate": 7.74011865595692e-06, + "loss": 0.4975, + "step": 624 + }, + { + "epoch": 0.3364511136531862, + "grad_norm": 1.5154706925154366, + "learning_rate": 7.732819029054999e-06, + "loss": 0.4819, + "step": 625 + }, + { + "epoch": 0.3369894354350313, + "grad_norm": 2.9866409096863507, + "learning_rate": 7.725511086754269e-06, + "loss": 0.4947, + "step": 626 + }, + { + "epoch": 0.3375277572168764, + "grad_norm": 1.7699700957236326, + "learning_rate": 7.718194851291284e-06, + "loss": 0.4703, + "step": 627 + }, + { + "epoch": 0.3380660789987215, + "grad_norm": 2.371528841529566, + "learning_rate": 7.710870344927817e-06, + "loss": 0.5458, + "step": 628 + }, + { + "epoch": 0.3386044007805666, + "grad_norm": 1.5200234564971724, + "learning_rate": 7.703537589950819e-06, + "loss": 0.4562, + "step": 629 + }, + { + "epoch": 0.3391427225624117, + "grad_norm": 1.371146036616362, + "learning_rate": 7.696196608672333e-06, + "loss": 0.4196, + "step": 630 + }, + { + "epoch": 0.3396810443442568, + "grad_norm": 1.5627852767313657, + "learning_rate": 7.688847423429434e-06, + "loss": 0.505, + "step": 631 + }, + { + "epoch": 0.3402193661261019, + "grad_norm": 1.3089486655111793, + "learning_rate": 7.68149005658417e-06, + "loss": 0.4532, + "step": 632 + }, + { + "epoch": 0.34075768790794697, + "grad_norm": 1.72862210074593, + 
"learning_rate": 7.674124530523461e-06, + "loss": 0.5431, + "step": 633 + }, + { + "epoch": 0.34129600968979207, + "grad_norm": 1.397330557638678, + "learning_rate": 7.666750867659078e-06, + "loss": 0.46, + "step": 634 + }, + { + "epoch": 0.34183433147163717, + "grad_norm": 1.5822930242940645, + "learning_rate": 7.659369090427537e-06, + "loss": 0.5183, + "step": 635 + }, + { + "epoch": 0.34237265325348226, + "grad_norm": 1.517257101602274, + "learning_rate": 7.651979221290049e-06, + "loss": 0.4847, + "step": 636 + }, + { + "epoch": 0.34291097503532736, + "grad_norm": 1.569552765274582, + "learning_rate": 7.644581282732445e-06, + "loss": 0.5237, + "step": 637 + }, + { + "epoch": 0.34344929681717246, + "grad_norm": 1.5173887839906304, + "learning_rate": 7.637175297265109e-06, + "loss": 0.444, + "step": 638 + }, + { + "epoch": 0.34398761859901755, + "grad_norm": 2.6037607041595883, + "learning_rate": 7.629761287422915e-06, + "loss": 0.4271, + "step": 639 + }, + { + "epoch": 0.34452594038086265, + "grad_norm": 1.6900192017878133, + "learning_rate": 7.622339275765147e-06, + "loss": 0.4631, + "step": 640 + }, + { + "epoch": 0.34506426216270775, + "grad_norm": 1.6204089265699804, + "learning_rate": 7.61490928487544e-06, + "loss": 0.4798, + "step": 641 + }, + { + "epoch": 0.34560258394455284, + "grad_norm": 2.072148397739707, + "learning_rate": 7.6074713373617094e-06, + "loss": 0.5169, + "step": 642 + }, + { + "epoch": 0.34614090572639794, + "grad_norm": 1.4489303833679512, + "learning_rate": 7.600025455856078e-06, + "loss": 0.4477, + "step": 643 + }, + { + "epoch": 0.34667922750824304, + "grad_norm": 1.808968142318587, + "learning_rate": 7.592571663014811e-06, + "loss": 0.4591, + "step": 644 + }, + { + "epoch": 0.34721754929008813, + "grad_norm": 1.4861828747421941, + "learning_rate": 7.5851099815182505e-06, + "loss": 0.4792, + "step": 645 + }, + { + "epoch": 0.34775587107193323, + "grad_norm": 1.6729126421729203, + "learning_rate": 7.577640434070734e-06, + "loss": 
0.4832, + "step": 646 + }, + { + "epoch": 0.34829419285377833, + "grad_norm": 1.871195222211602, + "learning_rate": 7.5701630434005405e-06, + "loss": 0.4417, + "step": 647 + }, + { + "epoch": 0.3488325146356234, + "grad_norm": 1.51735945461571, + "learning_rate": 7.56267783225981e-06, + "loss": 0.4741, + "step": 648 + }, + { + "epoch": 0.3493708364174685, + "grad_norm": 2.071142969866682, + "learning_rate": 7.555184823424479e-06, + "loss": 0.4127, + "step": 649 + }, + { + "epoch": 0.3499091581993136, + "grad_norm": 1.910282433363155, + "learning_rate": 7.547684039694216e-06, + "loss": 0.4531, + "step": 650 + }, + { + "epoch": 0.3504474799811587, + "grad_norm": 1.9652818314978835, + "learning_rate": 7.54017550389234e-06, + "loss": 0.5085, + "step": 651 + }, + { + "epoch": 0.3509858017630038, + "grad_norm": 1.6117024086203307, + "learning_rate": 7.5326592388657605e-06, + "loss": 0.5148, + "step": 652 + }, + { + "epoch": 0.3515241235448489, + "grad_norm": 1.4960314880258612, + "learning_rate": 7.525135267484906e-06, + "loss": 0.4629, + "step": 653 + }, + { + "epoch": 0.352062445326694, + "grad_norm": 1.604228922752054, + "learning_rate": 7.517603612643653e-06, + "loss": 0.5117, + "step": 654 + }, + { + "epoch": 0.3526007671085391, + "grad_norm": 2.136019956641433, + "learning_rate": 7.5100642972592606e-06, + "loss": 0.4629, + "step": 655 + }, + { + "epoch": 0.3531390888903842, + "grad_norm": 1.4857693238664922, + "learning_rate": 7.50251734427229e-06, + "loss": 0.4671, + "step": 656 + }, + { + "epoch": 0.3536774106722293, + "grad_norm": 1.4380772688023766, + "learning_rate": 7.494962776646549e-06, + "loss": 0.428, + "step": 657 + }, + { + "epoch": 0.35421573245407445, + "grad_norm": 1.7510803552126726, + "learning_rate": 7.487400617369013e-06, + "loss": 0.4417, + "step": 658 + }, + { + "epoch": 0.35475405423591955, + "grad_norm": 1.8718328199464012, + "learning_rate": 7.479830889449754e-06, + "loss": 0.4489, + "step": 659 + }, + { + "epoch": 0.35529237601776464, + 
"grad_norm": 1.3987482870509058, + "learning_rate": 7.472253615921878e-06, + "loss": 0.5121, + "step": 660 + }, + { + "epoch": 0.35529237601776464, + "eval_loss": 0.4641415774822235, + "eval_runtime": 1581.4987, + "eval_samples_per_second": 15.814, + "eval_steps_per_second": 0.494, + "step": 660 + }, + { + "epoch": 0.35583069779960974, + "grad_norm": 1.5856953831241587, + "learning_rate": 7.464668819841453e-06, + "loss": 0.4429, + "step": 661 + }, + { + "epoch": 0.35636901958145484, + "grad_norm": 1.648655956667231, + "learning_rate": 7.457076524287426e-06, + "loss": 0.4794, + "step": 662 + }, + { + "epoch": 0.35690734136329993, + "grad_norm": 1.8056054836187343, + "learning_rate": 7.4494767523615754e-06, + "loss": 0.4488, + "step": 663 + }, + { + "epoch": 0.35744566314514503, + "grad_norm": 1.7062432057396102, + "learning_rate": 7.441869527188421e-06, + "loss": 0.4506, + "step": 664 + }, + { + "epoch": 0.35798398492699013, + "grad_norm": 1.4819375518870144, + "learning_rate": 7.434254871915166e-06, + "loss": 0.4135, + "step": 665 + }, + { + "epoch": 0.3585223067088352, + "grad_norm": 1.734074823822691, + "learning_rate": 7.426632809711617e-06, + "loss": 0.4744, + "step": 666 + }, + { + "epoch": 0.3590606284906803, + "grad_norm": 1.5235626105944915, + "learning_rate": 7.4190033637701216e-06, + "loss": 0.4646, + "step": 667 + }, + { + "epoch": 0.3595989502725254, + "grad_norm": 1.9128329967338416, + "learning_rate": 7.411366557305495e-06, + "loss": 0.4626, + "step": 668 + }, + { + "epoch": 0.3601372720543705, + "grad_norm": 2.5022708068016097, + "learning_rate": 7.403722413554947e-06, + "loss": 0.4959, + "step": 669 + }, + { + "epoch": 0.3606755938362156, + "grad_norm": 1.8966801972869858, + "learning_rate": 7.396070955778013e-06, + "loss": 0.45, + "step": 670 + }, + { + "epoch": 0.3612139156180607, + "grad_norm": 2.061313497940433, + "learning_rate": 7.388412207256486e-06, + "loss": 0.4961, + "step": 671 + }, + { + "epoch": 0.3617522373999058, + "grad_norm": 
1.6720715956995327, + "learning_rate": 7.380746191294341e-06, + "loss": 0.4667, + "step": 672 + }, + { + "epoch": 0.3622905591817509, + "grad_norm": 1.5487990630837682, + "learning_rate": 7.373072931217669e-06, + "loss": 0.527, + "step": 673 + }, + { + "epoch": 0.362828880963596, + "grad_norm": 1.4996736955806738, + "learning_rate": 7.365392450374598e-06, + "loss": 0.4353, + "step": 674 + }, + { + "epoch": 0.3633672027454411, + "grad_norm": 1.6372189463929279, + "learning_rate": 7.357704772135231e-06, + "loss": 0.469, + "step": 675 + }, + { + "epoch": 0.3639055245272862, + "grad_norm": 1.5447454253844684, + "learning_rate": 7.350009919891574e-06, + "loss": 0.4278, + "step": 676 + }, + { + "epoch": 0.3644438463091313, + "grad_norm": 1.4107385578994651, + "learning_rate": 7.342307917057457e-06, + "loss": 0.44, + "step": 677 + }, + { + "epoch": 0.3649821680909764, + "grad_norm": 1.4950963156286234, + "learning_rate": 7.334598787068469e-06, + "loss": 0.4529, + "step": 678 + }, + { + "epoch": 0.3655204898728215, + "grad_norm": 2.047196931688194, + "learning_rate": 7.326882553381886e-06, + "loss": 0.4993, + "step": 679 + }, + { + "epoch": 0.3660588116546666, + "grad_norm": 1.8078116478641435, + "learning_rate": 7.319159239476601e-06, + "loss": 0.4903, + "step": 680 + }, + { + "epoch": 0.3665971334365117, + "grad_norm": 1.6585777335125267, + "learning_rate": 7.311428868853047e-06, + "loss": 0.449, + "step": 681 + }, + { + "epoch": 0.3671354552183568, + "grad_norm": 1.644551492901717, + "learning_rate": 7.30369146503313e-06, + "loss": 0.4359, + "step": 682 + }, + { + "epoch": 0.3676737770002019, + "grad_norm": 1.566051715226832, + "learning_rate": 7.29594705156016e-06, + "loss": 0.5171, + "step": 683 + }, + { + "epoch": 0.36821209878204697, + "grad_norm": 1.860361723636211, + "learning_rate": 7.288195651998772e-06, + "loss": 0.5058, + "step": 684 + }, + { + "epoch": 0.36875042056389207, + "grad_norm": 1.479824820585221, + "learning_rate": 7.280437289934858e-06, + "loss": 
0.5082, + "step": 685 + }, + { + "epoch": 0.36928874234573716, + "grad_norm": 1.5621912841951935, + "learning_rate": 7.272671988975499e-06, + "loss": 0.4861, + "step": 686 + }, + { + "epoch": 0.36982706412758226, + "grad_norm": 1.6260728405178757, + "learning_rate": 7.264899772748889e-06, + "loss": 0.5003, + "step": 687 + }, + { + "epoch": 0.37036538590942736, + "grad_norm": 1.5646367035382582, + "learning_rate": 7.2571206649042584e-06, + "loss": 0.4559, + "step": 688 + }, + { + "epoch": 0.37090370769127246, + "grad_norm": 1.7472551729015091, + "learning_rate": 7.249334689111814e-06, + "loss": 0.4541, + "step": 689 + }, + { + "epoch": 0.37144202947311755, + "grad_norm": 1.6362939723396042, + "learning_rate": 7.241541869062656e-06, + "loss": 0.4733, + "step": 690 + }, + { + "epoch": 0.37198035125496265, + "grad_norm": 1.4710809281537391, + "learning_rate": 7.2337422284687135e-06, + "loss": 0.4523, + "step": 691 + }, + { + "epoch": 0.37251867303680775, + "grad_norm": 1.6849371563467512, + "learning_rate": 7.225935791062665e-06, + "loss": 0.4976, + "step": 692 + }, + { + "epoch": 0.37305699481865284, + "grad_norm": 1.7850003378424297, + "learning_rate": 7.2181225805978745e-06, + "loss": 0.4482, + "step": 693 + }, + { + "epoch": 0.37359531660049794, + "grad_norm": 2.355398835881447, + "learning_rate": 7.210302620848315e-06, + "loss": 0.4599, + "step": 694 + }, + { + "epoch": 0.37413363838234304, + "grad_norm": 1.617194741699657, + "learning_rate": 7.20247593560849e-06, + "loss": 0.4543, + "step": 695 + }, + { + "epoch": 0.37467196016418813, + "grad_norm": 1.4733355105927, + "learning_rate": 7.1946425486933755e-06, + "loss": 0.4125, + "step": 696 + }, + { + "epoch": 0.37521028194603323, + "grad_norm": 1.4512303803275823, + "learning_rate": 7.186802483938333e-06, + "loss": 0.4515, + "step": 697 + }, + { + "epoch": 0.3757486037278783, + "grad_norm": 1.4829224037632613, + "learning_rate": 7.178955765199048e-06, + "loss": 0.475, + "step": 698 + }, + { + "epoch": 
0.3762869255097234, + "grad_norm": 1.4882203445110318, + "learning_rate": 7.171102416351448e-06, + "loss": 0.4485, + "step": 699 + }, + { + "epoch": 0.3768252472915685, + "grad_norm": 1.6613200067557963, + "learning_rate": 7.163242461291639e-06, + "loss": 0.4402, + "step": 700 + }, + { + "epoch": 0.3773635690734136, + "grad_norm": 1.7483634690103926, + "learning_rate": 7.155375923935826e-06, + "loss": 0.4936, + "step": 701 + }, + { + "epoch": 0.3779018908552587, + "grad_norm": 1.6616671629226913, + "learning_rate": 7.14750282822024e-06, + "loss": 0.4644, + "step": 702 + }, + { + "epoch": 0.3784402126371038, + "grad_norm": 1.5260208283942596, + "learning_rate": 7.139623198101073e-06, + "loss": 0.489, + "step": 703 + }, + { + "epoch": 0.3789785344189489, + "grad_norm": 1.361965813750003, + "learning_rate": 7.131737057554399e-06, + "loss": 0.3901, + "step": 704 + }, + { + "epoch": 0.379516856200794, + "grad_norm": 1.620874046214403, + "learning_rate": 7.1238444305760975e-06, + "loss": 0.458, + "step": 705 + }, + { + "epoch": 0.3800551779826391, + "grad_norm": 1.7744718469804224, + "learning_rate": 7.115945341181789e-06, + "loss": 0.4585, + "step": 706 + }, + { + "epoch": 0.3805934997644842, + "grad_norm": 1.4959797567409379, + "learning_rate": 7.108039813406755e-06, + "loss": 0.4497, + "step": 707 + }, + { + "epoch": 0.3811318215463293, + "grad_norm": 1.645088668489625, + "learning_rate": 7.10012787130587e-06, + "loss": 0.4419, + "step": 708 + }, + { + "epoch": 0.3816701433281744, + "grad_norm": 1.5908205648141605, + "learning_rate": 7.092209538953527e-06, + "loss": 0.4768, + "step": 709 + }, + { + "epoch": 0.3822084651100195, + "grad_norm": 1.2865059891101038, + "learning_rate": 7.0842848404435574e-06, + "loss": 0.4432, + "step": 710 + }, + { + "epoch": 0.3827467868918646, + "grad_norm": 1.438686585698748, + "learning_rate": 7.07635379988917e-06, + "loss": 0.463, + "step": 711 + }, + { + "epoch": 0.3832851086737097, + "grad_norm": 1.5810030390346108, + 
"learning_rate": 7.068416441422867e-06, + "loss": 0.4324, + "step": 712 + }, + { + "epoch": 0.3838234304555548, + "grad_norm": 1.8920886247581228, + "learning_rate": 7.060472789196378e-06, + "loss": 0.4513, + "step": 713 + }, + { + "epoch": 0.3843617522373999, + "grad_norm": 1.4721512319324748, + "learning_rate": 7.052522867380578e-06, + "loss": 0.4794, + "step": 714 + }, + { + "epoch": 0.38490007401924503, + "grad_norm": 1.8748283518664401, + "learning_rate": 7.044566700165426e-06, + "loss": 0.5359, + "step": 715 + }, + { + "epoch": 0.38543839580109013, + "grad_norm": 2.1664339926414247, + "learning_rate": 7.036604311759879e-06, + "loss": 0.4696, + "step": 716 + }, + { + "epoch": 0.3859767175829352, + "grad_norm": 1.599064767192068, + "learning_rate": 7.028635726391826e-06, + "loss": 0.5009, + "step": 717 + }, + { + "epoch": 0.3865150393647803, + "grad_norm": 1.658951664965314, + "learning_rate": 7.020660968308011e-06, + "loss": 0.526, + "step": 718 + }, + { + "epoch": 0.3870533611466254, + "grad_norm": 1.5566803387570707, + "learning_rate": 7.012680061773962e-06, + "loss": 0.4944, + "step": 719 + }, + { + "epoch": 0.3875916829284705, + "grad_norm": 1.5561052872784167, + "learning_rate": 7.0046930310739145e-06, + "loss": 0.4023, + "step": 720 + }, + { + "epoch": 0.3875916829284705, + "eval_loss": 0.4598337709903717, + "eval_runtime": 1512.3789, + "eval_samples_per_second": 16.537, + "eval_steps_per_second": 0.517, + "step": 720 + }, + { + "epoch": 0.3881300047103156, + "grad_norm": 1.5343444055056177, + "learning_rate": 6.996699900510736e-06, + "loss": 0.4661, + "step": 721 + }, + { + "epoch": 0.3886683264921607, + "grad_norm": 1.5835711750557553, + "learning_rate": 6.988700694405861e-06, + "loss": 0.5243, + "step": 722 + }, + { + "epoch": 0.3892066482740058, + "grad_norm": 1.739458700941234, + "learning_rate": 6.980695437099203e-06, + "loss": 0.468, + "step": 723 + }, + { + "epoch": 0.3897449700558509, + "grad_norm": 1.4597418259308022, + "learning_rate": 
6.972684152949095e-06, + "loss": 0.4312, + "step": 724 + }, + { + "epoch": 0.390283291837696, + "grad_norm": 1.4822140659700849, + "learning_rate": 6.964666866332202e-06, + "loss": 0.4171, + "step": 725 + }, + { + "epoch": 0.3908216136195411, + "grad_norm": 2.219448742321713, + "learning_rate": 6.956643601643459e-06, + "loss": 0.4682, + "step": 726 + }, + { + "epoch": 0.3913599354013862, + "grad_norm": 1.6249675680199915, + "learning_rate": 6.948614383295988e-06, + "loss": 0.467, + "step": 727 + }, + { + "epoch": 0.3918982571832313, + "grad_norm": 2.5331886913847916, + "learning_rate": 6.940579235721027e-06, + "loss": 0.5046, + "step": 728 + }, + { + "epoch": 0.3924365789650764, + "grad_norm": 1.651989792055275, + "learning_rate": 6.932538183367854e-06, + "loss": 0.4432, + "step": 729 + }, + { + "epoch": 0.3929749007469215, + "grad_norm": 1.4451051204854284, + "learning_rate": 6.924491250703716e-06, + "loss": 0.436, + "step": 730 + }, + { + "epoch": 0.3935132225287666, + "grad_norm": 1.6726948542569147, + "learning_rate": 6.916438462213756e-06, + "loss": 0.4701, + "step": 731 + }, + { + "epoch": 0.3940515443106117, + "grad_norm": 1.3458270610890806, + "learning_rate": 6.908379842400926e-06, + "loss": 0.461, + "step": 732 + }, + { + "epoch": 0.3945898660924568, + "grad_norm": 1.8671906958135296, + "learning_rate": 6.90031541578593e-06, + "loss": 0.4621, + "step": 733 + }, + { + "epoch": 0.3951281878743019, + "grad_norm": 1.6937643401491398, + "learning_rate": 6.892245206907136e-06, + "loss": 0.4403, + "step": 734 + }, + { + "epoch": 0.39566650965614697, + "grad_norm": 1.6011629978962008, + "learning_rate": 6.88416924032051e-06, + "loss": 0.4832, + "step": 735 + }, + { + "epoch": 0.39620483143799207, + "grad_norm": 1.7023847640279732, + "learning_rate": 6.876087540599532e-06, + "loss": 0.4871, + "step": 736 + }, + { + "epoch": 0.39674315321983716, + "grad_norm": 1.5639503808317925, + "learning_rate": 6.868000132335132e-06, + "loss": 0.504, + "step": 737 + }, + { + 
"epoch": 0.39728147500168226, + "grad_norm": 1.6209519657967315, + "learning_rate": 6.859907040135609e-06, + "loss": 0.4947, + "step": 738 + }, + { + "epoch": 0.39781979678352736, + "grad_norm": 1.4902231086791655, + "learning_rate": 6.851808288626554e-06, + "loss": 0.4329, + "step": 739 + }, + { + "epoch": 0.39835811856537245, + "grad_norm": 1.4751989923406863, + "learning_rate": 6.843703902450781e-06, + "loss": 0.469, + "step": 740 + }, + { + "epoch": 0.39889644034721755, + "grad_norm": 1.7318655949983495, + "learning_rate": 6.8355939062682485e-06, + "loss": 0.4646, + "step": 741 + }, + { + "epoch": 0.39943476212906265, + "grad_norm": 2.0477062374958312, + "learning_rate": 6.827478324755986e-06, + "loss": 0.4527, + "step": 742 + }, + { + "epoch": 0.39997308391090775, + "grad_norm": 1.5357049173396753, + "learning_rate": 6.819357182608014e-06, + "loss": 0.4119, + "step": 743 + }, + { + "epoch": 0.40051140569275284, + "grad_norm": 1.6669074072618764, + "learning_rate": 6.811230504535276e-06, + "loss": 0.4123, + "step": 744 + }, + { + "epoch": 0.40104972747459794, + "grad_norm": 2.0238793916536095, + "learning_rate": 6.803098315265563e-06, + "loss": 0.4607, + "step": 745 + }, + { + "epoch": 0.40158804925644304, + "grad_norm": 1.7302550872159141, + "learning_rate": 6.7949606395434294e-06, + "loss": 0.5252, + "step": 746 + }, + { + "epoch": 0.40212637103828813, + "grad_norm": 1.5575167275155066, + "learning_rate": 6.786817502130127e-06, + "loss": 0.4484, + "step": 747 + }, + { + "epoch": 0.40266469282013323, + "grad_norm": 1.3960320100955355, + "learning_rate": 6.778668927803526e-06, + "loss": 0.444, + "step": 748 + }, + { + "epoch": 0.4032030146019783, + "grad_norm": 1.5537207671933355, + "learning_rate": 6.770514941358041e-06, + "loss": 0.4522, + "step": 749 + }, + { + "epoch": 0.4037413363838234, + "grad_norm": 1.6191186519608955, + "learning_rate": 6.762355567604553e-06, + "loss": 0.489, + "step": 750 + }, + { + "epoch": 0.4042796581656685, + "grad_norm": 
1.7320364851332162, + "learning_rate": 6.7541908313703355e-06, + "loss": 0.4746, + "step": 751 + }, + { + "epoch": 0.4048179799475136, + "grad_norm": 1.5268044530623444, + "learning_rate": 6.746020757498979e-06, + "loss": 0.4138, + "step": 752 + }, + { + "epoch": 0.4053563017293587, + "grad_norm": 1.522928297135606, + "learning_rate": 6.737845370850317e-06, + "loss": 0.4938, + "step": 753 + }, + { + "epoch": 0.4058946235112038, + "grad_norm": 1.567608770456755, + "learning_rate": 6.729664696300347e-06, + "loss": 0.4745, + "step": 754 + }, + { + "epoch": 0.4064329452930489, + "grad_norm": 1.5048680773669196, + "learning_rate": 6.721478758741155e-06, + "loss": 0.4714, + "step": 755 + }, + { + "epoch": 0.406971267074894, + "grad_norm": 1.7508536934704277, + "learning_rate": 6.713287583080845e-06, + "loss": 0.4778, + "step": 756 + }, + { + "epoch": 0.4075095888567391, + "grad_norm": 1.6217945250756625, + "learning_rate": 6.70509119424346e-06, + "loss": 0.4529, + "step": 757 + }, + { + "epoch": 0.4080479106385842, + "grad_norm": 1.6092594479977214, + "learning_rate": 6.696889617168897e-06, + "loss": 0.4674, + "step": 758 + }, + { + "epoch": 0.4085862324204293, + "grad_norm": 1.5153766468742507, + "learning_rate": 6.688682876812851e-06, + "loss": 0.4612, + "step": 759 + }, + { + "epoch": 0.4091245542022744, + "grad_norm": 1.6200362705011053, + "learning_rate": 6.6804709981467195e-06, + "loss": 0.4812, + "step": 760 + }, + { + "epoch": 0.4096628759841195, + "grad_norm": 1.6047382022765324, + "learning_rate": 6.672254006157541e-06, + "loss": 0.4758, + "step": 761 + }, + { + "epoch": 0.4102011977659646, + "grad_norm": 1.8520426373676713, + "learning_rate": 6.664031925847908e-06, + "loss": 0.4184, + "step": 762 + }, + { + "epoch": 0.4107395195478097, + "grad_norm": 2.2658987317474195, + "learning_rate": 6.6558047822358975e-06, + "loss": 0.5178, + "step": 763 + }, + { + "epoch": 0.4112778413296548, + "grad_norm": 1.580321228406977, + "learning_rate": 6.6475726003549934e-06, + 
"loss": 0.4249, + "step": 764 + }, + { + "epoch": 0.4118161631114999, + "grad_norm": 1.4077736219835957, + "learning_rate": 6.639335405254008e-06, + "loss": 0.4586, + "step": 765 + }, + { + "epoch": 0.412354484893345, + "grad_norm": 1.5112139801178681, + "learning_rate": 6.631093221997012e-06, + "loss": 0.4316, + "step": 766 + }, + { + "epoch": 0.41289280667519007, + "grad_norm": 1.4529648200398257, + "learning_rate": 6.6228460756632496e-06, + "loss": 0.4571, + "step": 767 + }, + { + "epoch": 0.41343112845703517, + "grad_norm": 1.826148495373045, + "learning_rate": 6.61459399134707e-06, + "loss": 0.4278, + "step": 768 + }, + { + "epoch": 0.41396945023888027, + "grad_norm": 1.5179851185666227, + "learning_rate": 6.6063369941578445e-06, + "loss": 0.4622, + "step": 769 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 1.3529363726674315, + "learning_rate": 6.5980751092198955e-06, + "loss": 0.4215, + "step": 770 + }, + { + "epoch": 0.41504609380257046, + "grad_norm": 2.0731132539429944, + "learning_rate": 6.589808361672417e-06, + "loss": 0.484, + "step": 771 + }, + { + "epoch": 0.4155844155844156, + "grad_norm": 1.4870501106627148, + "learning_rate": 6.581536776669402e-06, + "loss": 0.4863, + "step": 772 + }, + { + "epoch": 0.4161227373662607, + "grad_norm": 1.9062099501037697, + "learning_rate": 6.5732603793795535e-06, + "loss": 0.4238, + "step": 773 + }, + { + "epoch": 0.4166610591481058, + "grad_norm": 1.5565227999579219, + "learning_rate": 6.564979194986229e-06, + "loss": 0.4524, + "step": 774 + }, + { + "epoch": 0.4171993809299509, + "grad_norm": 2.306172957615922, + "learning_rate": 6.5566932486873455e-06, + "loss": 0.4964, + "step": 775 + }, + { + "epoch": 0.417737702711796, + "grad_norm": 1.401583156601946, + "learning_rate": 6.54840256569531e-06, + "loss": 0.4304, + "step": 776 + }, + { + "epoch": 0.4182760244936411, + "grad_norm": 1.749412909981746, + "learning_rate": 6.540107171236943e-06, + "loss": 0.4844, + "step": 777 + }, + { + "epoch": 
0.4188143462754862, + "grad_norm": 1.6322807652870075, + "learning_rate": 6.531807090553402e-06, + "loss": 0.4853, + "step": 778 + }, + { + "epoch": 0.4193526680573313, + "grad_norm": 1.2479234535295218, + "learning_rate": 6.5235023489001046e-06, + "loss": 0.4491, + "step": 779 + }, + { + "epoch": 0.4198909898391764, + "grad_norm": 1.5833625576839316, + "learning_rate": 6.515192971546645e-06, + "loss": 0.4171, + "step": 780 + }, + { + "epoch": 0.4198909898391764, + "eval_loss": 0.4564184546470642, + "eval_runtime": 1517.3821, + "eval_samples_per_second": 16.482, + "eval_steps_per_second": 0.515, + "step": 780 + }, + { + "epoch": 0.4204293116210215, + "grad_norm": 1.5809122747897906, + "learning_rate": 6.50687898377673e-06, + "loss": 0.4087, + "step": 781 + }, + { + "epoch": 0.4209676334028666, + "grad_norm": 1.5387429096209948, + "learning_rate": 6.49856041088809e-06, + "loss": 0.4414, + "step": 782 + }, + { + "epoch": 0.4215059551847117, + "grad_norm": 1.6020701369523538, + "learning_rate": 6.49023727819241e-06, + "loss": 0.4237, + "step": 783 + }, + { + "epoch": 0.4220442769665568, + "grad_norm": 1.6896383664306511, + "learning_rate": 6.481909611015249e-06, + "loss": 0.5049, + "step": 784 + }, + { + "epoch": 0.42258259874840187, + "grad_norm": 1.4623261927757227, + "learning_rate": 6.47357743469596e-06, + "loss": 0.4513, + "step": 785 + }, + { + "epoch": 0.42312092053024697, + "grad_norm": 1.8063028002015338, + "learning_rate": 6.465240774587623e-06, + "loss": 0.4917, + "step": 786 + }, + { + "epoch": 0.42365924231209207, + "grad_norm": 1.639390083578586, + "learning_rate": 6.4568996560569515e-06, + "loss": 0.4578, + "step": 787 + }, + { + "epoch": 0.42419756409393716, + "grad_norm": 1.337761070121856, + "learning_rate": 6.448554104484236e-06, + "loss": 0.4523, + "step": 788 + }, + { + "epoch": 0.42473588587578226, + "grad_norm": 1.518872556678575, + "learning_rate": 6.44020414526325e-06, + "loss": 0.4384, + "step": 789 + }, + { + "epoch": 0.42527420765762736, + 
"grad_norm": 1.491028002743192, + "learning_rate": 6.431849803801179e-06, + "loss": 0.451, + "step": 790 + }, + { + "epoch": 0.42581252943947245, + "grad_norm": 2.093042650030991, + "learning_rate": 6.423491105518542e-06, + "loss": 0.4656, + "step": 791 + }, + { + "epoch": 0.42635085122131755, + "grad_norm": 1.9063256309499805, + "learning_rate": 6.415128075849118e-06, + "loss": 0.4848, + "step": 792 + }, + { + "epoch": 0.42688917300316265, + "grad_norm": 1.7660120890204227, + "learning_rate": 6.4067607402398625e-06, + "loss": 0.4451, + "step": 793 + }, + { + "epoch": 0.42742749478500774, + "grad_norm": 1.577961253859089, + "learning_rate": 6.398389124150832e-06, + "loss": 0.485, + "step": 794 + }, + { + "epoch": 0.42796581656685284, + "grad_norm": 1.6746798086361996, + "learning_rate": 6.3900132530551125e-06, + "loss": 0.4521, + "step": 795 + }, + { + "epoch": 0.42850413834869794, + "grad_norm": 1.696615006593536, + "learning_rate": 6.381633152438733e-06, + "loss": 0.4406, + "step": 796 + }, + { + "epoch": 0.42904246013054304, + "grad_norm": 3.213801364228645, + "learning_rate": 6.373248847800595e-06, + "loss": 0.5115, + "step": 797 + }, + { + "epoch": 0.42958078191238813, + "grad_norm": 1.719986070739237, + "learning_rate": 6.364860364652388e-06, + "loss": 0.4237, + "step": 798 + }, + { + "epoch": 0.43011910369423323, + "grad_norm": 1.778509802687885, + "learning_rate": 6.3564677285185196e-06, + "loss": 0.4568, + "step": 799 + }, + { + "epoch": 0.4306574254760783, + "grad_norm": 1.5260126863179546, + "learning_rate": 6.348070964936032e-06, + "loss": 0.4337, + "step": 800 + }, + { + "epoch": 0.4311957472579234, + "grad_norm": 1.5937231247097972, + "learning_rate": 6.339670099454526e-06, + "loss": 0.4642, + "step": 801 + }, + { + "epoch": 0.4317340690397685, + "grad_norm": 2.9535392042792465, + "learning_rate": 6.3312651576360866e-06, + "loss": 0.4434, + "step": 802 + }, + { + "epoch": 0.4322723908216136, + "grad_norm": 1.49472223900728, + "learning_rate": 
6.322856165055198e-06, + "loss": 0.4125, + "step": 803 + }, + { + "epoch": 0.4328107126034587, + "grad_norm": 2.242176131558003, + "learning_rate": 6.314443147298675e-06, + "loss": 0.49, + "step": 804 + }, + { + "epoch": 0.4333490343853038, + "grad_norm": 1.681655235385771, + "learning_rate": 6.306026129965573e-06, + "loss": 0.4245, + "step": 805 + }, + { + "epoch": 0.4338873561671489, + "grad_norm": 1.5909295811480582, + "learning_rate": 6.297605138667127e-06, + "loss": 0.4748, + "step": 806 + }, + { + "epoch": 0.434425677948994, + "grad_norm": 1.5145278838582474, + "learning_rate": 6.289180199026654e-06, + "loss": 0.4578, + "step": 807 + }, + { + "epoch": 0.4349639997308391, + "grad_norm": 1.459737051246134, + "learning_rate": 6.280751336679495e-06, + "loss": 0.4637, + "step": 808 + }, + { + "epoch": 0.4355023215126842, + "grad_norm": 1.6191142290587295, + "learning_rate": 6.2723185772729166e-06, + "loss": 0.4582, + "step": 809 + }, + { + "epoch": 0.4360406432945293, + "grad_norm": 2.0040844342157422, + "learning_rate": 6.263881946466049e-06, + "loss": 0.4783, + "step": 810 + }, + { + "epoch": 0.4365789650763744, + "grad_norm": 1.7322826082498741, + "learning_rate": 6.255441469929804e-06, + "loss": 0.5002, + "step": 811 + }, + { + "epoch": 0.4371172868582195, + "grad_norm": 1.4894619670010198, + "learning_rate": 6.2469971733467925e-06, + "loss": 0.4253, + "step": 812 + }, + { + "epoch": 0.4376556086400646, + "grad_norm": 1.6488111913669299, + "learning_rate": 6.238549082411247e-06, + "loss": 0.4539, + "step": 813 + }, + { + "epoch": 0.4381939304219097, + "grad_norm": 1.3488898562178637, + "learning_rate": 6.230097222828949e-06, + "loss": 0.4623, + "step": 814 + }, + { + "epoch": 0.4387322522037548, + "grad_norm": 1.6423043283763479, + "learning_rate": 6.221641620317147e-06, + "loss": 0.4921, + "step": 815 + }, + { + "epoch": 0.4392705739855999, + "grad_norm": 1.9335639612379423, + "learning_rate": 6.2131823006044756e-06, + "loss": 0.4453, + "step": 816 + }, + { + 
"epoch": 0.439808895767445, + "grad_norm": 1.389152591337612, + "learning_rate": 6.2047192894308815e-06, + "loss": 0.4413, + "step": 817 + }, + { + "epoch": 0.44034721754929007, + "grad_norm": 1.983305422880984, + "learning_rate": 6.196252612547545e-06, + "loss": 0.5093, + "step": 818 + }, + { + "epoch": 0.44088553933113517, + "grad_norm": 2.053814295705837, + "learning_rate": 6.187782295716802e-06, + "loss": 0.4381, + "step": 819 + }, + { + "epoch": 0.44142386111298026, + "grad_norm": 1.547864349515979, + "learning_rate": 6.179308364712056e-06, + "loss": 0.4932, + "step": 820 + }, + { + "epoch": 0.44196218289482536, + "grad_norm": 1.4111506897228125, + "learning_rate": 6.170830845317717e-06, + "loss": 0.4695, + "step": 821 + }, + { + "epoch": 0.44250050467667046, + "grad_norm": 2.5994615269947485, + "learning_rate": 6.162349763329109e-06, + "loss": 0.5318, + "step": 822 + }, + { + "epoch": 0.44303882645851556, + "grad_norm": 1.5802737203663468, + "learning_rate": 6.153865144552398e-06, + "loss": 0.4676, + "step": 823 + }, + { + "epoch": 0.44357714824036065, + "grad_norm": 1.4711770748421387, + "learning_rate": 6.145377014804509e-06, + "loss": 0.4687, + "step": 824 + }, + { + "epoch": 0.44411547002220575, + "grad_norm": 1.3383114582462243, + "learning_rate": 6.136885399913052e-06, + "loss": 0.4514, + "step": 825 + }, + { + "epoch": 0.44465379180405085, + "grad_norm": 1.375700143244168, + "learning_rate": 6.1283903257162434e-06, + "loss": 0.4581, + "step": 826 + }, + { + "epoch": 0.44519211358589594, + "grad_norm": 1.6933351988143874, + "learning_rate": 6.119891818062822e-06, + "loss": 0.4399, + "step": 827 + }, + { + "epoch": 0.44573043536774104, + "grad_norm": 1.4137670063234855, + "learning_rate": 6.1113899028119764e-06, + "loss": 0.4298, + "step": 828 + }, + { + "epoch": 0.4462687571495862, + "grad_norm": 1.8781325581931287, + "learning_rate": 6.102884605833262e-06, + "loss": 0.4921, + "step": 829 + }, + { + "epoch": 0.4468070789314313, + "grad_norm": 
1.5329498351981126, + "learning_rate": 6.094375953006527e-06, + "loss": 0.4518, + "step": 830 + }, + { + "epoch": 0.4473454007132764, + "grad_norm": 1.6692806133274172, + "learning_rate": 6.085863970221827e-06, + "loss": 0.5337, + "step": 831 + }, + { + "epoch": 0.4478837224951215, + "grad_norm": 1.5092683621943173, + "learning_rate": 6.077348683379351e-06, + "loss": 0.4578, + "step": 832 + }, + { + "epoch": 0.4484220442769666, + "grad_norm": 1.6510945855973929, + "learning_rate": 6.068830118389345e-06, + "loss": 0.479, + "step": 833 + }, + { + "epoch": 0.4489603660588117, + "grad_norm": 2.639396623007194, + "learning_rate": 6.060308301172026e-06, + "loss": 0.451, + "step": 834 + }, + { + "epoch": 0.4494986878406568, + "grad_norm": 1.8709014826106682, + "learning_rate": 6.051783257657508e-06, + "loss": 0.5109, + "step": 835 + }, + { + "epoch": 0.45003700962250187, + "grad_norm": 2.1325245569205284, + "learning_rate": 6.04325501378572e-06, + "loss": 0.4874, + "step": 836 + }, + { + "epoch": 0.45057533140434697, + "grad_norm": 1.4972184191802396, + "learning_rate": 6.034723595506334e-06, + "loss": 0.4671, + "step": 837 + }, + { + "epoch": 0.45111365318619207, + "grad_norm": 1.3179174814289414, + "learning_rate": 6.026189028778675e-06, + "loss": 0.4078, + "step": 838 + }, + { + "epoch": 0.45165197496803716, + "grad_norm": 1.521198968359238, + "learning_rate": 6.017651339571652e-06, + "loss": 0.4456, + "step": 839 + }, + { + "epoch": 0.45219029674988226, + "grad_norm": 1.4836797423023151, + "learning_rate": 6.009110553863674e-06, + "loss": 0.4497, + "step": 840 + }, + { + "epoch": 0.45219029674988226, + "eval_loss": 0.4534289836883545, + "eval_runtime": 1525.9354, + "eval_samples_per_second": 16.39, + "eval_steps_per_second": 0.512, + "step": 840 + }, + { + "epoch": 0.45272861853172736, + "grad_norm": 1.808617433298175, + "learning_rate": 6.000566697642575e-06, + "loss": 0.435, + "step": 841 + }, + { + "epoch": 0.45326694031357245, + "grad_norm": 2.008290454012663, + 
"learning_rate": 5.992019796905524e-06, + "loss": 0.4626, + "step": 842 + }, + { + "epoch": 0.45380526209541755, + "grad_norm": 1.7710157949578111, + "learning_rate": 5.9834698776589614e-06, + "loss": 0.4311, + "step": 843 + }, + { + "epoch": 0.45434358387726265, + "grad_norm": 1.6230775011015806, + "learning_rate": 5.9749169659185104e-06, + "loss": 0.4693, + "step": 844 + }, + { + "epoch": 0.45488190565910774, + "grad_norm": 1.3639464284433171, + "learning_rate": 5.966361087708898e-06, + "loss": 0.4658, + "step": 845 + }, + { + "epoch": 0.45542022744095284, + "grad_norm": 1.8137146027163404, + "learning_rate": 5.957802269063878e-06, + "loss": 0.4567, + "step": 846 + }, + { + "epoch": 0.45595854922279794, + "grad_norm": 1.6758956331351547, + "learning_rate": 5.949240536026153e-06, + "loss": 0.467, + "step": 847 + }, + { + "epoch": 0.45649687100464303, + "grad_norm": 1.5131926980070547, + "learning_rate": 5.940675914647293e-06, + "loss": 0.4106, + "step": 848 + }, + { + "epoch": 0.45703519278648813, + "grad_norm": 1.5046633719884865, + "learning_rate": 5.9321084309876555e-06, + "loss": 0.4282, + "step": 849 + }, + { + "epoch": 0.45757351456833323, + "grad_norm": 1.6481158877878923, + "learning_rate": 5.923538111116307e-06, + "loss": 0.4414, + "step": 850 + }, + { + "epoch": 0.4581118363501783, + "grad_norm": 2.175705374474076, + "learning_rate": 5.914964981110944e-06, + "loss": 0.5038, + "step": 851 + }, + { + "epoch": 0.4586501581320234, + "grad_norm": 1.748850851161863, + "learning_rate": 5.906389067057819e-06, + "loss": 0.4603, + "step": 852 + }, + { + "epoch": 0.4591884799138685, + "grad_norm": 1.5440809581743327, + "learning_rate": 5.897810395051646e-06, + "loss": 0.4697, + "step": 853 + }, + { + "epoch": 0.4597268016957136, + "grad_norm": 1.5332714275032744, + "learning_rate": 5.889228991195539e-06, + "loss": 0.4549, + "step": 854 + }, + { + "epoch": 0.4602651234775587, + "grad_norm": 1.6246537267152152, + "learning_rate": 5.880644881600921e-06, + "loss": 
0.4413, + "step": 855 + }, + { + "epoch": 0.4608034452594038, + "grad_norm": 1.7384003721983572, + "learning_rate": 5.872058092387449e-06, + "loss": 0.5178, + "step": 856 + }, + { + "epoch": 0.4613417670412489, + "grad_norm": 1.4306474231507047, + "learning_rate": 5.863468649682933e-06, + "loss": 0.4584, + "step": 857 + }, + { + "epoch": 0.461880088823094, + "grad_norm": 1.7487008875581123, + "learning_rate": 5.8548765796232565e-06, + "loss": 0.4775, + "step": 858 + }, + { + "epoch": 0.4624184106049391, + "grad_norm": 1.6200058585564832, + "learning_rate": 5.846281908352299e-06, + "loss": 0.4718, + "step": 859 + }, + { + "epoch": 0.4629567323867842, + "grad_norm": 1.4993582658806037, + "learning_rate": 5.837684662021856e-06, + "loss": 0.4367, + "step": 860 + }, + { + "epoch": 0.4634950541686293, + "grad_norm": 1.6215871681690963, + "learning_rate": 5.829084866791551e-06, + "loss": 0.4891, + "step": 861 + }, + { + "epoch": 0.4640333759504744, + "grad_norm": 1.6479378578126422, + "learning_rate": 5.820482548828773e-06, + "loss": 0.4701, + "step": 862 + }, + { + "epoch": 0.4645716977323195, + "grad_norm": 1.709497613352161, + "learning_rate": 5.811877734308583e-06, + "loss": 0.4314, + "step": 863 + }, + { + "epoch": 0.4651100195141646, + "grad_norm": 1.850585526202356, + "learning_rate": 5.803270449413636e-06, + "loss": 0.4399, + "step": 864 + }, + { + "epoch": 0.4656483412960097, + "grad_norm": 1.4300437023045451, + "learning_rate": 5.7946607203341075e-06, + "loss": 0.4434, + "step": 865 + }, + { + "epoch": 0.4661866630778548, + "grad_norm": 1.4799373263095972, + "learning_rate": 5.786048573267608e-06, + "loss": 0.4065, + "step": 866 + }, + { + "epoch": 0.4667249848596999, + "grad_norm": 1.8869037850434587, + "learning_rate": 5.777434034419111e-06, + "loss": 0.4823, + "step": 867 + }, + { + "epoch": 0.467263306641545, + "grad_norm": 1.720619241457494, + "learning_rate": 5.768817130000857e-06, + "loss": 0.4444, + "step": 868 + }, + { + "epoch": 0.46780162842339007, + 
"grad_norm": 1.3809501342652182, + "learning_rate": 5.760197886232292e-06, + "loss": 0.4058, + "step": 869 + }, + { + "epoch": 0.46833995020523517, + "grad_norm": 1.6474446895806825, + "learning_rate": 5.75157632933998e-06, + "loss": 0.4244, + "step": 870 + }, + { + "epoch": 0.46887827198708026, + "grad_norm": 1.3347455312904397, + "learning_rate": 5.7429524855575216e-06, + "loss": 0.4509, + "step": 871 + }, + { + "epoch": 0.46941659376892536, + "grad_norm": 2.4700574740497583, + "learning_rate": 5.7343263811254746e-06, + "loss": 0.4078, + "step": 872 + }, + { + "epoch": 0.46995491555077046, + "grad_norm": 1.6808144924631037, + "learning_rate": 5.725698042291279e-06, + "loss": 0.445, + "step": 873 + }, + { + "epoch": 0.47049323733261555, + "grad_norm": 1.6561338534624221, + "learning_rate": 5.717067495309172e-06, + "loss": 0.4626, + "step": 874 + }, + { + "epoch": 0.47103155911446065, + "grad_norm": 1.4357104359447126, + "learning_rate": 5.708434766440109e-06, + "loss": 0.4253, + "step": 875 + }, + { + "epoch": 0.47156988089630575, + "grad_norm": 1.5584705980730198, + "learning_rate": 5.699799881951684e-06, + "loss": 0.4326, + "step": 876 + }, + { + "epoch": 0.47210820267815085, + "grad_norm": 1.6134096232268902, + "learning_rate": 5.691162868118052e-06, + "loss": 0.4361, + "step": 877 + }, + { + "epoch": 0.47264652445999594, + "grad_norm": 1.4597620039500387, + "learning_rate": 5.682523751219846e-06, + "loss": 0.4009, + "step": 878 + }, + { + "epoch": 0.47318484624184104, + "grad_norm": 1.6065681327100592, + "learning_rate": 5.673882557544098e-06, + "loss": 0.4859, + "step": 879 + }, + { + "epoch": 0.47372316802368614, + "grad_norm": 1.5207533993363942, + "learning_rate": 5.665239313384161e-06, + "loss": 0.4281, + "step": 880 + }, + { + "epoch": 0.47426148980553123, + "grad_norm": 1.4714029139534557, + "learning_rate": 5.656594045039623e-06, + "loss": 0.4364, + "step": 881 + }, + { + "epoch": 0.47479981158737633, + "grad_norm": 1.7055967072229654, + 
"learning_rate": 5.647946778816238e-06, + "loss": 0.5044, + "step": 882 + }, + { + "epoch": 0.4753381333692214, + "grad_norm": 1.7261543220071143, + "learning_rate": 5.639297541025831e-06, + "loss": 0.486, + "step": 883 + }, + { + "epoch": 0.4758764551510665, + "grad_norm": 1.6626927738024924, + "learning_rate": 5.630646357986232e-06, + "loss": 0.5142, + "step": 884 + }, + { + "epoch": 0.4764147769329116, + "grad_norm": 1.5653946306822688, + "learning_rate": 5.621993256021188e-06, + "loss": 0.4364, + "step": 885 + }, + { + "epoch": 0.4769530987147568, + "grad_norm": 1.8026208698346797, + "learning_rate": 5.613338261460287e-06, + "loss": 0.4538, + "step": 886 + }, + { + "epoch": 0.47749142049660187, + "grad_norm": 1.6799784860946594, + "learning_rate": 5.6046814006388705e-06, + "loss": 0.4644, + "step": 887 + }, + { + "epoch": 0.47802974227844697, + "grad_norm": 1.4364276865950356, + "learning_rate": 5.596022699897963e-06, + "loss": 0.4051, + "step": 888 + }, + { + "epoch": 0.47856806406029206, + "grad_norm": 1.6914469502870713, + "learning_rate": 5.587362185584189e-06, + "loss": 0.4871, + "step": 889 + }, + { + "epoch": 0.47910638584213716, + "grad_norm": 1.4415518156055118, + "learning_rate": 5.578699884049683e-06, + "loss": 0.4429, + "step": 890 + }, + { + "epoch": 0.47964470762398226, + "grad_norm": 1.4674935937695475, + "learning_rate": 5.570035821652029e-06, + "loss": 0.426, + "step": 891 + }, + { + "epoch": 0.48018302940582736, + "grad_norm": 2.1147351198112982, + "learning_rate": 5.561370024754161e-06, + "loss": 0.4789, + "step": 892 + }, + { + "epoch": 0.48072135118767245, + "grad_norm": 1.4253127193278772, + "learning_rate": 5.552702519724294e-06, + "loss": 0.4346, + "step": 893 + }, + { + "epoch": 0.48125967296951755, + "grad_norm": 3.7503200169998676, + "learning_rate": 5.544033332935838e-06, + "loss": 0.4393, + "step": 894 + }, + { + "epoch": 0.48179799475136265, + "grad_norm": 2.1079137772003818, + "learning_rate": 5.535362490767323e-06, + "loss": 
0.5118, + "step": 895 + }, + { + "epoch": 0.48233631653320774, + "grad_norm": 2.2185325950005477, + "learning_rate": 5.526690019602315e-06, + "loss": 0.3894, + "step": 896 + }, + { + "epoch": 0.48287463831505284, + "grad_norm": 1.5274617672885367, + "learning_rate": 5.518015945829337e-06, + "loss": 0.42, + "step": 897 + }, + { + "epoch": 0.48341296009689794, + "grad_norm": 1.622273471984762, + "learning_rate": 5.509340295841785e-06, + "loss": 0.5112, + "step": 898 + }, + { + "epoch": 0.48395128187874303, + "grad_norm": 1.5776105686627353, + "learning_rate": 5.500663096037856e-06, + "loss": 0.4577, + "step": 899 + }, + { + "epoch": 0.48448960366058813, + "grad_norm": 1.4494216604414056, + "learning_rate": 5.491984372820461e-06, + "loss": 0.4585, + "step": 900 + }, + { + "epoch": 0.48448960366058813, + "eval_loss": 0.4497644305229187, + "eval_runtime": 1526.5252, + "eval_samples_per_second": 16.384, + "eval_steps_per_second": 0.512, + "step": 900 + }, + { + "epoch": 0.4850279254424332, + "grad_norm": 1.5164622603897875, + "learning_rate": 5.483304152597145e-06, + "loss": 0.4488, + "step": 901 + }, + { + "epoch": 0.4855662472242783, + "grad_norm": 1.5363015107046971, + "learning_rate": 5.474622461780011e-06, + "loss": 0.424, + "step": 902 + }, + { + "epoch": 0.4861045690061234, + "grad_norm": 1.5955517741757022, + "learning_rate": 5.465939326785634e-06, + "loss": 0.4544, + "step": 903 + }, + { + "epoch": 0.4866428907879685, + "grad_norm": 1.879614888686265, + "learning_rate": 5.457254774034983e-06, + "loss": 0.5032, + "step": 904 + }, + { + "epoch": 0.4871812125698136, + "grad_norm": 1.5621620080191398, + "learning_rate": 5.448568829953344e-06, + "loss": 0.4675, + "step": 905 + }, + { + "epoch": 0.4877195343516587, + "grad_norm": 1.463009731317384, + "learning_rate": 5.439881520970234e-06, + "loss": 0.5112, + "step": 906 + }, + { + "epoch": 0.4882578561335038, + "grad_norm": 1.4309448662315376, + "learning_rate": 5.431192873519326e-06, + "loss": 0.4532, + "step": 907 
+ }, + { + "epoch": 0.4887961779153489, + "grad_norm": 1.8077348129923718, + "learning_rate": 5.422502914038359e-06, + "loss": 0.4498, + "step": 908 + }, + { + "epoch": 0.489334499697194, + "grad_norm": 1.770786349097794, + "learning_rate": 5.413811668969072e-06, + "loss": 0.5081, + "step": 909 + }, + { + "epoch": 0.4898728214790391, + "grad_norm": 1.911624959064584, + "learning_rate": 5.4051191647571126e-06, + "loss": 0.4297, + "step": 910 + }, + { + "epoch": 0.4904111432608842, + "grad_norm": 2.238598280094612, + "learning_rate": 5.396425427851958e-06, + "loss": 0.4722, + "step": 911 + }, + { + "epoch": 0.4909494650427293, + "grad_norm": 1.7184560772593453, + "learning_rate": 5.387730484706839e-06, + "loss": 0.4778, + "step": 912 + }, + { + "epoch": 0.4914877868245744, + "grad_norm": 1.452205930174256, + "learning_rate": 5.3790343617786555e-06, + "loss": 0.4233, + "step": 913 + }, + { + "epoch": 0.4920261086064195, + "grad_norm": 1.6315132839706739, + "learning_rate": 5.3703370855278995e-06, + "loss": 0.4429, + "step": 914 + }, + { + "epoch": 0.4925644303882646, + "grad_norm": 2.1202501474227984, + "learning_rate": 5.361638682418565e-06, + "loss": 0.461, + "step": 915 + }, + { + "epoch": 0.4931027521701097, + "grad_norm": 1.4850726589476337, + "learning_rate": 5.352939178918084e-06, + "loss": 0.5053, + "step": 916 + }, + { + "epoch": 0.4936410739519548, + "grad_norm": 2.5715760460764505, + "learning_rate": 5.344238601497231e-06, + "loss": 0.523, + "step": 917 + }, + { + "epoch": 0.4941793957337999, + "grad_norm": 1.6641597075498922, + "learning_rate": 5.335536976630052e-06, + "loss": 0.4452, + "step": 918 + }, + { + "epoch": 0.494717717515645, + "grad_norm": 1.579954501546705, + "learning_rate": 5.326834330793775e-06, + "loss": 0.4365, + "step": 919 + }, + { + "epoch": 0.49525603929749007, + "grad_norm": 1.8639771696751175, + "learning_rate": 5.318130690468741e-06, + "loss": 0.4956, + "step": 920 + }, + { + "epoch": 0.49579436107933517, + "grad_norm": 
1.6264721082016091, + "learning_rate": 5.309426082138311e-06, + "loss": 0.4592, + "step": 921 + }, + { + "epoch": 0.49633268286118026, + "grad_norm": 1.624012882860616, + "learning_rate": 5.300720532288798e-06, + "loss": 0.437, + "step": 922 + }, + { + "epoch": 0.49687100464302536, + "grad_norm": 1.6131788103239653, + "learning_rate": 5.29201406740937e-06, + "loss": 0.4335, + "step": 923 + }, + { + "epoch": 0.49740932642487046, + "grad_norm": 1.4350753111666732, + "learning_rate": 5.28330671399199e-06, + "loss": 0.4462, + "step": 924 + }, + { + "epoch": 0.49794764820671555, + "grad_norm": 1.9075044926150524, + "learning_rate": 5.274598498531318e-06, + "loss": 0.5123, + "step": 925 + }, + { + "epoch": 0.49848596998856065, + "grad_norm": 2.2955162228107233, + "learning_rate": 5.265889447524641e-06, + "loss": 0.4649, + "step": 926 + }, + { + "epoch": 0.49902429177040575, + "grad_norm": 1.8752294916309997, + "learning_rate": 5.257179587471784e-06, + "loss": 0.4339, + "step": 927 + }, + { + "epoch": 0.49956261355225084, + "grad_norm": 1.776206864828494, + "learning_rate": 5.248468944875036e-06, + "loss": 0.4047, + "step": 928 + }, + { + "epoch": 0.5001009353340959, + "grad_norm": 1.6863520776370677, + "learning_rate": 5.239757546239069e-06, + "loss": 0.4041, + "step": 929 + }, + { + "epoch": 0.500639257115941, + "grad_norm": 1.6004117617835396, + "learning_rate": 5.231045418070852e-06, + "loss": 0.4026, + "step": 930 + }, + { + "epoch": 0.5011775788977861, + "grad_norm": 1.6497898215404967, + "learning_rate": 5.222332586879576e-06, + "loss": 0.4953, + "step": 931 + }, + { + "epoch": 0.5017159006796312, + "grad_norm": 1.6264336562152901, + "learning_rate": 5.2136190791765714e-06, + "loss": 0.4697, + "step": 932 + }, + { + "epoch": 0.5022542224614763, + "grad_norm": 1.4687648507656423, + "learning_rate": 5.204904921475226e-06, + "loss": 0.4608, + "step": 933 + }, + { + "epoch": 0.5027925442433214, + "grad_norm": 1.555407852307028, + "learning_rate": 5.196190140290905e-06, 
+ "loss": 0.4191, + "step": 934 + }, + { + "epoch": 0.5033308660251665, + "grad_norm": 1.6926089059266405, + "learning_rate": 5.1874747621408705e-06, + "loss": 0.4034, + "step": 935 + }, + { + "epoch": 0.5038691878070116, + "grad_norm": 1.5853166612648868, + "learning_rate": 5.178758813544203e-06, + "loss": 0.4288, + "step": 936 + }, + { + "epoch": 0.5044075095888567, + "grad_norm": 1.5462488708677307, + "learning_rate": 5.170042321021721e-06, + "loss": 0.5049, + "step": 937 + }, + { + "epoch": 0.5049458313707018, + "grad_norm": 1.6860561151031408, + "learning_rate": 5.161325311095889e-06, + "loss": 0.4673, + "step": 938 + }, + { + "epoch": 0.5054841531525469, + "grad_norm": 1.603506680608381, + "learning_rate": 5.1526078102907565e-06, + "loss": 0.4613, + "step": 939 + }, + { + "epoch": 0.506022474934392, + "grad_norm": 1.7493626988274396, + "learning_rate": 5.143889845131859e-06, + "loss": 0.4563, + "step": 940 + }, + { + "epoch": 0.5065607967162371, + "grad_norm": 1.7677497007408356, + "learning_rate": 5.135171442146147e-06, + "loss": 0.4389, + "step": 941 + }, + { + "epoch": 0.5070991184980822, + "grad_norm": 1.7686507376112643, + "learning_rate": 5.126452627861906e-06, + "loss": 0.469, + "step": 942 + }, + { + "epoch": 0.5076374402799273, + "grad_norm": 2.03881052798833, + "learning_rate": 5.117733428808671e-06, + "loss": 0.473, + "step": 943 + }, + { + "epoch": 0.5081757620617724, + "grad_norm": 1.5924723958151055, + "learning_rate": 5.109013871517148e-06, + "loss": 0.4449, + "step": 944 + }, + { + "epoch": 0.5087140838436175, + "grad_norm": 1.787982594535362, + "learning_rate": 5.10029398251913e-06, + "loss": 0.4575, + "step": 945 + }, + { + "epoch": 0.5092524056254626, + "grad_norm": 1.8443122029947836, + "learning_rate": 5.091573788347424e-06, + "loss": 0.4825, + "step": 946 + }, + { + "epoch": 0.5097907274073077, + "grad_norm": 1.5660114035251782, + "learning_rate": 5.082853315535764e-06, + "loss": 0.4705, + "step": 947 + }, + { + "epoch": 
0.5103290491891528, + "grad_norm": 1.4015195298555256, + "learning_rate": 5.074132590618731e-06, + "loss": 0.4222, + "step": 948 + }, + { + "epoch": 0.5108673709709979, + "grad_norm": 1.6261999654731143, + "learning_rate": 5.065411640131672e-06, + "loss": 0.4172, + "step": 949 + }, + { + "epoch": 0.511405692752843, + "grad_norm": 1.6580955314247148, + "learning_rate": 5.0566904906106254e-06, + "loss": 0.4803, + "step": 950 + }, + { + "epoch": 0.5119440145346881, + "grad_norm": 1.6882580545035042, + "learning_rate": 5.047969168592229e-06, + "loss": 0.4959, + "step": 951 + }, + { + "epoch": 0.5124823363165332, + "grad_norm": 1.2734853203083423, + "learning_rate": 5.039247700613649e-06, + "loss": 0.4532, + "step": 952 + }, + { + "epoch": 0.5130206580983783, + "grad_norm": 1.6598696282615735, + "learning_rate": 5.030526113212494e-06, + "loss": 0.4443, + "step": 953 + }, + { + "epoch": 0.5135589798802234, + "grad_norm": 1.555381309193185, + "learning_rate": 5.021804432926739e-06, + "loss": 0.4704, + "step": 954 + }, + { + "epoch": 0.5140973016620685, + "grad_norm": 1.5525351037863324, + "learning_rate": 5.013082686294639e-06, + "loss": 0.4373, + "step": 955 + }, + { + "epoch": 0.5146356234439136, + "grad_norm": 1.5575470355469987, + "learning_rate": 5.00436089985465e-06, + "loss": 0.4242, + "step": 956 + }, + { + "epoch": 0.5151739452257587, + "grad_norm": 1.7457061624641392, + "learning_rate": 4.995639100145352e-06, + "loss": 0.4685, + "step": 957 + }, + { + "epoch": 0.5157122670076038, + "grad_norm": 1.6284837184280405, + "learning_rate": 4.9869173137053625e-06, + "loss": 0.4702, + "step": 958 + }, + { + "epoch": 0.5162505887894488, + "grad_norm": 2.191085743474062, + "learning_rate": 4.978195567073262e-06, + "loss": 0.5185, + "step": 959 + }, + { + "epoch": 0.516788910571294, + "grad_norm": 1.5407588424547343, + "learning_rate": 4.969473886787507e-06, + "loss": 0.505, + "step": 960 + }, + { + "epoch": 0.516788910571294, + "eval_loss": 0.44528621435165405, + 
"eval_runtime": 1532.2971, + "eval_samples_per_second": 16.322, + "eval_steps_per_second": 0.51, + "step": 960 + }, + { + "epoch": 0.517327232353139, + "grad_norm": 1.7214959560480187, + "learning_rate": 4.960752299386353e-06, + "loss": 0.4826, + "step": 961 + }, + { + "epoch": 0.5178655541349841, + "grad_norm": 1.5649628360297678, + "learning_rate": 4.9520308314077726e-06, + "loss": 0.4224, + "step": 962 + }, + { + "epoch": 0.5184038759168292, + "grad_norm": 1.6424636557347856, + "learning_rate": 4.943309509389377e-06, + "loss": 0.4148, + "step": 963 + }, + { + "epoch": 0.5189421976986743, + "grad_norm": 1.98993484637264, + "learning_rate": 4.934588359868329e-06, + "loss": 0.4307, + "step": 964 + }, + { + "epoch": 0.5194805194805194, + "grad_norm": 2.0804456077787123, + "learning_rate": 4.92586740938127e-06, + "loss": 0.4108, + "step": 965 + }, + { + "epoch": 0.5200188412623645, + "grad_norm": 1.748710199317067, + "learning_rate": 4.917146684464238e-06, + "loss": 0.4567, + "step": 966 + }, + { + "epoch": 0.5205571630442096, + "grad_norm": 1.4755067360374794, + "learning_rate": 4.908426211652577e-06, + "loss": 0.4523, + "step": 967 + }, + { + "epoch": 0.5210954848260547, + "grad_norm": 1.6340640272431366, + "learning_rate": 4.899706017480872e-06, + "loss": 0.4697, + "step": 968 + }, + { + "epoch": 0.5216338066078998, + "grad_norm": 1.5338487326156454, + "learning_rate": 4.890986128482854e-06, + "loss": 0.4108, + "step": 969 + }, + { + "epoch": 0.5221721283897449, + "grad_norm": 1.4204187507894679, + "learning_rate": 4.88226657119133e-06, + "loss": 0.4175, + "step": 970 + }, + { + "epoch": 0.52271045017159, + "grad_norm": 1.4916766712552136, + "learning_rate": 4.873547372138095e-06, + "loss": 0.4274, + "step": 971 + }, + { + "epoch": 0.5232487719534352, + "grad_norm": 1.514306526603469, + "learning_rate": 4.864828557853854e-06, + "loss": 0.4745, + "step": 972 + }, + { + "epoch": 0.5237870937352803, + "grad_norm": 1.774262113242822, + "learning_rate": 
4.856110154868143e-06, + "loss": 0.4172, + "step": 973 + }, + { + "epoch": 0.5243254155171254, + "grad_norm": 1.4311594537408503, + "learning_rate": 4.847392189709246e-06, + "loss": 0.4499, + "step": 974 + }, + { + "epoch": 0.5248637372989705, + "grad_norm": 2.045966100772589, + "learning_rate": 4.8386746889041116e-06, + "loss": 0.496, + "step": 975 + }, + { + "epoch": 0.5254020590808156, + "grad_norm": 1.3914439869095196, + "learning_rate": 4.82995767897828e-06, + "loss": 0.4068, + "step": 976 + }, + { + "epoch": 0.5259403808626607, + "grad_norm": 1.3260222946498679, + "learning_rate": 4.8212411864557975e-06, + "loss": 0.4344, + "step": 977 + }, + { + "epoch": 0.5264787026445058, + "grad_norm": 1.7672350290368148, + "learning_rate": 4.812525237859131e-06, + "loss": 0.4647, + "step": 978 + }, + { + "epoch": 0.5270170244263509, + "grad_norm": 1.5287264304361414, + "learning_rate": 4.803809859709097e-06, + "loss": 0.4406, + "step": 979 + }, + { + "epoch": 0.527555346208196, + "grad_norm": 1.5180822455976997, + "learning_rate": 4.795095078524775e-06, + "loss": 0.4462, + "step": 980 + }, + { + "epoch": 0.5280936679900411, + "grad_norm": 1.5390017294524125, + "learning_rate": 4.78638092082343e-06, + "loss": 0.4427, + "step": 981 + }, + { + "epoch": 0.5286319897718862, + "grad_norm": 1.8490518419390272, + "learning_rate": 4.777667413120425e-06, + "loss": 0.4716, + "step": 982 + }, + { + "epoch": 0.5291703115537313, + "grad_norm": 1.9241747880139426, + "learning_rate": 4.7689545819291484e-06, + "loss": 0.4471, + "step": 983 + }, + { + "epoch": 0.5297086333355764, + "grad_norm": 1.5723366516079713, + "learning_rate": 4.760242453760932e-06, + "loss": 0.3616, + "step": 984 + }, + { + "epoch": 0.5302469551174215, + "grad_norm": 2.125474240340618, + "learning_rate": 4.751531055124965e-06, + "loss": 0.4567, + "step": 985 + }, + { + "epoch": 0.5307852768992666, + "grad_norm": 1.5872857045985345, + "learning_rate": 4.742820412528217e-06, + "loss": 0.4311, + "step": 986 + }, + { + 
"epoch": 0.5313235986811117, + "grad_norm": 1.5991351116825514, + "learning_rate": 4.73411055247536e-06, + "loss": 0.4572, + "step": 987 + }, + { + "epoch": 0.5318619204629568, + "grad_norm": 1.5620726404348677, + "learning_rate": 4.725401501468683e-06, + "loss": 0.4299, + "step": 988 + }, + { + "epoch": 0.5324002422448019, + "grad_norm": 1.6599112973852914, + "learning_rate": 4.716693286008011e-06, + "loss": 0.4444, + "step": 989 + }, + { + "epoch": 0.532938564026647, + "grad_norm": 1.7825302359359856, + "learning_rate": 4.707985932590631e-06, + "loss": 0.4321, + "step": 990 + }, + { + "epoch": 0.5334768858084921, + "grad_norm": 1.5739707930921258, + "learning_rate": 4.699279467711204e-06, + "loss": 0.4567, + "step": 991 + }, + { + "epoch": 0.5340152075903372, + "grad_norm": 1.5857670482566744, + "learning_rate": 4.69057391786169e-06, + "loss": 0.4312, + "step": 992 + }, + { + "epoch": 0.5345535293721823, + "grad_norm": 1.3615110605746865, + "learning_rate": 4.68186930953126e-06, + "loss": 0.376, + "step": 993 + }, + { + "epoch": 0.5350918511540274, + "grad_norm": 1.4263273424189502, + "learning_rate": 4.673165669206226e-06, + "loss": 0.4424, + "step": 994 + }, + { + "epoch": 0.5356301729358725, + "grad_norm": 2.8748098476059933, + "learning_rate": 4.6644630233699495e-06, + "loss": 0.4828, + "step": 995 + }, + { + "epoch": 0.5361684947177175, + "grad_norm": 1.7530111025052908, + "learning_rate": 4.65576139850277e-06, + "loss": 0.4565, + "step": 996 + }, + { + "epoch": 0.5367068164995626, + "grad_norm": 1.625700838321751, + "learning_rate": 4.647060821081918e-06, + "loss": 0.4397, + "step": 997 + }, + { + "epoch": 0.5372451382814077, + "grad_norm": 1.7382100638812064, + "learning_rate": 4.638361317581437e-06, + "loss": 0.4701, + "step": 998 + }, + { + "epoch": 0.5377834600632528, + "grad_norm": 2.153555864190946, + "learning_rate": 4.629662914472103e-06, + "loss": 0.45, + "step": 999 + }, + { + "epoch": 0.5383217818450979, + "grad_norm": 1.6756544006397587, + 
"learning_rate": 4.620965638221346e-06, + "loss": 0.4373, + "step": 1000 + }, + { + "epoch": 0.538860103626943, + "grad_norm": 2.115872641463188, + "learning_rate": 4.612269515293162e-06, + "loss": 0.4807, + "step": 1001 + }, + { + "epoch": 0.5393984254087881, + "grad_norm": 1.7162266935661588, + "learning_rate": 4.603574572148043e-06, + "loss": 0.4231, + "step": 1002 + }, + { + "epoch": 0.5399367471906332, + "grad_norm": 1.828685276454168, + "learning_rate": 4.59488083524289e-06, + "loss": 0.4405, + "step": 1003 + }, + { + "epoch": 0.5404750689724783, + "grad_norm": 1.6864896839159536, + "learning_rate": 4.58618833103093e-06, + "loss": 0.4144, + "step": 1004 + }, + { + "epoch": 0.5410133907543234, + "grad_norm": 1.4876643937775926, + "learning_rate": 4.5774970859616426e-06, + "loss": 0.4628, + "step": 1005 + }, + { + "epoch": 0.5415517125361685, + "grad_norm": 1.5038750034441302, + "learning_rate": 4.568807126480676e-06, + "loss": 0.4595, + "step": 1006 + }, + { + "epoch": 0.5420900343180136, + "grad_norm": 1.3366252716503892, + "learning_rate": 4.560118479029768e-06, + "loss": 0.4447, + "step": 1007 + }, + { + "epoch": 0.5426283560998587, + "grad_norm": 1.5955474786951926, + "learning_rate": 4.5514311700466575e-06, + "loss": 0.4731, + "step": 1008 + }, + { + "epoch": 0.5431666778817038, + "grad_norm": 1.415371321661975, + "learning_rate": 4.5427452259650185e-06, + "loss": 0.4565, + "step": 1009 + }, + { + "epoch": 0.5437049996635489, + "grad_norm": 1.414837591715847, + "learning_rate": 4.534060673214367e-06, + "loss": 0.439, + "step": 1010 + }, + { + "epoch": 0.544243321445394, + "grad_norm": 1.6390543819341332, + "learning_rate": 4.525377538219991e-06, + "loss": 0.4434, + "step": 1011 + }, + { + "epoch": 0.5447816432272391, + "grad_norm": 1.9027726313032218, + "learning_rate": 4.516695847402857e-06, + "loss": 0.4841, + "step": 1012 + }, + { + "epoch": 0.5453199650090842, + "grad_norm": 1.6549184700101718, + "learning_rate": 4.50801562717954e-06, + "loss": 
0.4187, + "step": 1013 + }, + { + "epoch": 0.5458582867909293, + "grad_norm": 1.672495923944031, + "learning_rate": 4.499336903962146e-06, + "loss": 0.461, + "step": 1014 + }, + { + "epoch": 0.5463966085727744, + "grad_norm": 1.9002456572131434, + "learning_rate": 4.490659704158218e-06, + "loss": 0.4305, + "step": 1015 + }, + { + "epoch": 0.5469349303546195, + "grad_norm": 1.3438622389285284, + "learning_rate": 4.481984054170666e-06, + "loss": 0.4569, + "step": 1016 + }, + { + "epoch": 0.5474732521364646, + "grad_norm": 1.6738782134152472, + "learning_rate": 4.473309980397686e-06, + "loss": 0.4574, + "step": 1017 + }, + { + "epoch": 0.5480115739183097, + "grad_norm": 1.410079098904291, + "learning_rate": 4.464637509232679e-06, + "loss": 0.4616, + "step": 1018 + }, + { + "epoch": 0.5485498957001548, + "grad_norm": 1.5059024241541985, + "learning_rate": 4.455966667064164e-06, + "loss": 0.4257, + "step": 1019 + }, + { + "epoch": 0.5490882174819999, + "grad_norm": 1.8743979543800648, + "learning_rate": 4.447297480275708e-06, + "loss": 0.4468, + "step": 1020 + }, + { + "epoch": 0.5490882174819999, + "eval_loss": 0.44231292605400085, + "eval_runtime": 1542.3429, + "eval_samples_per_second": 16.216, + "eval_steps_per_second": 0.507, + "step": 1020 + }, + { + "epoch": 0.549626539263845, + "grad_norm": 2.326652305551719, + "learning_rate": 4.4386299752458405e-06, + "loss": 0.5123, + "step": 1021 + }, + { + "epoch": 0.5501648610456901, + "grad_norm": 1.5214313173590028, + "learning_rate": 4.429964178347973e-06, + "loss": 0.4525, + "step": 1022 + }, + { + "epoch": 0.5507031828275352, + "grad_norm": 1.578588355929213, + "learning_rate": 4.4213001159503185e-06, + "loss": 0.4511, + "step": 1023 + }, + { + "epoch": 0.5512415046093803, + "grad_norm": 1.5736153928065848, + "learning_rate": 4.4126378144158145e-06, + "loss": 0.402, + "step": 1024 + }, + { + "epoch": 0.5517798263912254, + "grad_norm": 1.4881049360513776, + "learning_rate": 4.4039773001020394e-06, + "loss": 0.4312, + 
"step": 1025 + }, + { + "epoch": 0.5523181481730705, + "grad_norm": 1.5453517436989277, + "learning_rate": 4.395318599361133e-06, + "loss": 0.4297, + "step": 1026 + }, + { + "epoch": 0.5528564699549156, + "grad_norm": 1.7401645944762647, + "learning_rate": 4.386661738539716e-06, + "loss": 0.4021, + "step": 1027 + }, + { + "epoch": 0.5533947917367606, + "grad_norm": 1.6594295806955806, + "learning_rate": 4.3780067439788125e-06, + "loss": 0.3936, + "step": 1028 + }, + { + "epoch": 0.5539331135186057, + "grad_norm": 1.4018911995650016, + "learning_rate": 4.3693536420137704e-06, + "loss": 0.4208, + "step": 1029 + }, + { + "epoch": 0.5544714353004508, + "grad_norm": 1.554369257290078, + "learning_rate": 4.360702458974172e-06, + "loss": 0.3869, + "step": 1030 + }, + { + "epoch": 0.5550097570822959, + "grad_norm": 1.7013778785431986, + "learning_rate": 4.3520532211837645e-06, + "loss": 0.4557, + "step": 1031 + }, + { + "epoch": 0.555548078864141, + "grad_norm": 1.5141795112180816, + "learning_rate": 4.343405954960378e-06, + "loss": 0.437, + "step": 1032 + }, + { + "epoch": 0.5560864006459861, + "grad_norm": 1.6876343830074998, + "learning_rate": 4.334760686615842e-06, + "loss": 0.4632, + "step": 1033 + }, + { + "epoch": 0.5566247224278312, + "grad_norm": 1.7137409506750598, + "learning_rate": 4.326117442455904e-06, + "loss": 0.451, + "step": 1034 + }, + { + "epoch": 0.5571630442096763, + "grad_norm": 2.2054388725094993, + "learning_rate": 4.3174762487801554e-06, + "loss": 0.4845, + "step": 1035 + }, + { + "epoch": 0.5577013659915214, + "grad_norm": 1.4514781472802996, + "learning_rate": 4.30883713188195e-06, + "loss": 0.4713, + "step": 1036 + }, + { + "epoch": 0.5582396877733665, + "grad_norm": 1.3155208362445518, + "learning_rate": 4.300200118048318e-06, + "loss": 0.4048, + "step": 1037 + }, + { + "epoch": 0.5587780095552116, + "grad_norm": 1.7594624250292574, + "learning_rate": 4.291565233559893e-06, + "loss": 0.4719, + "step": 1038 + }, + { + "epoch": 
0.5593163313370567, + "grad_norm": 1.5899320924503517, + "learning_rate": 4.282932504690829e-06, + "loss": 0.4889, + "step": 1039 + }, + { + "epoch": 0.5598546531189018, + "grad_norm": 1.5400899090595648, + "learning_rate": 4.274301957708723e-06, + "loss": 0.48, + "step": 1040 + }, + { + "epoch": 0.5603929749007469, + "grad_norm": 1.9340975529821163, + "learning_rate": 4.265673618874527e-06, + "loss": 0.4558, + "step": 1041 + }, + { + "epoch": 0.560931296682592, + "grad_norm": 1.1875057467361612, + "learning_rate": 4.257047514442481e-06, + "loss": 0.4308, + "step": 1042 + }, + { + "epoch": 0.5614696184644371, + "grad_norm": 1.7255919834039524, + "learning_rate": 4.248423670660022e-06, + "loss": 0.4637, + "step": 1043 + }, + { + "epoch": 0.5620079402462822, + "grad_norm": 1.552937296818888, + "learning_rate": 4.239802113767711e-06, + "loss": 0.5167, + "step": 1044 + }, + { + "epoch": 0.5625462620281273, + "grad_norm": 1.4241418668403774, + "learning_rate": 4.231182869999146e-06, + "loss": 0.4262, + "step": 1045 + }, + { + "epoch": 0.5630845838099724, + "grad_norm": 1.4079020132555902, + "learning_rate": 4.222565965580892e-06, + "loss": 0.4527, + "step": 1046 + }, + { + "epoch": 0.5636229055918175, + "grad_norm": 1.3617602268653886, + "learning_rate": 4.2139514267323925e-06, + "loss": 0.4546, + "step": 1047 + }, + { + "epoch": 0.5641612273736626, + "grad_norm": 1.5838734348735288, + "learning_rate": 4.205339279665895e-06, + "loss": 0.3903, + "step": 1048 + }, + { + "epoch": 0.5646995491555077, + "grad_norm": 1.451984176062728, + "learning_rate": 4.196729550586367e-06, + "loss": 0.4211, + "step": 1049 + }, + { + "epoch": 0.5652378709373528, + "grad_norm": 1.5454288468811321, + "learning_rate": 4.18812226569142e-06, + "loss": 0.3856, + "step": 1050 + }, + { + "epoch": 0.5657761927191979, + "grad_norm": 1.6143068691418476, + "learning_rate": 4.17951745117123e-06, + "loss": 0.4137, + "step": 1051 + }, + { + "epoch": 0.566314514501043, + "grad_norm": 1.5780823976901985, + 
"learning_rate": 4.170915133208452e-06, + "loss": 0.4402, + "step": 1052 + }, + { + "epoch": 0.5668528362828881, + "grad_norm": 1.4482990847613153, + "learning_rate": 4.162315337978148e-06, + "loss": 0.5056, + "step": 1053 + }, + { + "epoch": 0.5673911580647332, + "grad_norm": 1.534829858260644, + "learning_rate": 4.153718091647702e-06, + "loss": 0.4212, + "step": 1054 + }, + { + "epoch": 0.5679294798465783, + "grad_norm": 1.6872941151721794, + "learning_rate": 4.145123420376745e-06, + "loss": 0.4604, + "step": 1055 + }, + { + "epoch": 0.5684678016284234, + "grad_norm": 1.3923901318290877, + "learning_rate": 4.136531350317069e-06, + "loss": 0.4608, + "step": 1056 + }, + { + "epoch": 0.5690061234102685, + "grad_norm": 1.7627677860939457, + "learning_rate": 4.127941907612553e-06, + "loss": 0.4345, + "step": 1057 + }, + { + "epoch": 0.5695444451921136, + "grad_norm": 1.6236383393521263, + "learning_rate": 4.11935511839908e-06, + "loss": 0.4599, + "step": 1058 + }, + { + "epoch": 0.5700827669739587, + "grad_norm": 1.5390392661613181, + "learning_rate": 4.110771008804463e-06, + "loss": 0.4822, + "step": 1059 + }, + { + "epoch": 0.5706210887558038, + "grad_norm": 1.6460116304075034, + "learning_rate": 4.102189604948356e-06, + "loss": 0.4277, + "step": 1060 + }, + { + "epoch": 0.5711594105376488, + "grad_norm": 1.4089445870425645, + "learning_rate": 4.093610932942184e-06, + "loss": 0.4055, + "step": 1061 + }, + { + "epoch": 0.571697732319494, + "grad_norm": 1.4912945610802475, + "learning_rate": 4.085035018889058e-06, + "loss": 0.4081, + "step": 1062 + }, + { + "epoch": 0.572236054101339, + "grad_norm": 1.7313554326427134, + "learning_rate": 4.076461888883696e-06, + "loss": 0.4516, + "step": 1063 + }, + { + "epoch": 0.5727743758831841, + "grad_norm": 1.438398770463997, + "learning_rate": 4.067891569012347e-06, + "loss": 0.4591, + "step": 1064 + }, + { + "epoch": 0.5733126976650292, + "grad_norm": 1.2911877198700585, + "learning_rate": 4.059324085352709e-06, + "loss": 
0.3877, + "step": 1065 + }, + { + "epoch": 0.5738510194468743, + "grad_norm": 1.4799665950387828, + "learning_rate": 4.050759463973849e-06, + "loss": 0.4027, + "step": 1066 + }, + { + "epoch": 0.5743893412287194, + "grad_norm": 1.31856553741587, + "learning_rate": 4.042197730936124e-06, + "loss": 0.4385, + "step": 1067 + }, + { + "epoch": 0.5749276630105645, + "grad_norm": 1.4681673368671948, + "learning_rate": 4.033638912291104e-06, + "loss": 0.4699, + "step": 1068 + }, + { + "epoch": 0.5754659847924096, + "grad_norm": 1.8186933987892613, + "learning_rate": 4.025083034081492e-06, + "loss": 0.474, + "step": 1069 + }, + { + "epoch": 0.5760043065742547, + "grad_norm": 1.7243406009536202, + "learning_rate": 4.016530122341039e-06, + "loss": 0.4664, + "step": 1070 + }, + { + "epoch": 0.5765426283560998, + "grad_norm": 1.7574219154990909, + "learning_rate": 4.007980203094476e-06, + "loss": 0.412, + "step": 1071 + }, + { + "epoch": 0.5770809501379449, + "grad_norm": 3.3723520725361325, + "learning_rate": 3.999433302357427e-06, + "loss": 0.3745, + "step": 1072 + }, + { + "epoch": 0.57761927191979, + "grad_norm": 1.470644839329035, + "learning_rate": 3.990889446136326e-06, + "loss": 0.4192, + "step": 1073 + }, + { + "epoch": 0.5781575937016351, + "grad_norm": 1.8064402874305607, + "learning_rate": 3.982348660428349e-06, + "loss": 0.4633, + "step": 1074 + }, + { + "epoch": 0.5786959154834802, + "grad_norm": 1.5560108586108519, + "learning_rate": 3.9738109712213255e-06, + "loss": 0.4554, + "step": 1075 + }, + { + "epoch": 0.5792342372653253, + "grad_norm": 1.390022072661602, + "learning_rate": 3.965276404493667e-06, + "loss": 0.4468, + "step": 1076 + }, + { + "epoch": 0.5797725590471704, + "grad_norm": 1.5485174930428875, + "learning_rate": 3.956744986214281e-06, + "loss": 0.4406, + "step": 1077 + }, + { + "epoch": 0.5803108808290155, + "grad_norm": 1.377328803064819, + "learning_rate": 3.948216742342492e-06, + "loss": 0.3914, + "step": 1078 + }, + { + "epoch": 
0.5808492026108606, + "grad_norm": 1.7377815121930535, + "learning_rate": 3.939691698827975e-06, + "loss": 0.4409, + "step": 1079 + }, + { + "epoch": 0.5813875243927057, + "grad_norm": 1.584949416405362, + "learning_rate": 3.931169881610655e-06, + "loss": 0.4909, + "step": 1080 + }, + { + "epoch": 0.5813875243927057, + "eval_loss": 0.43915173411369324, + "eval_runtime": 1551.2876, + "eval_samples_per_second": 16.122, + "eval_steps_per_second": 0.504, + "step": 1080 + }, + { + "epoch": 0.5819258461745508, + "grad_norm": 1.4259479318176305, + "learning_rate": 3.922651316620648e-06, + "loss": 0.419, + "step": 1081 + }, + { + "epoch": 0.5824641679563959, + "grad_norm": 1.883836889268125, + "learning_rate": 3.914136029778173e-06, + "loss": 0.4847, + "step": 1082 + }, + { + "epoch": 0.583002489738241, + "grad_norm": 1.5440830790183266, + "learning_rate": 3.905624046993474e-06, + "loss": 0.4484, + "step": 1083 + }, + { + "epoch": 0.5835408115200861, + "grad_norm": 1.711059696428319, + "learning_rate": 3.897115394166738e-06, + "loss": 0.4682, + "step": 1084 + }, + { + "epoch": 0.5840791333019312, + "grad_norm": 1.8908190002251042, + "learning_rate": 3.8886100971880235e-06, + "loss": 0.4325, + "step": 1085 + }, + { + "epoch": 0.5846174550837764, + "grad_norm": 1.5374015806352503, + "learning_rate": 3.880108181937178e-06, + "loss": 0.4434, + "step": 1086 + }, + { + "epoch": 0.5851557768656215, + "grad_norm": 1.864521131460447, + "learning_rate": 3.871609674283757e-06, + "loss": 0.4649, + "step": 1087 + }, + { + "epoch": 0.5856940986474666, + "grad_norm": 1.9214802187823141, + "learning_rate": 3.863114600086948e-06, + "loss": 0.452, + "step": 1088 + }, + { + "epoch": 0.5862324204293117, + "grad_norm": 1.3598584887277212, + "learning_rate": 3.854622985195492e-06, + "loss": 0.466, + "step": 1089 + }, + { + "epoch": 0.5867707422111568, + "grad_norm": 1.6127091744766286, + "learning_rate": 3.846134855447602e-06, + "loss": 0.4627, + "step": 1090 + }, + { + "epoch": 
0.5873090639930019, + "grad_norm": 1.4648349504902127, + "learning_rate": 3.837650236670892e-06, + "loss": 0.3967, + "step": 1091 + }, + { + "epoch": 0.587847385774847, + "grad_norm": 1.8146408700451369, + "learning_rate": 3.829169154682283e-06, + "loss": 0.4271, + "step": 1092 + }, + { + "epoch": 0.5883857075566921, + "grad_norm": 1.7751846942753446, + "learning_rate": 3.8206916352879446e-06, + "loss": 0.4464, + "step": 1093 + }, + { + "epoch": 0.5889240293385372, + "grad_norm": 1.6612024138612147, + "learning_rate": 3.8122177042832e-06, + "loss": 0.4107, + "step": 1094 + }, + { + "epoch": 0.5894623511203823, + "grad_norm": 2.812616379162355, + "learning_rate": 3.8037473874524542e-06, + "loss": 0.4584, + "step": 1095 + }, + { + "epoch": 0.5900006729022274, + "grad_norm": 1.3709537212409602, + "learning_rate": 3.7952807105691185e-06, + "loss": 0.4356, + "step": 1096 + }, + { + "epoch": 0.5905389946840724, + "grad_norm": 1.2984038273503478, + "learning_rate": 3.7868176993955253e-06, + "loss": 0.426, + "step": 1097 + }, + { + "epoch": 0.5910773164659175, + "grad_norm": 1.6589883894837865, + "learning_rate": 3.7783583796828543e-06, + "loss": 0.4449, + "step": 1098 + }, + { + "epoch": 0.5916156382477626, + "grad_norm": 1.66006556219293, + "learning_rate": 3.769902777171051e-06, + "loss": 0.493, + "step": 1099 + }, + { + "epoch": 0.5921539600296077, + "grad_norm": 1.5937225644555308, + "learning_rate": 3.761450917588753e-06, + "loss": 0.4723, + "step": 1100 + }, + { + "epoch": 0.5926922818114528, + "grad_norm": 1.3456146090228862, + "learning_rate": 3.7530028266532074e-06, + "loss": 0.4137, + "step": 1101 + }, + { + "epoch": 0.5932306035932979, + "grad_norm": 1.679198037724048, + "learning_rate": 3.744558530070196e-06, + "loss": 0.4261, + "step": 1102 + }, + { + "epoch": 0.593768925375143, + "grad_norm": 1.581894355411804, + "learning_rate": 3.7361180535339504e-06, + "loss": 0.4612, + "step": 1103 + }, + { + "epoch": 0.5943072471569881, + "grad_norm": 
1.4999393803804146, + "learning_rate": 3.7276814227270842e-06, + "loss": 0.4242, + "step": 1104 + }, + { + "epoch": 0.5948455689388332, + "grad_norm": 1.6700110113661726, + "learning_rate": 3.719248663320506e-06, + "loss": 0.4536, + "step": 1105 + }, + { + "epoch": 0.5953838907206783, + "grad_norm": 1.4628534581538355, + "learning_rate": 3.7108198009733454e-06, + "loss": 0.3885, + "step": 1106 + }, + { + "epoch": 0.5959222125025234, + "grad_norm": 1.5174908060004981, + "learning_rate": 3.7023948613328736e-06, + "loss": 0.4688, + "step": 1107 + }, + { + "epoch": 0.5964605342843685, + "grad_norm": 1.6277090494975097, + "learning_rate": 3.6939738700344264e-06, + "loss": 0.4404, + "step": 1108 + }, + { + "epoch": 0.5969988560662136, + "grad_norm": 2.5097831655290954, + "learning_rate": 3.6855568527013273e-06, + "loss": 0.4608, + "step": 1109 + }, + { + "epoch": 0.5975371778480587, + "grad_norm": 1.4992012722834578, + "learning_rate": 3.677143834944803e-06, + "loss": 0.4446, + "step": 1110 + }, + { + "epoch": 0.5980754996299038, + "grad_norm": 1.4139401580995998, + "learning_rate": 3.6687348423639147e-06, + "loss": 0.4098, + "step": 1111 + }, + { + "epoch": 0.5986138214117489, + "grad_norm": 2.0752058550686585, + "learning_rate": 3.6603299005454744e-06, + "loss": 0.4234, + "step": 1112 + }, + { + "epoch": 0.599152143193594, + "grad_norm": 1.6967487088214965, + "learning_rate": 3.6519290350639697e-06, + "loss": 0.4348, + "step": 1113 + }, + { + "epoch": 0.5996904649754391, + "grad_norm": 1.7094622508466781, + "learning_rate": 3.6435322714814813e-06, + "loss": 0.4584, + "step": 1114 + }, + { + "epoch": 0.6002287867572842, + "grad_norm": 1.5333043053128887, + "learning_rate": 3.635139635347612e-06, + "loss": 0.4211, + "step": 1115 + }, + { + "epoch": 0.6007671085391293, + "grad_norm": 1.447440380533825, + "learning_rate": 3.626751152199406e-06, + "loss": 0.4392, + "step": 1116 + }, + { + "epoch": 0.6013054303209744, + "grad_norm": 1.558545230893266, + "learning_rate": 
3.6183668475612665e-06, + "loss": 0.4553, + "step": 1117 + }, + { + "epoch": 0.6018437521028195, + "grad_norm": 1.7341397982742823, + "learning_rate": 3.6099867469448874e-06, + "loss": 0.4521, + "step": 1118 + }, + { + "epoch": 0.6023820738846646, + "grad_norm": 3.5577384559068075, + "learning_rate": 3.601610875849168e-06, + "loss": 0.4999, + "step": 1119 + }, + { + "epoch": 0.6029203956665097, + "grad_norm": 1.3499033786926813, + "learning_rate": 3.5932392597601396e-06, + "loss": 0.4273, + "step": 1120 + }, + { + "epoch": 0.6034587174483548, + "grad_norm": 1.49775810523526, + "learning_rate": 3.584871924150883e-06, + "loss": 0.4275, + "step": 1121 + }, + { + "epoch": 0.6039970392301999, + "grad_norm": 1.4867216376875734, + "learning_rate": 3.576508894481458e-06, + "loss": 0.443, + "step": 1122 + }, + { + "epoch": 0.604535361012045, + "grad_norm": 1.8077118144262816, + "learning_rate": 3.5681501961988212e-06, + "loss": 0.408, + "step": 1123 + }, + { + "epoch": 0.6050736827938901, + "grad_norm": 2.0530433441295535, + "learning_rate": 3.5597958547367507e-06, + "loss": 0.3988, + "step": 1124 + }, + { + "epoch": 0.6056120045757352, + "grad_norm": 1.4118492293118154, + "learning_rate": 3.551445895515765e-06, + "loss": 0.477, + "step": 1125 + }, + { + "epoch": 0.6061503263575803, + "grad_norm": 1.7018214299556869, + "learning_rate": 3.5431003439430493e-06, + "loss": 0.4441, + "step": 1126 + }, + { + "epoch": 0.6066886481394254, + "grad_norm": 1.434018580532193, + "learning_rate": 3.5347592254123795e-06, + "loss": 0.4539, + "step": 1127 + }, + { + "epoch": 0.6072269699212705, + "grad_norm": 1.4867130289511963, + "learning_rate": 3.526422565304042e-06, + "loss": 0.4158, + "step": 1128 + }, + { + "epoch": 0.6077652917031156, + "grad_norm": 1.4715457603229556, + "learning_rate": 3.518090388984753e-06, + "loss": 0.425, + "step": 1129 + }, + { + "epoch": 0.6083036134849606, + "grad_norm": 1.4891631829297116, + "learning_rate": 3.5097627218075905e-06, + "loss": 0.4551, + 
"step": 1130 + }, + { + "epoch": 0.6088419352668057, + "grad_norm": 1.38559309859237, + "learning_rate": 3.5014395891119112e-06, + "loss": 0.3903, + "step": 1131 + }, + { + "epoch": 0.6093802570486508, + "grad_norm": 1.5211311736282844, + "learning_rate": 3.4931210162232716e-06, + "loss": 0.474, + "step": 1132 + }, + { + "epoch": 0.6099185788304959, + "grad_norm": 3.910273590345733, + "learning_rate": 3.484807028453356e-06, + "loss": 0.4386, + "step": 1133 + }, + { + "epoch": 0.610456900612341, + "grad_norm": 1.21915593287012, + "learning_rate": 3.476497651099897e-06, + "loss": 0.4214, + "step": 1134 + }, + { + "epoch": 0.6109952223941861, + "grad_norm": 7.218438211629208, + "learning_rate": 3.4681929094465987e-06, + "loss": 0.4368, + "step": 1135 + }, + { + "epoch": 0.6115335441760312, + "grad_norm": 1.5885679173464573, + "learning_rate": 3.4598928287630585e-06, + "loss": 0.4304, + "step": 1136 + }, + { + "epoch": 0.6120718659578763, + "grad_norm": 1.6276966755475062, + "learning_rate": 3.451597434304692e-06, + "loss": 0.4303, + "step": 1137 + }, + { + "epoch": 0.6126101877397214, + "grad_norm": 2.4974771072637227, + "learning_rate": 3.443306751312656e-06, + "loss": 0.4812, + "step": 1138 + }, + { + "epoch": 0.6131485095215665, + "grad_norm": 1.8523418655749138, + "learning_rate": 3.435020805013773e-06, + "loss": 0.4464, + "step": 1139 + }, + { + "epoch": 0.6136868313034116, + "grad_norm": 1.6153961476534389, + "learning_rate": 3.4267396206204477e-06, + "loss": 0.4258, + "step": 1140 + }, + { + "epoch": 0.6136868313034116, + "eval_loss": 0.4358210265636444, + "eval_runtime": 1559.0889, + "eval_samples_per_second": 16.041, + "eval_steps_per_second": 0.502, + "step": 1140 + }, + { + "epoch": 0.6142251530852567, + "grad_norm": 1.5200314946583775, + "learning_rate": 3.4184632233306004e-06, + "loss": 0.4328, + "step": 1141 + }, + { + "epoch": 0.6147634748671018, + "grad_norm": 1.753239287330404, + "learning_rate": 3.4101916383275836e-06, + "loss": 0.4164, + "step": 
1142 + }, + { + "epoch": 0.6153017966489469, + "grad_norm": 1.3784614615536817, + "learning_rate": 3.4019248907801058e-06, + "loss": 0.407, + "step": 1143 + }, + { + "epoch": 0.615840118430792, + "grad_norm": 1.4916546024442217, + "learning_rate": 3.3936630058421567e-06, + "loss": 0.4449, + "step": 1144 + }, + { + "epoch": 0.6163784402126371, + "grad_norm": 1.411016335795447, + "learning_rate": 3.385406008652931e-06, + "loss": 0.4137, + "step": 1145 + }, + { + "epoch": 0.6169167619944822, + "grad_norm": 1.969929829038151, + "learning_rate": 3.3771539243367517e-06, + "loss": 0.4569, + "step": 1146 + }, + { + "epoch": 0.6174550837763273, + "grad_norm": 1.4268646662770854, + "learning_rate": 3.3689067780029895e-06, + "loss": 0.4399, + "step": 1147 + }, + { + "epoch": 0.6179934055581724, + "grad_norm": 1.4858645297475759, + "learning_rate": 3.3606645947459933e-06, + "loss": 0.4318, + "step": 1148 + }, + { + "epoch": 0.6185317273400175, + "grad_norm": 2.07970165108201, + "learning_rate": 3.3524273996450087e-06, + "loss": 0.4804, + "step": 1149 + }, + { + "epoch": 0.6190700491218626, + "grad_norm": 1.5524399522642343, + "learning_rate": 3.3441952177641046e-06, + "loss": 0.448, + "step": 1150 + }, + { + "epoch": 0.6196083709037077, + "grad_norm": 1.5025047668730835, + "learning_rate": 3.335968074152094e-06, + "loss": 0.4229, + "step": 1151 + }, + { + "epoch": 0.6201466926855528, + "grad_norm": 1.51932290948172, + "learning_rate": 3.32774599384246e-06, + "loss": 0.4238, + "step": 1152 + }, + { + "epoch": 0.6206850144673979, + "grad_norm": 1.4003637291864899, + "learning_rate": 3.319529001853282e-06, + "loss": 0.4618, + "step": 1153 + }, + { + "epoch": 0.621223336249243, + "grad_norm": 1.3792399628540106, + "learning_rate": 3.311317123187151e-06, + "loss": 0.4052, + "step": 1154 + }, + { + "epoch": 0.6217616580310881, + "grad_norm": 1.4341824487711958, + "learning_rate": 3.3031103828311044e-06, + "loss": 0.4452, + "step": 1155 + }, + { + "epoch": 0.6222999798129332, + 
"grad_norm": 1.8890388921678993, + "learning_rate": 3.294908805756543e-06, + "loss": 0.4311, + "step": 1156 + }, + { + "epoch": 0.6228383015947783, + "grad_norm": 1.6873174271659632, + "learning_rate": 3.286712416919156e-06, + "loss": 0.465, + "step": 1157 + }, + { + "epoch": 0.6233766233766234, + "grad_norm": 2.113957712483436, + "learning_rate": 3.2785212412588464e-06, + "loss": 0.4103, + "step": 1158 + }, + { + "epoch": 0.6239149451584685, + "grad_norm": 1.6169473829408894, + "learning_rate": 3.2703353036996553e-06, + "loss": 0.4042, + "step": 1159 + }, + { + "epoch": 0.6244532669403136, + "grad_norm": 1.6678579140480474, + "learning_rate": 3.262154629149684e-06, + "loss": 0.4849, + "step": 1160 + }, + { + "epoch": 0.6249915887221587, + "grad_norm": 1.5133551741537392, + "learning_rate": 3.253979242501023e-06, + "loss": 0.4479, + "step": 1161 + }, + { + "epoch": 0.6255299105040037, + "grad_norm": 1.5463516633606489, + "learning_rate": 3.2458091686296666e-06, + "loss": 0.4589, + "step": 1162 + }, + { + "epoch": 0.6260682322858488, + "grad_norm": 1.3908513399535982, + "learning_rate": 3.2376444323954487e-06, + "loss": 0.407, + "step": 1163 + }, + { + "epoch": 0.6266065540676939, + "grad_norm": 1.4911824388993882, + "learning_rate": 3.2294850586419603e-06, + "loss": 0.4016, + "step": 1164 + }, + { + "epoch": 0.627144875849539, + "grad_norm": 1.4342504928355473, + "learning_rate": 3.2213310721964753e-06, + "loss": 0.4269, + "step": 1165 + }, + { + "epoch": 0.6276831976313841, + "grad_norm": 1.5982636474188436, + "learning_rate": 3.2131824978698744e-06, + "loss": 0.4532, + "step": 1166 + }, + { + "epoch": 0.6282215194132292, + "grad_norm": 1.3672342575621805, + "learning_rate": 3.2050393604565722e-06, + "loss": 0.3972, + "step": 1167 + }, + { + "epoch": 0.6287598411950743, + "grad_norm": 1.6874817093257244, + "learning_rate": 3.196901684734439e-06, + "loss": 0.457, + "step": 1168 + }, + { + "epoch": 0.6292981629769194, + "grad_norm": 1.5723777384143767, + 
"learning_rate": 3.188769495464725e-06, + "loss": 0.3892, + "step": 1169 + }, + { + "epoch": 0.6298364847587645, + "grad_norm": 1.601524939347794, + "learning_rate": 3.180642817391988e-06, + "loss": 0.4433, + "step": 1170 + }, + { + "epoch": 0.6303748065406096, + "grad_norm": 2.25805654454037, + "learning_rate": 3.172521675244016e-06, + "loss": 0.4322, + "step": 1171 + }, + { + "epoch": 0.6309131283224547, + "grad_norm": 1.5555079250741115, + "learning_rate": 3.1644060937317523e-06, + "loss": 0.391, + "step": 1172 + }, + { + "epoch": 0.6314514501042998, + "grad_norm": 1.4992699551350894, + "learning_rate": 3.1562960975492194e-06, + "loss": 0.4044, + "step": 1173 + }, + { + "epoch": 0.6319897718861449, + "grad_norm": 1.5799132322735037, + "learning_rate": 3.1481917113734474e-06, + "loss": 0.3812, + "step": 1174 + }, + { + "epoch": 0.63252809366799, + "grad_norm": 1.7698333563655604, + "learning_rate": 3.140092959864392e-06, + "loss": 0.4353, + "step": 1175 + }, + { + "epoch": 0.6330664154498351, + "grad_norm": 1.568455528145148, + "learning_rate": 3.1319998676648695e-06, + "loss": 0.4307, + "step": 1176 + }, + { + "epoch": 0.6336047372316802, + "grad_norm": 1.6539679705814518, + "learning_rate": 3.12391245940047e-06, + "loss": 0.4269, + "step": 1177 + }, + { + "epoch": 0.6341430590135253, + "grad_norm": 1.7204853297231233, + "learning_rate": 3.115830759679492e-06, + "loss": 0.4857, + "step": 1178 + }, + { + "epoch": 0.6346813807953704, + "grad_norm": 1.6626863719528417, + "learning_rate": 3.1077547930928652e-06, + "loss": 0.4681, + "step": 1179 + }, + { + "epoch": 0.6352197025772155, + "grad_norm": 1.6842711637823262, + "learning_rate": 3.0996845842140716e-06, + "loss": 0.4312, + "step": 1180 + }, + { + "epoch": 0.6357580243590606, + "grad_norm": 1.7431784823037149, + "learning_rate": 3.091620157599075e-06, + "loss": 0.4206, + "step": 1181 + }, + { + "epoch": 0.6362963461409057, + "grad_norm": 1.7565059915579697, + "learning_rate": 3.0835615377862453e-06, + "loss": 
0.4787, + "step": 1182 + }, + { + "epoch": 0.6368346679227508, + "grad_norm": 1.5940508036600212, + "learning_rate": 3.0755087492962844e-06, + "loss": 0.3977, + "step": 1183 + }, + { + "epoch": 0.6373729897045959, + "grad_norm": 1.4265440236436624, + "learning_rate": 3.0674618166321477e-06, + "loss": 0.4455, + "step": 1184 + }, + { + "epoch": 0.637911311486441, + "grad_norm": 1.5203806820148102, + "learning_rate": 3.059420764278975e-06, + "loss": 0.4421, + "step": 1185 + }, + { + "epoch": 0.6384496332682861, + "grad_norm": 1.7485388075672719, + "learning_rate": 3.0513856167040123e-06, + "loss": 0.4337, + "step": 1186 + }, + { + "epoch": 0.6389879550501312, + "grad_norm": 1.5758916072812403, + "learning_rate": 3.0433563983565415e-06, + "loss": 0.483, + "step": 1187 + }, + { + "epoch": 0.6395262768319763, + "grad_norm": 1.7757740619316615, + "learning_rate": 3.0353331336677984e-06, + "loss": 0.402, + "step": 1188 + }, + { + "epoch": 0.6400645986138214, + "grad_norm": 1.5639356203741708, + "learning_rate": 3.027315847050906e-06, + "loss": 0.4588, + "step": 1189 + }, + { + "epoch": 0.6406029203956665, + "grad_norm": 1.900913903628273, + "learning_rate": 3.0193045629007982e-06, + "loss": 0.4318, + "step": 1190 + }, + { + "epoch": 0.6411412421775116, + "grad_norm": 1.7813979669008324, + "learning_rate": 3.011299305594141e-06, + "loss": 0.4444, + "step": 1191 + }, + { + "epoch": 0.6416795639593567, + "grad_norm": 1.4267787696799576, + "learning_rate": 3.0033000994892646e-06, + "loss": 0.4394, + "step": 1192 + }, + { + "epoch": 0.6422178857412018, + "grad_norm": 1.425734282167891, + "learning_rate": 2.995306968926087e-06, + "loss": 0.4729, + "step": 1193 + }, + { + "epoch": 0.6427562075230469, + "grad_norm": 1.6415657973276232, + "learning_rate": 2.98731993822604e-06, + "loss": 0.4644, + "step": 1194 + }, + { + "epoch": 0.643294529304892, + "grad_norm": 1.8314597950910743, + "learning_rate": 2.97933903169199e-06, + "loss": 0.5308, + "step": 1195 + }, + { + "epoch": 
0.643832851086737, + "grad_norm": 1.5314208582263587, + "learning_rate": 2.9713642736081755e-06, + "loss": 0.4539, + "step": 1196 + }, + { + "epoch": 0.6443711728685821, + "grad_norm": 1.7043966331574372, + "learning_rate": 2.9633956882401215e-06, + "loss": 0.4478, + "step": 1197 + }, + { + "epoch": 0.6449094946504272, + "grad_norm": 1.3896380014466228, + "learning_rate": 2.955433299834576e-06, + "loss": 0.4274, + "step": 1198 + }, + { + "epoch": 0.6454478164322723, + "grad_norm": 1.328466975562685, + "learning_rate": 2.947477132619423e-06, + "loss": 0.4151, + "step": 1199 + }, + { + "epoch": 0.6459861382141174, + "grad_norm": 1.4947495053829816, + "learning_rate": 2.939527210803624e-06, + "loss": 0.4225, + "step": 1200 + }, + { + "epoch": 0.6459861382141174, + "eval_loss": 0.43335118889808655, + "eval_runtime": 1568.1591, + "eval_samples_per_second": 15.949, + "eval_steps_per_second": 0.499, + "step": 1200 + }, + { + "epoch": 0.6465244599959626, + "grad_norm": 1.7770419353679783, + "learning_rate": 2.9315835585771334e-06, + "loss": 0.4443, + "step": 1201 + }, + { + "epoch": 0.6470627817778077, + "grad_norm": 1.509257884926516, + "learning_rate": 2.923646200110832e-06, + "loss": 0.403, + "step": 1202 + }, + { + "epoch": 0.6476011035596528, + "grad_norm": 1.413359799607147, + "learning_rate": 2.915715159556444e-06, + "loss": 0.3995, + "step": 1203 + }, + { + "epoch": 0.6481394253414979, + "grad_norm": 1.4051405846579907, + "learning_rate": 2.9077904610464745e-06, + "loss": 0.3597, + "step": 1204 + }, + { + "epoch": 0.648677747123343, + "grad_norm": 1.5857210618229394, + "learning_rate": 2.89987212869413e-06, + "loss": 0.448, + "step": 1205 + }, + { + "epoch": 0.6492160689051881, + "grad_norm": 1.3723187404527468, + "learning_rate": 2.8919601865932456e-06, + "loss": 0.4522, + "step": 1206 + }, + { + "epoch": 0.6497543906870332, + "grad_norm": 1.3511061410304184, + "learning_rate": 2.884054658818214e-06, + "loss": 0.3792, + "step": 1207 + }, + { + "epoch": 
0.6502927124688783, + "grad_norm": 1.387760091675675, + "learning_rate": 2.8761555694239046e-06, + "loss": 0.4515, + "step": 1208 + }, + { + "epoch": 0.6508310342507234, + "grad_norm": 1.4247593593472396, + "learning_rate": 2.868262942445603e-06, + "loss": 0.4489, + "step": 1209 + }, + { + "epoch": 0.6513693560325685, + "grad_norm": 1.600671347691334, + "learning_rate": 2.8603768018989275e-06, + "loss": 0.3944, + "step": 1210 + }, + { + "epoch": 0.6519076778144136, + "grad_norm": 1.4284428882228806, + "learning_rate": 2.852497171779761e-06, + "loss": 0.432, + "step": 1211 + }, + { + "epoch": 0.6524459995962587, + "grad_norm": 1.8170320001458748, + "learning_rate": 2.8446240760641762e-06, + "loss": 0.483, + "step": 1212 + }, + { + "epoch": 0.6529843213781038, + "grad_norm": 1.872300633931277, + "learning_rate": 2.836757538708362e-06, + "loss": 0.4226, + "step": 1213 + }, + { + "epoch": 0.6535226431599489, + "grad_norm": 1.5545253276420463, + "learning_rate": 2.8288975836485523e-06, + "loss": 0.4452, + "step": 1214 + }, + { + "epoch": 0.654060964941794, + "grad_norm": 1.4689119979210103, + "learning_rate": 2.8210442348009543e-06, + "loss": 0.4206, + "step": 1215 + }, + { + "epoch": 0.6545992867236391, + "grad_norm": 1.495722266239985, + "learning_rate": 2.8131975160616686e-06, + "loss": 0.4555, + "step": 1216 + }, + { + "epoch": 0.6551376085054842, + "grad_norm": 1.4286754464458904, + "learning_rate": 2.805357451306626e-06, + "loss": 0.4531, + "step": 1217 + }, + { + "epoch": 0.6556759302873293, + "grad_norm": 1.6604089854519999, + "learning_rate": 2.797524064391511e-06, + "loss": 0.4351, + "step": 1218 + }, + { + "epoch": 0.6562142520691744, + "grad_norm": 1.677727217993553, + "learning_rate": 2.7896973791516867e-06, + "loss": 0.4797, + "step": 1219 + }, + { + "epoch": 0.6567525738510195, + "grad_norm": 1.8188528752490087, + "learning_rate": 2.781877419402126e-06, + "loss": 0.3942, + "step": 1220 + }, + { + "epoch": 0.6572908956328646, + "grad_norm": 
1.518304729497582, + "learning_rate": 2.7740642089373356e-06, + "loss": 0.4567, + "step": 1221 + }, + { + "epoch": 0.6578292174147097, + "grad_norm": 1.9076520179847476, + "learning_rate": 2.76625777153129e-06, + "loss": 0.4761, + "step": 1222 + }, + { + "epoch": 0.6583675391965548, + "grad_norm": 1.6501027454283104, + "learning_rate": 2.758458130937346e-06, + "loss": 0.4568, + "step": 1223 + }, + { + "epoch": 0.6589058609783999, + "grad_norm": 1.4971909664683323, + "learning_rate": 2.7506653108881885e-06, + "loss": 0.4534, + "step": 1224 + }, + { + "epoch": 0.659444182760245, + "grad_norm": 1.8216935826384455, + "learning_rate": 2.742879335095743e-06, + "loss": 0.4872, + "step": 1225 + }, + { + "epoch": 0.6599825045420901, + "grad_norm": 1.441369836777809, + "learning_rate": 2.735100227251113e-06, + "loss": 0.3857, + "step": 1226 + }, + { + "epoch": 0.6605208263239352, + "grad_norm": 1.3907320663098741, + "learning_rate": 2.7273280110245e-06, + "loss": 0.4055, + "step": 1227 + }, + { + "epoch": 0.6610591481057803, + "grad_norm": 1.3629302314750185, + "learning_rate": 2.719562710065142e-06, + "loss": 0.4059, + "step": 1228 + }, + { + "epoch": 0.6615974698876254, + "grad_norm": 1.5181251515722511, + "learning_rate": 2.711804348001231e-06, + "loss": 0.4927, + "step": 1229 + }, + { + "epoch": 0.6621357916694705, + "grad_norm": 1.583461554714453, + "learning_rate": 2.704052948439842e-06, + "loss": 0.4139, + "step": 1230 + }, + { + "epoch": 0.6626741134513155, + "grad_norm": 1.597683792644596, + "learning_rate": 2.6963085349668718e-06, + "loss": 0.4299, + "step": 1231 + }, + { + "epoch": 0.6632124352331606, + "grad_norm": 1.4538764746820028, + "learning_rate": 2.6885711311469547e-06, + "loss": 0.4238, + "step": 1232 + }, + { + "epoch": 0.6637507570150057, + "grad_norm": 1.5760098860778269, + "learning_rate": 2.6808407605234006e-06, + "loss": 0.4605, + "step": 1233 + }, + { + "epoch": 0.6642890787968508, + "grad_norm": 1.8819638022647283, + "learning_rate": 
2.673117446618114e-06, + "loss": 0.4176, + "step": 1234 + }, + { + "epoch": 0.6648274005786959, + "grad_norm": 1.7467867886896942, + "learning_rate": 2.665401212931532e-06, + "loss": 0.4284, + "step": 1235 + }, + { + "epoch": 0.665365722360541, + "grad_norm": 1.3582161008888671, + "learning_rate": 2.6576920829425434e-06, + "loss": 0.449, + "step": 1236 + }, + { + "epoch": 0.6659040441423861, + "grad_norm": 1.7112669988534182, + "learning_rate": 2.6499900801084283e-06, + "loss": 0.4702, + "step": 1237 + }, + { + "epoch": 0.6664423659242312, + "grad_norm": 2.099925951296545, + "learning_rate": 2.6422952278647705e-06, + "loss": 0.4592, + "step": 1238 + }, + { + "epoch": 0.6669806877060763, + "grad_norm": 1.4352705146813356, + "learning_rate": 2.6346075496254054e-06, + "loss": 0.384, + "step": 1239 + }, + { + "epoch": 0.6675190094879214, + "grad_norm": 1.89895053480487, + "learning_rate": 2.6269270687823337e-06, + "loss": 0.4632, + "step": 1240 + }, + { + "epoch": 0.6680573312697665, + "grad_norm": 1.527126991788229, + "learning_rate": 2.619253808705661e-06, + "loss": 0.4304, + "step": 1241 + }, + { + "epoch": 0.6685956530516116, + "grad_norm": 1.9088122860113825, + "learning_rate": 2.6115877927435152e-06, + "loss": 0.4615, + "step": 1242 + }, + { + "epoch": 0.6691339748334567, + "grad_norm": 1.5152814714510374, + "learning_rate": 2.6039290442219884e-06, + "loss": 0.4019, + "step": 1243 + }, + { + "epoch": 0.6696722966153018, + "grad_norm": 1.490222426325067, + "learning_rate": 2.5962775864450563e-06, + "loss": 0.425, + "step": 1244 + }, + { + "epoch": 0.6702106183971469, + "grad_norm": 1.5269175130136061, + "learning_rate": 2.588633442694508e-06, + "loss": 0.3988, + "step": 1245 + }, + { + "epoch": 0.670748940178992, + "grad_norm": 1.4416954872355545, + "learning_rate": 2.5809966362298805e-06, + "loss": 0.4603, + "step": 1246 + }, + { + "epoch": 0.6712872619608371, + "grad_norm": 2.6364873275752014, + "learning_rate": 2.573367190288385e-06, + "loss": 0.4648, + "step": 
1247 + }, + { + "epoch": 0.6718255837426822, + "grad_norm": 1.788546820645697, + "learning_rate": 2.5657451280848355e-06, + "loss": 0.4635, + "step": 1248 + }, + { + "epoch": 0.6723639055245273, + "grad_norm": 1.3806063124644692, + "learning_rate": 2.5581304728115797e-06, + "loss": 0.4943, + "step": 1249 + }, + { + "epoch": 0.6729022273063724, + "grad_norm": 1.402487270939909, + "learning_rate": 2.550523247638426e-06, + "loss": 0.4006, + "step": 1250 + }, + { + "epoch": 0.6734405490882175, + "grad_norm": 1.910681275697032, + "learning_rate": 2.542923475712574e-06, + "loss": 0.4609, + "step": 1251 + }, + { + "epoch": 0.6739788708700626, + "grad_norm": 1.446121535462886, + "learning_rate": 2.5353311801585507e-06, + "loss": 0.4092, + "step": 1252 + }, + { + "epoch": 0.6745171926519077, + "grad_norm": 1.6008122915794563, + "learning_rate": 2.5277463840781236e-06, + "loss": 0.4648, + "step": 1253 + }, + { + "epoch": 0.6750555144337528, + "grad_norm": 1.8052193116478468, + "learning_rate": 2.520169110550248e-06, + "loss": 0.4325, + "step": 1254 + }, + { + "epoch": 0.6755938362155979, + "grad_norm": 2.0544496666589245, + "learning_rate": 2.5125993826309904e-06, + "loss": 0.4102, + "step": 1255 + }, + { + "epoch": 0.676132157997443, + "grad_norm": 1.5511129757696938, + "learning_rate": 2.5050372233534526e-06, + "loss": 0.4443, + "step": 1256 + }, + { + "epoch": 0.6766704797792881, + "grad_norm": 1.8672906417068529, + "learning_rate": 2.4974826557277115e-06, + "loss": 0.4516, + "step": 1257 + }, + { + "epoch": 0.6772088015611332, + "grad_norm": 1.4831806217941237, + "learning_rate": 2.489935702740741e-06, + "loss": 0.4347, + "step": 1258 + }, + { + "epoch": 0.6777471233429783, + "grad_norm": 1.5986607931002996, + "learning_rate": 2.4823963873563487e-06, + "loss": 0.427, + "step": 1259 + }, + { + "epoch": 0.6782854451248234, + "grad_norm": 1.481767434298922, + "learning_rate": 2.4748647325150966e-06, + "loss": 0.4135, + "step": 1260 + }, + { + "epoch": 0.6782854451248234, + 
"eval_loss": 0.43108630180358887, + "eval_runtime": 1581.7954, + "eval_samples_per_second": 15.811, + "eval_steps_per_second": 0.494, + "step": 1260 + }, + { + "epoch": 0.6788237669066685, + "grad_norm": 1.491812080960543, + "learning_rate": 2.467340761134242e-06, + "loss": 0.4392, + "step": 1261 + }, + { + "epoch": 0.6793620886885136, + "grad_norm": 1.5403059882131847, + "learning_rate": 2.459824496107662e-06, + "loss": 0.4631, + "step": 1262 + }, + { + "epoch": 0.6799004104703587, + "grad_norm": 1.4488066174399352, + "learning_rate": 2.4523159603057858e-06, + "loss": 0.4401, + "step": 1263 + }, + { + "epoch": 0.6804387322522037, + "grad_norm": 1.6997928715987718, + "learning_rate": 2.444815176575521e-06, + "loss": 0.4671, + "step": 1264 + }, + { + "epoch": 0.6809770540340488, + "grad_norm": 1.6242395825984155, + "learning_rate": 2.4373221677401916e-06, + "loss": 0.4227, + "step": 1265 + }, + { + "epoch": 0.6815153758158939, + "grad_norm": 1.3272959133305353, + "learning_rate": 2.429836956599463e-06, + "loss": 0.3586, + "step": 1266 + }, + { + "epoch": 0.682053697597739, + "grad_norm": 1.723455688742321, + "learning_rate": 2.422359565929268e-06, + "loss": 0.4275, + "step": 1267 + }, + { + "epoch": 0.6825920193795841, + "grad_norm": 1.3911086482449566, + "learning_rate": 2.414890018481752e-06, + "loss": 0.4383, + "step": 1268 + }, + { + "epoch": 0.6831303411614292, + "grad_norm": 1.515918050738459, + "learning_rate": 2.40742833698519e-06, + "loss": 0.4342, + "step": 1269 + }, + { + "epoch": 0.6836686629432743, + "grad_norm": 1.6928322026664087, + "learning_rate": 2.3999745441439243e-06, + "loss": 0.4156, + "step": 1270 + }, + { + "epoch": 0.6842069847251194, + "grad_norm": 1.3632558682947689, + "learning_rate": 2.3925286626382926e-06, + "loss": 0.3914, + "step": 1271 + }, + { + "epoch": 0.6847453065069645, + "grad_norm": 3.139130094162036, + "learning_rate": 2.385090715124562e-06, + "loss": 0.4637, + "step": 1272 + }, + { + "epoch": 0.6852836282888096, + 
"grad_norm": 1.434440598705869, + "learning_rate": 2.3776607242348547e-06, + "loss": 0.437, + "step": 1273 + }, + { + "epoch": 0.6858219500706547, + "grad_norm": 1.5144260531076574, + "learning_rate": 2.3702387125770882e-06, + "loss": 0.4234, + "step": 1274 + }, + { + "epoch": 0.6863602718524998, + "grad_norm": 1.693660818176695, + "learning_rate": 2.362824702734893e-06, + "loss": 0.4164, + "step": 1275 + }, + { + "epoch": 0.6868985936343449, + "grad_norm": 1.3894626651308215, + "learning_rate": 2.355418717267558e-06, + "loss": 0.4221, + "step": 1276 + }, + { + "epoch": 0.68743691541619, + "grad_norm": 1.697033782203384, + "learning_rate": 2.3480207787099534e-06, + "loss": 0.4383, + "step": 1277 + }, + { + "epoch": 0.6879752371980351, + "grad_norm": 1.4858347246883488, + "learning_rate": 2.340630909572465e-06, + "loss": 0.4265, + "step": 1278 + }, + { + "epoch": 0.6885135589798802, + "grad_norm": 1.500359176091357, + "learning_rate": 2.3332491323409234e-06, + "loss": 0.4481, + "step": 1279 + }, + { + "epoch": 0.6890518807617253, + "grad_norm": 1.5297356725220441, + "learning_rate": 2.32587546947654e-06, + "loss": 0.4348, + "step": 1280 + }, + { + "epoch": 0.6895902025435704, + "grad_norm": 2.508398158502729, + "learning_rate": 2.3185099434158352e-06, + "loss": 0.4437, + "step": 1281 + }, + { + "epoch": 0.6901285243254155, + "grad_norm": 1.523641981004582, + "learning_rate": 2.311152576570566e-06, + "loss": 0.4575, + "step": 1282 + }, + { + "epoch": 0.6906668461072606, + "grad_norm": 1.6114434265747755, + "learning_rate": 2.303803391327669e-06, + "loss": 0.4378, + "step": 1283 + }, + { + "epoch": 0.6912051678891057, + "grad_norm": 1.4928444150803868, + "learning_rate": 2.296462410049183e-06, + "loss": 0.4411, + "step": 1284 + }, + { + "epoch": 0.6917434896709508, + "grad_norm": 1.5345549032626111, + "learning_rate": 2.289129655072185e-06, + "loss": 0.4324, + "step": 1285 + }, + { + "epoch": 0.6922818114527959, + "grad_norm": 1.4298368477097725, + "learning_rate": 
2.2818051487087183e-06, + "loss": 0.426, + "step": 1286 + }, + { + "epoch": 0.692820133234641, + "grad_norm": 1.8725369506254443, + "learning_rate": 2.2744889132457314e-06, + "loss": 0.4541, + "step": 1287 + }, + { + "epoch": 0.6933584550164861, + "grad_norm": 1.77702449875276, + "learning_rate": 2.267180970945003e-06, + "loss": 0.432, + "step": 1288 + }, + { + "epoch": 0.6938967767983312, + "grad_norm": 1.4563290123647166, + "learning_rate": 2.259881344043081e-06, + "loss": 0.3832, + "step": 1289 + }, + { + "epoch": 0.6944350985801763, + "grad_norm": 1.3449801230990073, + "learning_rate": 2.252590054751205e-06, + "loss": 0.3962, + "step": 1290 + }, + { + "epoch": 0.6949734203620214, + "grad_norm": 1.8854534900995603, + "learning_rate": 2.2453071252552515e-06, + "loss": 0.4807, + "step": 1291 + }, + { + "epoch": 0.6955117421438665, + "grad_norm": 1.762423954535133, + "learning_rate": 2.238032577715656e-06, + "loss": 0.384, + "step": 1292 + }, + { + "epoch": 0.6960500639257116, + "grad_norm": 1.476803369543656, + "learning_rate": 2.2307664342673506e-06, + "loss": 0.4539, + "step": 1293 + }, + { + "epoch": 0.6965883857075567, + "grad_norm": 1.4854619250041479, + "learning_rate": 2.2235087170196966e-06, + "loss": 0.4396, + "step": 1294 + }, + { + "epoch": 0.6971267074894018, + "grad_norm": 1.41098403179678, + "learning_rate": 2.2162594480564155e-06, + "loss": 0.4005, + "step": 1295 + }, + { + "epoch": 0.6976650292712469, + "grad_norm": 1.2989632950912373, + "learning_rate": 2.2090186494355203e-06, + "loss": 0.4151, + "step": 1296 + }, + { + "epoch": 0.698203351053092, + "grad_norm": 1.6133874577700047, + "learning_rate": 2.2017863431892534e-06, + "loss": 0.4285, + "step": 1297 + }, + { + "epoch": 0.698741672834937, + "grad_norm": 1.333799397613619, + "learning_rate": 2.1945625513240154e-06, + "loss": 0.4041, + "step": 1298 + }, + { + "epoch": 0.6992799946167821, + "grad_norm": 1.4390186504294415, + "learning_rate": 2.1873472958202997e-06, + "loss": 0.4365, + "step": 
1299 + }, + { + "epoch": 0.6998183163986272, + "grad_norm": 1.2866738586576456, + "learning_rate": 2.1801405986326245e-06, + "loss": 0.4665, + "step": 1300 + }, + { + "epoch": 0.7003566381804723, + "grad_norm": 2.2273828713275865, + "learning_rate": 2.1729424816894685e-06, + "loss": 0.4564, + "step": 1301 + }, + { + "epoch": 0.7008949599623174, + "grad_norm": 1.4546138888578992, + "learning_rate": 2.165752966893203e-06, + "loss": 0.4051, + "step": 1302 + }, + { + "epoch": 0.7014332817441625, + "grad_norm": 1.3514329197218915, + "learning_rate": 2.158572076120019e-06, + "loss": 0.4154, + "step": 1303 + }, + { + "epoch": 0.7019716035260076, + "grad_norm": 1.3870510485604055, + "learning_rate": 2.1513998312198734e-06, + "loss": 0.4269, + "step": 1304 + }, + { + "epoch": 0.7025099253078527, + "grad_norm": 1.6439661727082362, + "learning_rate": 2.1442362540164123e-06, + "loss": 0.4472, + "step": 1305 + }, + { + "epoch": 0.7030482470896978, + "grad_norm": 2.036208978375709, + "learning_rate": 2.1370813663069086e-06, + "loss": 0.4952, + "step": 1306 + }, + { + "epoch": 0.7035865688715429, + "grad_norm": 1.4306434260587932, + "learning_rate": 2.1299351898621938e-06, + "loss": 0.3815, + "step": 1307 + }, + { + "epoch": 0.704124890653388, + "grad_norm": 1.5518498802370642, + "learning_rate": 2.122797746426595e-06, + "loss": 0.4656, + "step": 1308 + }, + { + "epoch": 0.7046632124352331, + "grad_norm": 1.353149193018473, + "learning_rate": 2.1156690577178657e-06, + "loss": 0.4414, + "step": 1309 + }, + { + "epoch": 0.7052015342170782, + "grad_norm": 1.3081505827837419, + "learning_rate": 2.108549145427117e-06, + "loss": 0.4355, + "step": 1310 + }, + { + "epoch": 0.7057398559989233, + "grad_norm": 1.5741831120177514, + "learning_rate": 2.1014380312187593e-06, + "loss": 0.4396, + "step": 1311 + }, + { + "epoch": 0.7062781777807684, + "grad_norm": 1.5628460516936316, + "learning_rate": 2.094335736730433e-06, + "loss": 0.3687, + "step": 1312 + }, + { + "epoch": 0.7068164995626135, 
+ "grad_norm": 3.0284027392779986, + "learning_rate": 2.0872422835729384e-06, + "loss": 0.4463, + "step": 1313 + }, + { + "epoch": 0.7073548213444586, + "grad_norm": 1.3447501399327724, + "learning_rate": 2.0801576933301757e-06, + "loss": 0.4371, + "step": 1314 + }, + { + "epoch": 0.7078931431263038, + "grad_norm": 1.8116776445346612, + "learning_rate": 2.073081987559077e-06, + "loss": 0.4109, + "step": 1315 + }, + { + "epoch": 0.7084314649081489, + "grad_norm": 1.571648134209876, + "learning_rate": 2.06601518778954e-06, + "loss": 0.432, + "step": 1316 + }, + { + "epoch": 0.708969786689994, + "grad_norm": 1.596166756734421, + "learning_rate": 2.0589573155243663e-06, + "loss": 0.4291, + "step": 1317 + }, + { + "epoch": 0.7095081084718391, + "grad_norm": 1.4446289087866433, + "learning_rate": 2.051908392239186e-06, + "loss": 0.4094, + "step": 1318 + }, + { + "epoch": 0.7100464302536842, + "grad_norm": 1.377063116073787, + "learning_rate": 2.044868439382406e-06, + "loss": 0.4696, + "step": 1319 + }, + { + "epoch": 0.7105847520355293, + "grad_norm": 1.3694098512093758, + "learning_rate": 2.0378374783751352e-06, + "loss": 0.402, + "step": 1320 + }, + { + "epoch": 0.7105847520355293, + "eval_loss": 0.4282020330429077, + "eval_runtime": 1515.7705, + "eval_samples_per_second": 16.5, + "eval_steps_per_second": 0.516, + "step": 1320 + }, + { + "epoch": 0.7111230738173744, + "grad_norm": 1.929826065439873, + "learning_rate": 2.030815530611123e-06, + "loss": 0.4159, + "step": 1321 + }, + { + "epoch": 0.7116613955992195, + "grad_norm": 1.4082500795847726, + "learning_rate": 2.023802617456694e-06, + "loss": 0.3941, + "step": 1322 + }, + { + "epoch": 0.7121997173810646, + "grad_norm": 1.8816103595399847, + "learning_rate": 2.01679876025068e-06, + "loss": 0.4244, + "step": 1323 + }, + { + "epoch": 0.7127380391629097, + "grad_norm": 1.5683369901785116, + "learning_rate": 2.0098039803043612e-06, + "loss": 0.4332, + "step": 1324 + }, + { + "epoch": 0.7132763609447548, + "grad_norm": 
1.4453103994083734, + "learning_rate": 2.0028182989013923e-06, + "loss": 0.3945, + "step": 1325 + }, + { + "epoch": 0.7138146827265999, + "grad_norm": 1.6267798252157584, + "learning_rate": 1.9958417372977474e-06, + "loss": 0.4528, + "step": 1326 + }, + { + "epoch": 0.714353004508445, + "grad_norm": 1.6214655041789812, + "learning_rate": 1.9888743167216493e-06, + "loss": 0.4074, + "step": 1327 + }, + { + "epoch": 0.7148913262902901, + "grad_norm": 1.8595682807437428, + "learning_rate": 1.9819160583735077e-06, + "loss": 0.4494, + "step": 1328 + }, + { + "epoch": 0.7154296480721352, + "grad_norm": 1.4662467013475076, + "learning_rate": 1.974966983425852e-06, + "loss": 0.4066, + "step": 1329 + }, + { + "epoch": 0.7159679698539803, + "grad_norm": 2.5261174973160716, + "learning_rate": 1.9680271130232693e-06, + "loss": 0.4394, + "step": 1330 + }, + { + "epoch": 0.7165062916358254, + "grad_norm": 1.8084272539130577, + "learning_rate": 1.9610964682823407e-06, + "loss": 0.4601, + "step": 1331 + }, + { + "epoch": 0.7170446134176705, + "grad_norm": 1.820018846201368, + "learning_rate": 1.9541750702915706e-06, + "loss": 0.4446, + "step": 1332 + }, + { + "epoch": 0.7175829351995155, + "grad_norm": 1.3923517314522877, + "learning_rate": 1.9472629401113325e-06, + "loss": 0.3857, + "step": 1333 + }, + { + "epoch": 0.7181212569813606, + "grad_norm": 1.527238991242769, + "learning_rate": 1.9403600987737976e-06, + "loss": 0.4381, + "step": 1334 + }, + { + "epoch": 0.7186595787632057, + "grad_norm": 1.4006251254778943, + "learning_rate": 1.9334665672828736e-06, + "loss": 0.4332, + "step": 1335 + }, + { + "epoch": 0.7191979005450508, + "grad_norm": 2.1367769390904, + "learning_rate": 1.926582366614141e-06, + "loss": 0.4331, + "step": 1336 + }, + { + "epoch": 0.7197362223268959, + "grad_norm": 1.661348731930383, + "learning_rate": 1.9197075177147866e-06, + "loss": 0.4877, + "step": 1337 + }, + { + "epoch": 0.720274544108741, + "grad_norm": 1.4928525414429736, + "learning_rate": 
1.9128420415035442e-06, + "loss": 0.4239, + "step": 1338 + }, + { + "epoch": 0.7208128658905861, + "grad_norm": 1.533499882863047, + "learning_rate": 1.9059859588706287e-06, + "loss": 0.3951, + "step": 1339 + }, + { + "epoch": 0.7213511876724312, + "grad_norm": 1.8392687775713348, + "learning_rate": 1.8991392906776668e-06, + "loss": 0.4395, + "step": 1340 + }, + { + "epoch": 0.7218895094542763, + "grad_norm": 1.573889490157054, + "learning_rate": 1.8923020577576452e-06, + "loss": 0.4162, + "step": 1341 + }, + { + "epoch": 0.7224278312361214, + "grad_norm": 1.5526149616819422, + "learning_rate": 1.885474280914838e-06, + "loss": 0.4579, + "step": 1342 + }, + { + "epoch": 0.7229661530179665, + "grad_norm": 1.5191810245344743, + "learning_rate": 1.8786559809247485e-06, + "loss": 0.4216, + "step": 1343 + }, + { + "epoch": 0.7235044747998116, + "grad_norm": 1.5555786435185341, + "learning_rate": 1.8718471785340414e-06, + "loss": 0.4122, + "step": 1344 + }, + { + "epoch": 0.7240427965816567, + "grad_norm": 1.3557551585285899, + "learning_rate": 1.8650478944604844e-06, + "loss": 0.3932, + "step": 1345 + }, + { + "epoch": 0.7245811183635018, + "grad_norm": 1.4728885839955113, + "learning_rate": 1.8582581493928837e-06, + "loss": 0.4934, + "step": 1346 + }, + { + "epoch": 0.7251194401453469, + "grad_norm": 1.5560703862712066, + "learning_rate": 1.8514779639910152e-06, + "loss": 0.4565, + "step": 1347 + }, + { + "epoch": 0.725657761927192, + "grad_norm": 1.4005810948444959, + "learning_rate": 1.8447073588855707e-06, + "loss": 0.45, + "step": 1348 + }, + { + "epoch": 0.7261960837090371, + "grad_norm": 1.4372886671511238, + "learning_rate": 1.8379463546780923e-06, + "loss": 0.4076, + "step": 1349 + }, + { + "epoch": 0.7267344054908822, + "grad_norm": 1.3561213817272149, + "learning_rate": 1.8311949719409056e-06, + "loss": 0.3991, + "step": 1350 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 1.592180627183088, + "learning_rate": 1.824453231217062e-06, + "loss": 0.4395, + 
"step": 1351 + }, + { + "epoch": 0.7278110490545724, + "grad_norm": 1.674234401633556, + "learning_rate": 1.8177211530202733e-06, + "loss": 0.5076, + "step": 1352 + }, + { + "epoch": 0.7283493708364175, + "grad_norm": 1.3869830990008478, + "learning_rate": 1.8109987578348504e-06, + "loss": 0.3823, + "step": 1353 + }, + { + "epoch": 0.7288876926182626, + "grad_norm": 1.8958736579636137, + "learning_rate": 1.8042860661156425e-06, + "loss": 0.4283, + "step": 1354 + }, + { + "epoch": 0.7294260144001077, + "grad_norm": 2.277391563720137, + "learning_rate": 1.7975830982879688e-06, + "loss": 0.4344, + "step": 1355 + }, + { + "epoch": 0.7299643361819528, + "grad_norm": 1.3788436987213148, + "learning_rate": 1.7908898747475656e-06, + "loss": 0.42, + "step": 1356 + }, + { + "epoch": 0.7305026579637979, + "grad_norm": 1.472584181988221, + "learning_rate": 1.784206415860516e-06, + "loss": 0.4554, + "step": 1357 + }, + { + "epoch": 0.731040979745643, + "grad_norm": 1.441497867695086, + "learning_rate": 1.7775327419631938e-06, + "loss": 0.3914, + "step": 1358 + }, + { + "epoch": 0.7315793015274881, + "grad_norm": 1.413962400530734, + "learning_rate": 1.7708688733621971e-06, + "loss": 0.4271, + "step": 1359 + }, + { + "epoch": 0.7321176233093332, + "grad_norm": 1.467777866704718, + "learning_rate": 1.7642148303342894e-06, + "loss": 0.4613, + "step": 1360 + }, + { + "epoch": 0.7326559450911783, + "grad_norm": 1.4588809601870538, + "learning_rate": 1.7575706331263392e-06, + "loss": 0.3732, + "step": 1361 + }, + { + "epoch": 0.7331942668730234, + "grad_norm": 1.9984141502445067, + "learning_rate": 1.7509363019552506e-06, + "loss": 0.4337, + "step": 1362 + }, + { + "epoch": 0.7337325886548685, + "grad_norm": 1.7211596185425657, + "learning_rate": 1.744311857007912e-06, + "loss": 0.4237, + "step": 1363 + }, + { + "epoch": 0.7342709104367136, + "grad_norm": 1.3275340316554045, + "learning_rate": 1.7376973184411294e-06, + "loss": 0.4026, + "step": 1364 + }, + { + "epoch": 
0.7348092322185587, + "grad_norm": 1.3704150312314805, + "learning_rate": 1.7310927063815647e-06, + "loss": 0.4221, + "step": 1365 + }, + { + "epoch": 0.7353475540004037, + "grad_norm": 1.6240778919766734, + "learning_rate": 1.7244980409256768e-06, + "loss": 0.3956, + "step": 1366 + }, + { + "epoch": 0.7358858757822488, + "grad_norm": 1.5916150137066967, + "learning_rate": 1.7179133421396571e-06, + "loss": 0.449, + "step": 1367 + }, + { + "epoch": 0.7364241975640939, + "grad_norm": 1.3674325981426028, + "learning_rate": 1.7113386300593749e-06, + "loss": 0.469, + "step": 1368 + }, + { + "epoch": 0.736962519345939, + "grad_norm": 1.823579935483228, + "learning_rate": 1.7047739246903044e-06, + "loss": 0.4256, + "step": 1369 + }, + { + "epoch": 0.7375008411277841, + "grad_norm": 1.5992570631473233, + "learning_rate": 1.6982192460074787e-06, + "loss": 0.4364, + "step": 1370 + }, + { + "epoch": 0.7380391629096292, + "grad_norm": 1.83556587779534, + "learning_rate": 1.6916746139554186e-06, + "loss": 0.462, + "step": 1371 + }, + { + "epoch": 0.7385774846914743, + "grad_norm": 1.63962319033326, + "learning_rate": 1.6851400484480757e-06, + "loss": 0.4647, + "step": 1372 + }, + { + "epoch": 0.7391158064733194, + "grad_norm": 1.489565256988372, + "learning_rate": 1.6786155693687712e-06, + "loss": 0.4391, + "step": 1373 + }, + { + "epoch": 0.7396541282551645, + "grad_norm": 1.8781762497357959, + "learning_rate": 1.6721011965701344e-06, + "loss": 0.4429, + "step": 1374 + }, + { + "epoch": 0.7401924500370096, + "grad_norm": 1.394724821422672, + "learning_rate": 1.6655969498740455e-06, + "loss": 0.3781, + "step": 1375 + }, + { + "epoch": 0.7407307718188547, + "grad_norm": 1.7954529740174663, + "learning_rate": 1.6591028490715722e-06, + "loss": 0.4437, + "step": 1376 + }, + { + "epoch": 0.7412690936006998, + "grad_norm": 1.5625366322113399, + "learning_rate": 1.6526189139229072e-06, + "loss": 0.4221, + "step": 1377 + }, + { + "epoch": 0.7418074153825449, + "grad_norm": 
1.49000718617141, + "learning_rate": 1.6461451641573156e-06, + "loss": 0.3824, + "step": 1378 + }, + { + "epoch": 0.74234573716439, + "grad_norm": 1.5501486593751905, + "learning_rate": 1.639681619473069e-06, + "loss": 0.4316, + "step": 1379 + }, + { + "epoch": 0.7428840589462351, + "grad_norm": 1.6012264627466746, + "learning_rate": 1.6332282995373867e-06, + "loss": 0.4414, + "step": 1380 + }, + { + "epoch": 0.7428840589462351, + "eval_loss": 0.4260067939758301, + "eval_runtime": 1520.5135, + "eval_samples_per_second": 16.448, + "eval_steps_per_second": 0.514, + "step": 1380 + }, + { + "epoch": 0.7434223807280802, + "grad_norm": 1.3868379821786618, + "learning_rate": 1.6267852239863763e-06, + "loss": 0.3962, + "step": 1381 + }, + { + "epoch": 0.7439607025099253, + "grad_norm": 1.563201406467786, + "learning_rate": 1.6203524124249742e-06, + "loss": 0.4359, + "step": 1382 + }, + { + "epoch": 0.7444990242917704, + "grad_norm": 2.0744885451879895, + "learning_rate": 1.613929884426887e-06, + "loss": 0.472, + "step": 1383 + }, + { + "epoch": 0.7450373460736155, + "grad_norm": 1.7165383734256863, + "learning_rate": 1.607517659534526e-06, + "loss": 0.4449, + "step": 1384 + }, + { + "epoch": 0.7455756678554606, + "grad_norm": 1.420966932605389, + "learning_rate": 1.6011157572589565e-06, + "loss": 0.4594, + "step": 1385 + }, + { + "epoch": 0.7461139896373057, + "grad_norm": 1.3843843466818937, + "learning_rate": 1.5947241970798332e-06, + "loss": 0.4021, + "step": 1386 + }, + { + "epoch": 0.7466523114191508, + "grad_norm": 2.021869994898455, + "learning_rate": 1.588342998445342e-06, + "loss": 0.4973, + "step": 1387 + }, + { + "epoch": 0.7471906332009959, + "grad_norm": 1.6308202289723368, + "learning_rate": 1.58197218077214e-06, + "loss": 0.4448, + "step": 1388 + }, + { + "epoch": 0.747728954982841, + "grad_norm": 1.5609319044422376, + "learning_rate": 1.5756117634452977e-06, + "loss": 0.4512, + "step": 1389 + }, + { + "epoch": 0.7482672767646861, + "grad_norm": 
1.3798571945954525, + "learning_rate": 1.5692617658182402e-06, + "loss": 0.4332, + "step": 1390 + }, + { + "epoch": 0.7488055985465312, + "grad_norm": 1.5464889993436788, + "learning_rate": 1.5629222072126888e-06, + "loss": 0.4716, + "step": 1391 + }, + { + "epoch": 0.7493439203283763, + "grad_norm": 1.7517747662085987, + "learning_rate": 1.5565931069185946e-06, + "loss": 0.4305, + "step": 1392 + }, + { + "epoch": 0.7498822421102214, + "grad_norm": 1.5029346054542445, + "learning_rate": 1.5502744841940936e-06, + "loss": 0.4657, + "step": 1393 + }, + { + "epoch": 0.7504205638920665, + "grad_norm": 1.3544718143048395, + "learning_rate": 1.543966358265438e-06, + "loss": 0.418, + "step": 1394 + }, + { + "epoch": 0.7509588856739116, + "grad_norm": 1.52275975192662, + "learning_rate": 1.5376687483269404e-06, + "loss": 0.3732, + "step": 1395 + }, + { + "epoch": 0.7514972074557567, + "grad_norm": 1.691512607761959, + "learning_rate": 1.5313816735409148e-06, + "loss": 0.4606, + "step": 1396 + }, + { + "epoch": 0.7520355292376018, + "grad_norm": 1.6421517222533963, + "learning_rate": 1.5251051530376199e-06, + "loss": 0.413, + "step": 1397 + }, + { + "epoch": 0.7525738510194468, + "grad_norm": 1.7994036447279773, + "learning_rate": 1.518839205915202e-06, + "loss": 0.4167, + "step": 1398 + }, + { + "epoch": 0.753112172801292, + "grad_norm": 1.4116743542426848, + "learning_rate": 1.5125838512396278e-06, + "loss": 0.4502, + "step": 1399 + }, + { + "epoch": 0.753650494583137, + "grad_norm": 2.9318193198163414, + "learning_rate": 1.5063391080446404e-06, + "loss": 0.4523, + "step": 1400 + }, + { + "epoch": 0.7541888163649821, + "grad_norm": 1.3582596783082035, + "learning_rate": 1.500104995331692e-06, + "loss": 0.3758, + "step": 1401 + }, + { + "epoch": 0.7547271381468272, + "grad_norm": 2.1921211591651435, + "learning_rate": 1.493881532069889e-06, + "loss": 0.4725, + "step": 1402 + }, + { + "epoch": 0.7552654599286723, + "grad_norm": 1.5078767590789557, + "learning_rate": 
1.487668737195932e-06, + "loss": 0.4137, + "step": 1403 + }, + { + "epoch": 0.7558037817105174, + "grad_norm": 1.7747344554372293, + "learning_rate": 1.4814666296140617e-06, + "loss": 0.4519, + "step": 1404 + }, + { + "epoch": 0.7563421034923625, + "grad_norm": 1.4869616706516326, + "learning_rate": 1.4752752281960003e-06, + "loss": 0.3805, + "step": 1405 + }, + { + "epoch": 0.7568804252742076, + "grad_norm": 1.688795973706041, + "learning_rate": 1.4690945517808897e-06, + "loss": 0.4993, + "step": 1406 + }, + { + "epoch": 0.7574187470560527, + "grad_norm": 1.583736337415557, + "learning_rate": 1.4629246191752406e-06, + "loss": 0.4382, + "step": 1407 + }, + { + "epoch": 0.7579570688378978, + "grad_norm": 1.405921968173557, + "learning_rate": 1.4567654491528732e-06, + "loss": 0.3952, + "step": 1408 + }, + { + "epoch": 0.7584953906197429, + "grad_norm": 1.3449184128012615, + "learning_rate": 1.4506170604548575e-06, + "loss": 0.4443, + "step": 1409 + }, + { + "epoch": 0.759033712401588, + "grad_norm": 1.5849926738123288, + "learning_rate": 1.4444794717894596e-06, + "loss": 0.4131, + "step": 1410 + }, + { + "epoch": 0.7595720341834331, + "grad_norm": 1.6555281403636608, + "learning_rate": 1.4383527018320825e-06, + "loss": 0.4414, + "step": 1411 + }, + { + "epoch": 0.7601103559652782, + "grad_norm": 1.6263621942357136, + "learning_rate": 1.432236769225211e-06, + "loss": 0.4346, + "step": 1412 + }, + { + "epoch": 0.7606486777471233, + "grad_norm": 2.0460094225135044, + "learning_rate": 1.426131692578354e-06, + "loss": 0.4493, + "step": 1413 + }, + { + "epoch": 0.7611869995289684, + "grad_norm": 1.472378438798274, + "learning_rate": 1.4200374904679853e-06, + "loss": 0.4562, + "step": 1414 + }, + { + "epoch": 0.7617253213108135, + "grad_norm": 1.7242311556580157, + "learning_rate": 1.413954181437493e-06, + "loss": 0.4043, + "step": 1415 + }, + { + "epoch": 0.7622636430926586, + "grad_norm": 1.6120964716761355, + "learning_rate": 1.4078817839971193e-06, + "loss": 0.4815, + 
"step": 1416 + }, + { + "epoch": 0.7628019648745037, + "grad_norm": 2.00633033152504, + "learning_rate": 1.4018203166239032e-06, + "loss": 0.5084, + "step": 1417 + }, + { + "epoch": 0.7633402866563488, + "grad_norm": 1.593451139015103, + "learning_rate": 1.3957697977616275e-06, + "loss": 0.4089, + "step": 1418 + }, + { + "epoch": 0.7638786084381939, + "grad_norm": 1.520947317999593, + "learning_rate": 1.38973024582076e-06, + "loss": 0.4204, + "step": 1419 + }, + { + "epoch": 0.764416930220039, + "grad_norm": 1.5671907812915762, + "learning_rate": 1.3837016791784002e-06, + "loss": 0.4011, + "step": 1420 + }, + { + "epoch": 0.7649552520018841, + "grad_norm": 2.3136360187940435, + "learning_rate": 1.3776841161782174e-06, + "loss": 0.5217, + "step": 1421 + }, + { + "epoch": 0.7654935737837292, + "grad_norm": 1.6259616459954453, + "learning_rate": 1.3716775751304024e-06, + "loss": 0.4094, + "step": 1422 + }, + { + "epoch": 0.7660318955655743, + "grad_norm": 1.2851781752532265, + "learning_rate": 1.365682074311609e-06, + "loss": 0.4371, + "step": 1423 + }, + { + "epoch": 0.7665702173474194, + "grad_norm": 1.6356127807123704, + "learning_rate": 1.3596976319648957e-06, + "loss": 0.4305, + "step": 1424 + }, + { + "epoch": 0.7671085391292645, + "grad_norm": 1.7847217896835836, + "learning_rate": 1.3537242662996741e-06, + "loss": 0.4228, + "step": 1425 + }, + { + "epoch": 0.7676468609111096, + "grad_norm": 1.9347446509271482, + "learning_rate": 1.347761995491651e-06, + "loss": 0.3528, + "step": 1426 + }, + { + "epoch": 0.7681851826929547, + "grad_norm": 1.7975930657160712, + "learning_rate": 1.3418108376827738e-06, + "loss": 0.4782, + "step": 1427 + }, + { + "epoch": 0.7687235044747998, + "grad_norm": 1.4744627345322843, + "learning_rate": 1.3358708109811775e-06, + "loss": 0.3919, + "step": 1428 + }, + { + "epoch": 0.769261826256645, + "grad_norm": 2.7855979759464926, + "learning_rate": 1.3299419334611213e-06, + "loss": 0.4646, + "step": 1429 + }, + { + "epoch": 
0.7698001480384901, + "grad_norm": 1.4805916259048137, + "learning_rate": 1.324024223162947e-06, + "loss": 0.3906, + "step": 1430 + }, + { + "epoch": 0.7703384698203352, + "grad_norm": 1.7443733531704324, + "learning_rate": 1.3181176980930133e-06, + "loss": 0.4046, + "step": 1431 + }, + { + "epoch": 0.7708767916021803, + "grad_norm": 1.3403811088010225, + "learning_rate": 1.3122223762236446e-06, + "loss": 0.4585, + "step": 1432 + }, + { + "epoch": 0.7714151133840254, + "grad_norm": 1.8083215069181602, + "learning_rate": 1.306338275493077e-06, + "loss": 0.4488, + "step": 1433 + }, + { + "epoch": 0.7719534351658704, + "grad_norm": 2.257570529751952, + "learning_rate": 1.3004654138054035e-06, + "loss": 0.4411, + "step": 1434 + }, + { + "epoch": 0.7724917569477155, + "grad_norm": 1.5282453915471157, + "learning_rate": 1.2946038090305186e-06, + "loss": 0.3982, + "step": 1435 + }, + { + "epoch": 0.7730300787295606, + "grad_norm": 1.3350543760395588, + "learning_rate": 1.2887534790040623e-06, + "loss": 0.3529, + "step": 1436 + }, + { + "epoch": 0.7735684005114057, + "grad_norm": 1.5872897107277366, + "learning_rate": 1.2829144415273703e-06, + "loss": 0.4175, + "step": 1437 + }, + { + "epoch": 0.7741067222932508, + "grad_norm": 1.461133941363055, + "learning_rate": 1.2770867143674176e-06, + "loss": 0.4225, + "step": 1438 + }, + { + "epoch": 0.7746450440750959, + "grad_norm": 1.977273812214763, + "learning_rate": 1.2712703152567634e-06, + "loss": 0.3955, + "step": 1439 + }, + { + "epoch": 0.775183365856941, + "grad_norm": 1.6743349069669249, + "learning_rate": 1.2654652618934977e-06, + "loss": 0.3861, + "step": 1440 + }, + { + "epoch": 0.775183365856941, + "eval_loss": 0.42436715960502625, + "eval_runtime": 1522.7354, + "eval_samples_per_second": 16.424, + "eval_steps_per_second": 0.514, + "step": 1440 + }, + { + "epoch": 0.7757216876387861, + "grad_norm": 1.499262565396223, + "learning_rate": 1.2596715719411877e-06, + "loss": 0.4024, + "step": 1441 + }, + { + "epoch": 
0.7762600094206312, + "grad_norm": 1.6235233768215886, + "learning_rate": 1.253889263028827e-06, + "loss": 0.3789, + "step": 1442 + }, + { + "epoch": 0.7767983312024763, + "grad_norm": 1.4115144384917186, + "learning_rate": 1.2481183527507734e-06, + "loss": 0.4605, + "step": 1443 + }, + { + "epoch": 0.7773366529843214, + "grad_norm": 1.4061010836073027, + "learning_rate": 1.2423588586667058e-06, + "loss": 0.394, + "step": 1444 + }, + { + "epoch": 0.7778749747661665, + "grad_norm": 1.4756730352326592, + "learning_rate": 1.2366107983015636e-06, + "loss": 0.3997, + "step": 1445 + }, + { + "epoch": 0.7784132965480116, + "grad_norm": 1.7767670811956109, + "learning_rate": 1.2308741891454978e-06, + "loss": 0.4388, + "step": 1446 + }, + { + "epoch": 0.7789516183298567, + "grad_norm": 1.9567881229548667, + "learning_rate": 1.2251490486538143e-06, + "loss": 0.4457, + "step": 1447 + }, + { + "epoch": 0.7794899401117018, + "grad_norm": 1.7149877959759003, + "learning_rate": 1.2194353942469217e-06, + "loss": 0.4482, + "step": 1448 + }, + { + "epoch": 0.7800282618935469, + "grad_norm": 1.5521839437257912, + "learning_rate": 1.2137332433102806e-06, + "loss": 0.469, + "step": 1449 + }, + { + "epoch": 0.780566583675392, + "grad_norm": 2.688209146479993, + "learning_rate": 1.2080426131943496e-06, + "loss": 0.3849, + "step": 1450 + }, + { + "epoch": 0.7811049054572371, + "grad_norm": 1.4274278905750635, + "learning_rate": 1.2023635212145262e-06, + "loss": 0.3923, + "step": 1451 + }, + { + "epoch": 0.7816432272390822, + "grad_norm": 1.5796240111966617, + "learning_rate": 1.1966959846511068e-06, + "loss": 0.4567, + "step": 1452 + }, + { + "epoch": 0.7821815490209273, + "grad_norm": 2.368565849047706, + "learning_rate": 1.191040020749223e-06, + "loss": 0.3885, + "step": 1453 + }, + { + "epoch": 0.7827198708027724, + "grad_norm": 1.7831232578884653, + "learning_rate": 1.1853956467187943e-06, + "loss": 0.3873, + "step": 1454 + }, + { + "epoch": 0.7832581925846175, + "grad_norm": 
2.2089394022551363, + "learning_rate": 1.1797628797344752e-06, + "loss": 0.4341, + "step": 1455 + }, + { + "epoch": 0.7837965143664626, + "grad_norm": 1.7921663918566133, + "learning_rate": 1.1741417369356011e-06, + "loss": 0.4138, + "step": 1456 + }, + { + "epoch": 0.7843348361483077, + "grad_norm": 1.503278809860387, + "learning_rate": 1.1685322354261402e-06, + "loss": 0.4608, + "step": 1457 + }, + { + "epoch": 0.7848731579301528, + "grad_norm": 1.567305564830315, + "learning_rate": 1.1629343922746334e-06, + "loss": 0.4444, + "step": 1458 + }, + { + "epoch": 0.7854114797119979, + "grad_norm": 1.4431401966395603, + "learning_rate": 1.1573482245141525e-06, + "loss": 0.4353, + "step": 1459 + }, + { + "epoch": 0.785949801493843, + "grad_norm": 1.7031469874820835, + "learning_rate": 1.1517737491422415e-06, + "loss": 0.4433, + "step": 1460 + }, + { + "epoch": 0.7864881232756881, + "grad_norm": 1.9609977211459744, + "learning_rate": 1.1462109831208679e-06, + "loss": 0.4482, + "step": 1461 + }, + { + "epoch": 0.7870264450575332, + "grad_norm": 2.150596318263902, + "learning_rate": 1.1406599433763694e-06, + "loss": 0.4755, + "step": 1462 + }, + { + "epoch": 0.7875647668393783, + "grad_norm": 1.3265638431410287, + "learning_rate": 1.1351206467994018e-06, + "loss": 0.4102, + "step": 1463 + }, + { + "epoch": 0.7881030886212234, + "grad_norm": 4.188075621147485, + "learning_rate": 1.129593110244892e-06, + "loss": 0.3644, + "step": 1464 + }, + { + "epoch": 0.7886414104030685, + "grad_norm": 1.5439643283706193, + "learning_rate": 1.1240773505319824e-06, + "loss": 0.4707, + "step": 1465 + }, + { + "epoch": 0.7891797321849136, + "grad_norm": 1.695949064351043, + "learning_rate": 1.1185733844439778e-06, + "loss": 0.4506, + "step": 1466 + }, + { + "epoch": 0.7897180539667586, + "grad_norm": 1.4925323276596911, + "learning_rate": 1.113081228728301e-06, + "loss": 0.4062, + "step": 1467 + }, + { + "epoch": 0.7902563757486037, + "grad_norm": 1.810916777909123, + "learning_rate": 
1.1076009000964384e-06, + "loss": 0.4617, + "step": 1468 + }, + { + "epoch": 0.7907946975304488, + "grad_norm": 1.5391006325796759, + "learning_rate": 1.102132415223886e-06, + "loss": 0.4341, + "step": 1469 + }, + { + "epoch": 0.7913330193122939, + "grad_norm": 1.3539603638585116, + "learning_rate": 1.0966757907501058e-06, + "loss": 0.4045, + "step": 1470 + }, + { + "epoch": 0.791871341094139, + "grad_norm": 1.585969494802185, + "learning_rate": 1.0912310432784673e-06, + "loss": 0.4889, + "step": 1471 + }, + { + "epoch": 0.7924096628759841, + "grad_norm": 1.3636312861290756, + "learning_rate": 1.0857981893762048e-06, + "loss": 0.4352, + "step": 1472 + }, + { + "epoch": 0.7929479846578292, + "grad_norm": 1.5823372906311277, + "learning_rate": 1.0803772455743572e-06, + "loss": 0.398, + "step": 1473 + }, + { + "epoch": 0.7934863064396743, + "grad_norm": 1.5278694836184388, + "learning_rate": 1.0749682283677288e-06, + "loss": 0.4228, + "step": 1474 + }, + { + "epoch": 0.7940246282215194, + "grad_norm": 1.1652690918407183, + "learning_rate": 1.0695711542148313e-06, + "loss": 0.3811, + "step": 1475 + }, + { + "epoch": 0.7945629500033645, + "grad_norm": 1.4886602129753284, + "learning_rate": 1.0641860395378367e-06, + "loss": 0.4037, + "step": 1476 + }, + { + "epoch": 0.7951012717852096, + "grad_norm": 1.5390850918633818, + "learning_rate": 1.0588129007225266e-06, + "loss": 0.3754, + "step": 1477 + }, + { + "epoch": 0.7956395935670547, + "grad_norm": 1.676720868561217, + "learning_rate": 1.0534517541182431e-06, + "loss": 0.4599, + "step": 1478 + }, + { + "epoch": 0.7961779153488998, + "grad_norm": 1.676144009500296, + "learning_rate": 1.0481026160378394e-06, + "loss": 0.4203, + "step": 1479 + }, + { + "epoch": 0.7967162371307449, + "grad_norm": 1.3949722623692342, + "learning_rate": 1.042765502757625e-06, + "loss": 0.4149, + "step": 1480 + }, + { + "epoch": 0.79725455891259, + "grad_norm": 1.6398344004557446, + "learning_rate": 1.0374404305173247e-06, + "loss": 0.4215, + 
"step": 1481 + }, + { + "epoch": 0.7977928806944351, + "grad_norm": 1.6715940485370635, + "learning_rate": 1.0321274155200234e-06, + "loss": 0.4393, + "step": 1482 + }, + { + "epoch": 0.7983312024762802, + "grad_norm": 1.395308837290767, + "learning_rate": 1.0268264739321194e-06, + "loss": 0.4398, + "step": 1483 + }, + { + "epoch": 0.7988695242581253, + "grad_norm": 1.6597231226511682, + "learning_rate": 1.0215376218832723e-06, + "loss": 0.4185, + "step": 1484 + }, + { + "epoch": 0.7994078460399704, + "grad_norm": 1.5059702316944186, + "learning_rate": 1.0162608754663572e-06, + "loss": 0.4428, + "step": 1485 + }, + { + "epoch": 0.7999461678218155, + "grad_norm": 1.774717767949121, + "learning_rate": 1.0109962507374139e-06, + "loss": 0.456, + "step": 1486 + }, + { + "epoch": 0.8004844896036606, + "grad_norm": 1.5763966693479707, + "learning_rate": 1.0057437637155997e-06, + "loss": 0.4742, + "step": 1487 + }, + { + "epoch": 0.8010228113855057, + "grad_norm": 1.66961890257069, + "learning_rate": 1.0005034303831352e-06, + "loss": 0.4479, + "step": 1488 + }, + { + "epoch": 0.8015611331673508, + "grad_norm": 1.4312052717987154, + "learning_rate": 9.95275266685264e-07, + "loss": 0.3894, + "step": 1489 + }, + { + "epoch": 0.8020994549491959, + "grad_norm": 1.5395533368166758, + "learning_rate": 9.900592885301986e-07, + "loss": 0.433, + "step": 1490 + }, + { + "epoch": 0.802637776731041, + "grad_norm": 1.7267038818610854, + "learning_rate": 9.848555117890734e-07, + "loss": 0.4399, + "step": 1491 + }, + { + "epoch": 0.8031760985128861, + "grad_norm": 1.588155903799363, + "learning_rate": 9.796639522958972e-07, + "loss": 0.4662, + "step": 1492 + }, + { + "epoch": 0.8037144202947312, + "grad_norm": 1.278378381771794, + "learning_rate": 9.744846258475032e-07, + "loss": 0.4023, + "step": 1493 + }, + { + "epoch": 0.8042527420765763, + "grad_norm": 1.630276962177858, + "learning_rate": 9.693175482035038e-07, + "loss": 0.4352, + "step": 1494 + }, + { + "epoch": 0.8047910638584214, 
+ "grad_norm": 1.7375887913272672, + "learning_rate": 9.641627350862371e-07, + "loss": 0.4451, + "step": 1495 + }, + { + "epoch": 0.8053293856402665, + "grad_norm": 1.5671830810820253, + "learning_rate": 9.590202021807266e-07, + "loss": 0.4944, + "step": 1496 + }, + { + "epoch": 0.8058677074221116, + "grad_norm": 1.5984498803682108, + "learning_rate": 9.538899651346278e-07, + "loss": 0.4171, + "step": 1497 + }, + { + "epoch": 0.8064060292039567, + "grad_norm": 1.4646889528560627, + "learning_rate": 9.487720395581829e-07, + "loss": 0.3802, + "step": 1498 + }, + { + "epoch": 0.8069443509858018, + "grad_norm": 1.3512741257951366, + "learning_rate": 9.436664410241736e-07, + "loss": 0.4309, + "step": 1499 + }, + { + "epoch": 0.8074826727676468, + "grad_norm": 1.5243040161927932, + "learning_rate": 9.385731850678714e-07, + "loss": 0.4321, + "step": 1500 + }, + { + "epoch": 0.8074826727676468, + "eval_loss": 0.42280885577201843, + "eval_runtime": 1525.8015, + "eval_samples_per_second": 16.391, + "eval_steps_per_second": 0.513, + "step": 1500 + }, + { + "epoch": 0.8080209945494919, + "grad_norm": 1.7335916518675676, + "learning_rate": 9.334922871869933e-07, + "loss": 0.4613, + "step": 1501 + }, + { + "epoch": 0.808559316331337, + "grad_norm": 1.4183990627505498, + "learning_rate": 9.284237628416537e-07, + "loss": 0.4245, + "step": 1502 + }, + { + "epoch": 0.8090976381131821, + "grad_norm": 1.6705452727321846, + "learning_rate": 9.233676274543141e-07, + "loss": 0.4186, + "step": 1503 + }, + { + "epoch": 0.8096359598950272, + "grad_norm": 1.6195072788491132, + "learning_rate": 9.183238964097408e-07, + "loss": 0.4606, + "step": 1504 + }, + { + "epoch": 0.8101742816768723, + "grad_norm": 1.5392537994753088, + "learning_rate": 9.132925850549573e-07, + "loss": 0.4261, + "step": 1505 + }, + { + "epoch": 0.8107126034587174, + "grad_norm": 1.5937406024477896, + "learning_rate": 9.082737086991955e-07, + "loss": 0.378, + "step": 1506 + }, + { + "epoch": 0.8112509252405625, + 
"grad_norm": 1.6757621701627432, + "learning_rate": 9.0326728261385e-07, + "loss": 0.4782, + "step": 1507 + }, + { + "epoch": 0.8117892470224076, + "grad_norm": 2.005066048659624, + "learning_rate": 8.982733220324319e-07, + "loss": 0.4419, + "step": 1508 + }, + { + "epoch": 0.8123275688042527, + "grad_norm": 1.5506134684388948, + "learning_rate": 8.932918421505244e-07, + "loss": 0.4669, + "step": 1509 + }, + { + "epoch": 0.8128658905860978, + "grad_norm": 1.8474324824508042, + "learning_rate": 8.883228581257297e-07, + "loss": 0.4416, + "step": 1510 + }, + { + "epoch": 0.8134042123679429, + "grad_norm": 1.5536434524734581, + "learning_rate": 8.83366385077632e-07, + "loss": 0.4377, + "step": 1511 + }, + { + "epoch": 0.813942534149788, + "grad_norm": 1.399796692285853, + "learning_rate": 8.784224380877454e-07, + "loss": 0.4392, + "step": 1512 + }, + { + "epoch": 0.8144808559316331, + "grad_norm": 1.5556950965685121, + "learning_rate": 8.734910321994717e-07, + "loss": 0.406, + "step": 1513 + }, + { + "epoch": 0.8150191777134782, + "grad_norm": 1.5480188724931883, + "learning_rate": 8.685721824180499e-07, + "loss": 0.4433, + "step": 1514 + }, + { + "epoch": 0.8155574994953233, + "grad_norm": 1.4971651714962706, + "learning_rate": 8.636659037105149e-07, + "loss": 0.3966, + "step": 1515 + }, + { + "epoch": 0.8160958212771684, + "grad_norm": 1.6155911416639859, + "learning_rate": 8.587722110056529e-07, + "loss": 0.4212, + "step": 1516 + }, + { + "epoch": 0.8166341430590135, + "grad_norm": 1.976217129048654, + "learning_rate": 8.538911191939475e-07, + "loss": 0.4107, + "step": 1517 + }, + { + "epoch": 0.8171724648408586, + "grad_norm": 1.9846803772964912, + "learning_rate": 8.490226431275456e-07, + "loss": 0.4094, + "step": 1518 + }, + { + "epoch": 0.8177107866227037, + "grad_norm": 3.0586074935315133, + "learning_rate": 8.441667976202045e-07, + "loss": 0.4492, + "step": 1519 + }, + { + "epoch": 0.8182491084045488, + "grad_norm": 1.6149445557914077, + "learning_rate": 
8.393235974472497e-07, + "loss": 0.4361, + "step": 1520 + }, + { + "epoch": 0.8187874301863939, + "grad_norm": 1.4631036764406664, + "learning_rate": 8.344930573455323e-07, + "loss": 0.4343, + "step": 1521 + }, + { + "epoch": 0.819325751968239, + "grad_norm": 1.3342306529935604, + "learning_rate": 8.296751920133794e-07, + "loss": 0.3546, + "step": 1522 + }, + { + "epoch": 0.8198640737500841, + "grad_norm": 2.0226246030817356, + "learning_rate": 8.248700161105483e-07, + "loss": 0.4281, + "step": 1523 + }, + { + "epoch": 0.8204023955319292, + "grad_norm": 1.9696807317895189, + "learning_rate": 8.200775442581893e-07, + "loss": 0.4215, + "step": 1524 + }, + { + "epoch": 0.8209407173137743, + "grad_norm": 1.4820095683603027, + "learning_rate": 8.152977910387955e-07, + "loss": 0.4928, + "step": 1525 + }, + { + "epoch": 0.8214790390956194, + "grad_norm": 1.5809021302001485, + "learning_rate": 8.105307709961602e-07, + "loss": 0.442, + "step": 1526 + }, + { + "epoch": 0.8220173608774645, + "grad_norm": 1.3682019844229378, + "learning_rate": 8.057764986353317e-07, + "loss": 0.448, + "step": 1527 + }, + { + "epoch": 0.8225556826593096, + "grad_norm": 1.6136391165039332, + "learning_rate": 8.010349884225699e-07, + "loss": 0.4458, + "step": 1528 + }, + { + "epoch": 0.8230940044411547, + "grad_norm": 1.2595845723052967, + "learning_rate": 7.963062547853023e-07, + "loss": 0.4014, + "step": 1529 + }, + { + "epoch": 0.8236323262229998, + "grad_norm": 2.650357568288943, + "learning_rate": 7.915903121120816e-07, + "loss": 0.4475, + "step": 1530 + }, + { + "epoch": 0.8241706480048449, + "grad_norm": 1.5993270434912978, + "learning_rate": 7.868871747525353e-07, + "loss": 0.3952, + "step": 1531 + }, + { + "epoch": 0.82470896978669, + "grad_norm": 1.5445035783730348, + "learning_rate": 7.821968570173321e-07, + "loss": 0.4546, + "step": 1532 + }, + { + "epoch": 0.825247291568535, + "grad_norm": 1.7600163478435773, + "learning_rate": 7.775193731781316e-07, + "loss": 0.3925, + "step": 1533 
+ }, + { + "epoch": 0.8257856133503801, + "grad_norm": 1.9376227278838558, + "learning_rate": 7.728547374675421e-07, + "loss": 0.4142, + "step": 1534 + }, + { + "epoch": 0.8263239351322252, + "grad_norm": 1.5661272939035957, + "learning_rate": 7.682029640790783e-07, + "loss": 0.408, + "step": 1535 + }, + { + "epoch": 0.8268622569140703, + "grad_norm": 1.7751314318755442, + "learning_rate": 7.635640671671168e-07, + "loss": 0.4748, + "step": 1536 + }, + { + "epoch": 0.8274005786959154, + "grad_norm": 1.4328800747976576, + "learning_rate": 7.589380608468549e-07, + "loss": 0.445, + "step": 1537 + }, + { + "epoch": 0.8279389004777605, + "grad_norm": 1.770544068666416, + "learning_rate": 7.543249591942647e-07, + "loss": 0.3877, + "step": 1538 + }, + { + "epoch": 0.8284772222596056, + "grad_norm": 1.4644257793154838, + "learning_rate": 7.497247762460535e-07, + "loss": 0.4729, + "step": 1539 + }, + { + "epoch": 0.8290155440414507, + "grad_norm": 2.0251569316621354, + "learning_rate": 7.451375259996196e-07, + "loss": 0.3926, + "step": 1540 + }, + { + "epoch": 0.8295538658232958, + "grad_norm": 1.5659705563939743, + "learning_rate": 7.405632224130094e-07, + "loss": 0.3978, + "step": 1541 + }, + { + "epoch": 0.8300921876051409, + "grad_norm": 1.5791357169071338, + "learning_rate": 7.360018794048757e-07, + "loss": 0.4482, + "step": 1542 + }, + { + "epoch": 0.830630509386986, + "grad_norm": 1.5219436138787439, + "learning_rate": 7.314535108544346e-07, + "loss": 0.3993, + "step": 1543 + }, + { + "epoch": 0.8311688311688312, + "grad_norm": 1.5116221556805869, + "learning_rate": 7.26918130601425e-07, + "loss": 0.4431, + "step": 1544 + }, + { + "epoch": 0.8317071529506763, + "grad_norm": 1.5355423700033741, + "learning_rate": 7.223957524460612e-07, + "loss": 0.3847, + "step": 1545 + }, + { + "epoch": 0.8322454747325214, + "grad_norm": 1.6301347275924607, + "learning_rate": 7.17886390148999e-07, + "loss": 0.4149, + "step": 1546 + }, + { + "epoch": 0.8327837965143665, + "grad_norm": 
1.39164969438826, + "learning_rate": 7.133900574312885e-07, + "loss": 0.444, + "step": 1547 + }, + { + "epoch": 0.8333221182962116, + "grad_norm": 1.6360359120384138, + "learning_rate": 7.089067679743322e-07, + "loss": 0.4387, + "step": 1548 + }, + { + "epoch": 0.8338604400780567, + "grad_norm": 1.1463330927551836, + "learning_rate": 7.044365354198462e-07, + "loss": 0.367, + "step": 1549 + }, + { + "epoch": 0.8343987618599018, + "grad_norm": 1.3951952353250727, + "learning_rate": 6.999793733698168e-07, + "loss": 0.4537, + "step": 1550 + }, + { + "epoch": 0.8349370836417469, + "grad_norm": 1.444313279525601, + "learning_rate": 6.955352953864592e-07, + "loss": 0.4517, + "step": 1551 + }, + { + "epoch": 0.835475405423592, + "grad_norm": 1.4922885632634126, + "learning_rate": 6.91104314992177e-07, + "loss": 0.4182, + "step": 1552 + }, + { + "epoch": 0.8360137272054371, + "grad_norm": 1.361490120387784, + "learning_rate": 6.866864456695189e-07, + "loss": 0.3819, + "step": 1553 + }, + { + "epoch": 0.8365520489872822, + "grad_norm": 1.3785822196112183, + "learning_rate": 6.822817008611409e-07, + "loss": 0.4315, + "step": 1554 + }, + { + "epoch": 0.8370903707691273, + "grad_norm": 1.786812938484116, + "learning_rate": 6.778900939697642e-07, + "loss": 0.4352, + "step": 1555 + }, + { + "epoch": 0.8376286925509724, + "grad_norm": 1.51980814160385, + "learning_rate": 6.735116383581325e-07, + "loss": 0.4681, + "step": 1556 + }, + { + "epoch": 0.8381670143328175, + "grad_norm": 1.6909398106864937, + "learning_rate": 6.691463473489751e-07, + "loss": 0.3764, + "step": 1557 + }, + { + "epoch": 0.8387053361146626, + "grad_norm": 1.3032028525505768, + "learning_rate": 6.647942342249619e-07, + "loss": 0.4571, + "step": 1558 + }, + { + "epoch": 0.8392436578965077, + "grad_norm": 2.673478994173862, + "learning_rate": 6.604553122286672e-07, + "loss": 0.4424, + "step": 1559 + }, + { + "epoch": 0.8397819796783528, + "grad_norm": 1.8774151039134228, + "learning_rate": 6.561295945625246e-07, 
+ "loss": 0.4289, + "step": 1560 + }, + { + "epoch": 0.8397819796783528, + "eval_loss": 0.42163270711898804, + "eval_runtime": 1532.1805, + "eval_samples_per_second": 16.323, + "eval_steps_per_second": 0.51, + "step": 1560 + }, + { + "epoch": 0.8403203014601979, + "grad_norm": 1.3658795551777532, + "learning_rate": 6.51817094388793e-07, + "loss": 0.4041, + "step": 1561 + }, + { + "epoch": 0.840858623242043, + "grad_norm": 2.0775682420189683, + "learning_rate": 6.475178248295111e-07, + "loss": 0.4626, + "step": 1562 + }, + { + "epoch": 0.8413969450238881, + "grad_norm": 2.0811838469436137, + "learning_rate": 6.432317989664599e-07, + "loss": 0.4316, + "step": 1563 + }, + { + "epoch": 0.8419352668057332, + "grad_norm": 1.6387122228577398, + "learning_rate": 6.389590298411236e-07, + "loss": 0.4198, + "step": 1564 + }, + { + "epoch": 0.8424735885875783, + "grad_norm": 1.6679858558099225, + "learning_rate": 6.346995304546482e-07, + "loss": 0.3999, + "step": 1565 + }, + { + "epoch": 0.8430119103694234, + "grad_norm": 1.4149904617289844, + "learning_rate": 6.304533137678026e-07, + "loss": 0.418, + "step": 1566 + }, + { + "epoch": 0.8435502321512685, + "grad_norm": 1.58157239985269, + "learning_rate": 6.262203927009403e-07, + "loss": 0.4279, + "step": 1567 + }, + { + "epoch": 0.8440885539331136, + "grad_norm": 1.7638599414290634, + "learning_rate": 6.220007801339562e-07, + "loss": 0.4042, + "step": 1568 + }, + { + "epoch": 0.8446268757149586, + "grad_norm": 1.5007385916657803, + "learning_rate": 6.17794488906252e-07, + "loss": 0.4402, + "step": 1569 + }, + { + "epoch": 0.8451651974968037, + "grad_norm": 1.345366896432651, + "learning_rate": 6.136015318166966e-07, + "loss": 0.3642, + "step": 1570 + }, + { + "epoch": 0.8457035192786488, + "grad_norm": 1.5235663558748846, + "learning_rate": 6.094219216235841e-07, + "loss": 0.3964, + "step": 1571 + }, + { + "epoch": 0.8462418410604939, + "grad_norm": 1.3657476470037149, + "learning_rate": 6.052556710445972e-07, + "loss": 
0.3748, + "step": 1572 + }, + { + "epoch": 0.846780162842339, + "grad_norm": 1.4394596688138968, + "learning_rate": 6.011027927567681e-07, + "loss": 0.441, + "step": 1573 + }, + { + "epoch": 0.8473184846241841, + "grad_norm": 1.5318361149430813, + "learning_rate": 5.969632993964414e-07, + "loss": 0.4621, + "step": 1574 + }, + { + "epoch": 0.8478568064060292, + "grad_norm": 1.6075753885114712, + "learning_rate": 5.928372035592306e-07, + "loss": 0.4645, + "step": 1575 + }, + { + "epoch": 0.8483951281878743, + "grad_norm": 1.5722006469692726, + "learning_rate": 5.887245177999867e-07, + "loss": 0.4446, + "step": 1576 + }, + { + "epoch": 0.8489334499697194, + "grad_norm": 1.4551383751314828, + "learning_rate": 5.846252546327547e-07, + "loss": 0.43, + "step": 1577 + }, + { + "epoch": 0.8494717717515645, + "grad_norm": 1.4487392657122655, + "learning_rate": 5.805394265307391e-07, + "loss": 0.4032, + "step": 1578 + }, + { + "epoch": 0.8500100935334096, + "grad_norm": 1.6691803468661808, + "learning_rate": 5.764670459262622e-07, + "loss": 0.4328, + "step": 1579 + }, + { + "epoch": 0.8505484153152547, + "grad_norm": 1.6197190610235175, + "learning_rate": 5.724081252107311e-07, + "loss": 0.4045, + "step": 1580 + }, + { + "epoch": 0.8510867370970998, + "grad_norm": 1.6633094952520224, + "learning_rate": 5.683626767345951e-07, + "loss": 0.4271, + "step": 1581 + }, + { + "epoch": 0.8516250588789449, + "grad_norm": 1.3383638616282105, + "learning_rate": 5.6433071280731e-07, + "loss": 0.3742, + "step": 1582 + }, + { + "epoch": 0.85216338066079, + "grad_norm": 1.3573201978569531, + "learning_rate": 5.60312245697302e-07, + "loss": 0.355, + "step": 1583 + }, + { + "epoch": 0.8527017024426351, + "grad_norm": 1.5087600985731158, + "learning_rate": 5.563072876319292e-07, + "loss": 0.4275, + "step": 1584 + }, + { + "epoch": 0.8532400242244802, + "grad_norm": 1.9174671861368988, + "learning_rate": 5.523158507974452e-07, + "loss": 0.4523, + "step": 1585 + }, + { + "epoch": 
0.8537783460063253, + "grad_norm": 1.2701535232392451, + "learning_rate": 5.483379473389599e-07, + "loss": 0.4157, + "step": 1586 + }, + { + "epoch": 0.8543166677881704, + "grad_norm": 1.3648674048032239, + "learning_rate": 5.443735893604041e-07, + "loss": 0.443, + "step": 1587 + }, + { + "epoch": 0.8548549895700155, + "grad_norm": 1.7303772028968518, + "learning_rate": 5.404227889244939e-07, + "loss": 0.3945, + "step": 1588 + }, + { + "epoch": 0.8553933113518606, + "grad_norm": 1.4650825399074572, + "learning_rate": 5.364855580526923e-07, + "loss": 0.4183, + "step": 1589 + }, + { + "epoch": 0.8559316331337057, + "grad_norm": 1.7612420028556155, + "learning_rate": 5.325619087251704e-07, + "loss": 0.4472, + "step": 1590 + }, + { + "epoch": 0.8564699549155508, + "grad_norm": 1.6090688100302808, + "learning_rate": 5.28651852880776e-07, + "loss": 0.4348, + "step": 1591 + }, + { + "epoch": 0.8570082766973959, + "grad_norm": 1.59025634923398, + "learning_rate": 5.247554024169949e-07, + "loss": 0.4132, + "step": 1592 + }, + { + "epoch": 0.857546598479241, + "grad_norm": 1.8249117304980227, + "learning_rate": 5.20872569189913e-07, + "loss": 0.415, + "step": 1593 + }, + { + "epoch": 0.8580849202610861, + "grad_norm": 1.3724204134525155, + "learning_rate": 5.170033650141837e-07, + "loss": 0.4645, + "step": 1594 + }, + { + "epoch": 0.8586232420429312, + "grad_norm": 2.066798117946357, + "learning_rate": 5.131478016629888e-07, + "loss": 0.4225, + "step": 1595 + }, + { + "epoch": 0.8591615638247763, + "grad_norm": 2.780252323052268, + "learning_rate": 5.093058908680043e-07, + "loss": 0.4048, + "step": 1596 + }, + { + "epoch": 0.8596998856066214, + "grad_norm": 1.4726854180656292, + "learning_rate": 5.054776443193626e-07, + "loss": 0.4337, + "step": 1597 + }, + { + "epoch": 0.8602382073884665, + "grad_norm": 1.7991832445280496, + "learning_rate": 5.016630736656213e-07, + "loss": 0.3871, + "step": 1598 + }, + { + "epoch": 0.8607765291703116, + "grad_norm": 1.6803342666413155, + 
"learning_rate": 4.978621905137238e-07, + "loss": 0.4332, + "step": 1599 + }, + { + "epoch": 0.8613148509521567, + "grad_norm": 1.4355251448306459, + "learning_rate": 4.940750064289657e-07, + "loss": 0.3924, + "step": 1600 + }, + { + "epoch": 0.8618531727340017, + "grad_norm": 1.3604897046592517, + "learning_rate": 4.903015329349581e-07, + "loss": 0.4057, + "step": 1601 + }, + { + "epoch": 0.8623914945158468, + "grad_norm": 1.6598958205265515, + "learning_rate": 4.865417815135958e-07, + "loss": 0.3885, + "step": 1602 + }, + { + "epoch": 0.8629298162976919, + "grad_norm": 1.4613049538096838, + "learning_rate": 4.827957636050179e-07, + "loss": 0.3922, + "step": 1603 + }, + { + "epoch": 0.863468138079537, + "grad_norm": 1.5965664706849296, + "learning_rate": 4.790634906075775e-07, + "loss": 0.4828, + "step": 1604 + }, + { + "epoch": 0.8640064598613821, + "grad_norm": 1.8120189192545764, + "learning_rate": 4.753449738778021e-07, + "loss": 0.429, + "step": 1605 + }, + { + "epoch": 0.8645447816432272, + "grad_norm": 1.8371969884713577, + "learning_rate": 4.716402247303631e-07, + "loss": 0.4074, + "step": 1606 + }, + { + "epoch": 0.8650831034250723, + "grad_norm": 1.5256250240541858, + "learning_rate": 4.6794925443804097e-07, + "loss": 0.4015, + "step": 1607 + }, + { + "epoch": 0.8656214252069174, + "grad_norm": 1.6504131905617414, + "learning_rate": 4.642720742316886e-07, + "loss": 0.4619, + "step": 1608 + }, + { + "epoch": 0.8661597469887625, + "grad_norm": 1.7464812669613627, + "learning_rate": 4.6060869530019983e-07, + "loss": 0.4537, + "step": 1609 + }, + { + "epoch": 0.8666980687706076, + "grad_norm": 1.8767060082708276, + "learning_rate": 4.569591287904723e-07, + "loss": 0.4612, + "step": 1610 + }, + { + "epoch": 0.8672363905524527, + "grad_norm": 1.3070105173969313, + "learning_rate": 4.5332338580737824e-07, + "loss": 0.3629, + "step": 1611 + }, + { + "epoch": 0.8677747123342978, + "grad_norm": 4.572221630177869, + "learning_rate": 4.4970147741372315e-07, + 
"loss": 0.4587, + "step": 1612 + }, + { + "epoch": 0.8683130341161429, + "grad_norm": 1.4960042467223587, + "learning_rate": 4.460934146302215e-07, + "loss": 0.4734, + "step": 1613 + }, + { + "epoch": 0.868851355897988, + "grad_norm": 1.9121190508560355, + "learning_rate": 4.424992084354551e-07, + "loss": 0.4016, + "step": 1614 + }, + { + "epoch": 0.8693896776798331, + "grad_norm": 1.706342167134769, + "learning_rate": 4.389188697658453e-07, + "loss": 0.4207, + "step": 1615 + }, + { + "epoch": 0.8699279994616782, + "grad_norm": 1.5621521598790504, + "learning_rate": 4.3535240951561695e-07, + "loss": 0.4101, + "step": 1616 + }, + { + "epoch": 0.8704663212435233, + "grad_norm": 1.4806315484210542, + "learning_rate": 4.3179983853676386e-07, + "loss": 0.4608, + "step": 1617 + }, + { + "epoch": 0.8710046430253684, + "grad_norm": 1.526083402719131, + "learning_rate": 4.2826116763902135e-07, + "loss": 0.4183, + "step": 1618 + }, + { + "epoch": 0.8715429648072135, + "grad_norm": 1.6689772565592038, + "learning_rate": 4.247364075898258e-07, + "loss": 0.4288, + "step": 1619 + }, + { + "epoch": 0.8720812865890586, + "grad_norm": 1.3834588776364911, + "learning_rate": 4.2122556911428744e-07, + "loss": 0.4032, + "step": 1620 + }, + { + "epoch": 0.8720812865890586, + "eval_loss": 0.42079228162765503, + "eval_runtime": 1541.5294, + "eval_samples_per_second": 16.224, + "eval_steps_per_second": 0.507, + "step": 1620 + }, + { + "epoch": 0.8726196083709037, + "grad_norm": 1.5791149363732657, + "learning_rate": 4.177286628951566e-07, + "loss": 0.4388, + "step": 1621 + }, + { + "epoch": 0.8731579301527488, + "grad_norm": 1.7565308716827732, + "learning_rate": 4.142456995727906e-07, + "loss": 0.4403, + "step": 1622 + }, + { + "epoch": 0.8736962519345939, + "grad_norm": 1.8536625820585364, + "learning_rate": 4.107766897451204e-07, + "loss": 0.377, + "step": 1623 + }, + { + "epoch": 0.874234573716439, + "grad_norm": 1.557798623706775, + "learning_rate": 4.073216439676203e-07, + "loss": 
0.4099, + "step": 1624 + }, + { + "epoch": 0.8747728954982841, + "grad_norm": 1.5848805929742247, + "learning_rate": 4.0388057275327466e-07, + "loss": 0.4127, + "step": 1625 + }, + { + "epoch": 0.8753112172801292, + "grad_norm": 1.4737469672067065, + "learning_rate": 4.004534865725462e-07, + "loss": 0.4125, + "step": 1626 + }, + { + "epoch": 0.8758495390619743, + "grad_norm": 1.4866822244945306, + "learning_rate": 3.970403958533436e-07, + "loss": 0.4081, + "step": 1627 + }, + { + "epoch": 0.8763878608438194, + "grad_norm": 1.6255821682103373, + "learning_rate": 3.936413109809906e-07, + "loss": 0.4465, + "step": 1628 + }, + { + "epoch": 0.8769261826256645, + "grad_norm": 1.4642881317646486, + "learning_rate": 3.902562422981937e-07, + "loss": 0.4286, + "step": 1629 + }, + { + "epoch": 0.8774645044075096, + "grad_norm": 1.580573409189922, + "learning_rate": 3.8688520010501276e-07, + "loss": 0.4527, + "step": 1630 + }, + { + "epoch": 0.8780028261893547, + "grad_norm": 2.0543315708956387, + "learning_rate": 3.835281946588254e-07, + "loss": 0.4377, + "step": 1631 + }, + { + "epoch": 0.8785411479711998, + "grad_norm": 1.5115782436115135, + "learning_rate": 3.801852361743008e-07, + "loss": 0.4525, + "step": 1632 + }, + { + "epoch": 0.8790794697530449, + "grad_norm": 1.8374746527735237, + "learning_rate": 3.7685633482336504e-07, + "loss": 0.4242, + "step": 1633 + }, + { + "epoch": 0.87961779153489, + "grad_norm": 1.5036770046647692, + "learning_rate": 3.7354150073516947e-07, + "loss": 0.4474, + "step": 1634 + }, + { + "epoch": 0.880156113316735, + "grad_norm": 1.658882270187231, + "learning_rate": 3.702407439960648e-07, + "loss": 0.4321, + "step": 1635 + }, + { + "epoch": 0.8806944350985801, + "grad_norm": 1.6020319338410256, + "learning_rate": 3.669540746495653e-07, + "loss": 0.4212, + "step": 1636 + }, + { + "epoch": 0.8812327568804252, + "grad_norm": 1.7415071086793177, + "learning_rate": 3.636815026963214e-07, + "loss": 0.4229, + "step": 1637 + }, + { + "epoch": 
0.8817710786622703, + "grad_norm": 1.328144623680027, + "learning_rate": 3.604230380940871e-07, + "loss": 0.4135, + "step": 1638 + }, + { + "epoch": 0.8823094004441154, + "grad_norm": 1.8361744282067538, + "learning_rate": 3.5717869075769187e-07, + "loss": 0.4448, + "step": 1639 + }, + { + "epoch": 0.8828477222259605, + "grad_norm": 1.4454157174291669, + "learning_rate": 3.5394847055900794e-07, + "loss": 0.4339, + "step": 1640 + }, + { + "epoch": 0.8833860440078056, + "grad_norm": 1.6322475345286311, + "learning_rate": 3.5073238732692305e-07, + "loss": 0.4176, + "step": 1641 + }, + { + "epoch": 0.8839243657896507, + "grad_norm": 1.445292085363601, + "learning_rate": 3.475304508473071e-07, + "loss": 0.4554, + "step": 1642 + }, + { + "epoch": 0.8844626875714958, + "grad_norm": 1.4938616353672438, + "learning_rate": 3.44342670862986e-07, + "loss": 0.4088, + "step": 1643 + }, + { + "epoch": 0.8850010093533409, + "grad_norm": 1.47760594711673, + "learning_rate": 3.411690570737097e-07, + "loss": 0.3793, + "step": 1644 + }, + { + "epoch": 0.885539331135186, + "grad_norm": 1.6041036008050786, + "learning_rate": 3.3800961913612427e-07, + "loss": 0.4648, + "step": 1645 + }, + { + "epoch": 0.8860776529170311, + "grad_norm": 1.6055085861001368, + "learning_rate": 3.3486436666374024e-07, + "loss": 0.3958, + "step": 1646 + }, + { + "epoch": 0.8866159746988762, + "grad_norm": 1.592597656491022, + "learning_rate": 3.3173330922690594e-07, + "loss": 0.4534, + "step": 1647 + }, + { + "epoch": 0.8871542964807213, + "grad_norm": 1.3972942678399092, + "learning_rate": 3.2861645635277715e-07, + "loss": 0.4075, + "step": 1648 + }, + { + "epoch": 0.8876926182625664, + "grad_norm": 1.299571800868061, + "learning_rate": 3.255138175252859e-07, + "loss": 0.4322, + "step": 1649 + }, + { + "epoch": 0.8882309400444115, + "grad_norm": 1.6074089216828915, + "learning_rate": 3.22425402185117e-07, + "loss": 0.4442, + "step": 1650 + }, + { + "epoch": 0.8887692618262566, + "grad_norm": 
1.6515277192815747, + "learning_rate": 3.1935121972967387e-07, + "loss": 0.3974, + "step": 1651 + }, + { + "epoch": 0.8893075836081017, + "grad_norm": 1.9560867162587892, + "learning_rate": 3.1629127951305407e-07, + "loss": 0.4419, + "step": 1652 + }, + { + "epoch": 0.8898459053899468, + "grad_norm": 1.4109620050170866, + "learning_rate": 3.132455908460175e-07, + "loss": 0.4006, + "step": 1653 + }, + { + "epoch": 0.8903842271717919, + "grad_norm": 1.3778369174445322, + "learning_rate": 3.1021416299595985e-07, + "loss": 0.3917, + "step": 1654 + }, + { + "epoch": 0.890922548953637, + "grad_norm": 1.7547858079840999, + "learning_rate": 3.0719700518688447e-07, + "loss": 0.4698, + "step": 1655 + }, + { + "epoch": 0.8914608707354821, + "grad_norm": 1.5659476763978994, + "learning_rate": 3.0419412659937477e-07, + "loss": 0.4172, + "step": 1656 + }, + { + "epoch": 0.8919991925173272, + "grad_norm": 3.093400384631848, + "learning_rate": 3.0120553637056293e-07, + "loss": 0.3883, + "step": 1657 + }, + { + "epoch": 0.8925375142991724, + "grad_norm": 1.4466790084982413, + "learning_rate": 2.9823124359410706e-07, + "loss": 0.391, + "step": 1658 + }, + { + "epoch": 0.8930758360810175, + "grad_norm": 1.2602029099448362, + "learning_rate": 2.9527125732015995e-07, + "loss": 0.41, + "step": 1659 + }, + { + "epoch": 0.8936141578628626, + "grad_norm": 1.5682198116188635, + "learning_rate": 2.923255865553432e-07, + "loss": 0.4361, + "step": 1660 + }, + { + "epoch": 0.8941524796447077, + "grad_norm": 1.7284038118874672, + "learning_rate": 2.8939424026271923e-07, + "loss": 0.4248, + "step": 1661 + }, + { + "epoch": 0.8946908014265528, + "grad_norm": 1.4256983828332148, + "learning_rate": 2.8647722736176333e-07, + "loss": 0.4291, + "step": 1662 + }, + { + "epoch": 0.8952291232083979, + "grad_norm": 1.4976102627551229, + "learning_rate": 2.8357455672833933e-07, + "loss": 0.3813, + "step": 1663 + }, + { + "epoch": 0.895767444990243, + "grad_norm": 1.8854495681463317, + "learning_rate": 
2.8068623719466725e-07, + "loss": 0.4516, + "step": 1664 + }, + { + "epoch": 0.8963057667720881, + "grad_norm": 1.5693149002013742, + "learning_rate": 2.7781227754930253e-07, + "loss": 0.4585, + "step": 1665 + }, + { + "epoch": 0.8968440885539332, + "grad_norm": 1.573734503341506, + "learning_rate": 2.7495268653710493e-07, + "loss": 0.4483, + "step": 1666 + }, + { + "epoch": 0.8973824103357783, + "grad_norm": 1.5481263062327042, + "learning_rate": 2.7210747285921435e-07, + "loss": 0.4468, + "step": 1667 + }, + { + "epoch": 0.8979207321176234, + "grad_norm": 1.7822442462595496, + "learning_rate": 2.692766451730233e-07, + "loss": 0.4234, + "step": 1668 + }, + { + "epoch": 0.8984590538994685, + "grad_norm": 1.8797060608535148, + "learning_rate": 2.6646021209215003e-07, + "loss": 0.4063, + "step": 1669 + }, + { + "epoch": 0.8989973756813135, + "grad_norm": 1.4047802142985153, + "learning_rate": 2.636581821864148e-07, + "loss": 0.3933, + "step": 1670 + }, + { + "epoch": 0.8995356974631586, + "grad_norm": 1.9919594742667397, + "learning_rate": 2.6087056398180823e-07, + "loss": 0.4259, + "step": 1671 + }, + { + "epoch": 0.9000740192450037, + "grad_norm": 1.439697905572551, + "learning_rate": 2.580973659604735e-07, + "loss": 0.4234, + "step": 1672 + }, + { + "epoch": 0.9006123410268488, + "grad_norm": 1.4340034850095604, + "learning_rate": 2.553385965606736e-07, + "loss": 0.4011, + "step": 1673 + }, + { + "epoch": 0.9011506628086939, + "grad_norm": 1.6008407880111504, + "learning_rate": 2.525942641767687e-07, + "loss": 0.4064, + "step": 1674 + }, + { + "epoch": 0.901688984590539, + "grad_norm": 1.393769083088064, + "learning_rate": 2.498643771591908e-07, + "loss": 0.3878, + "step": 1675 + }, + { + "epoch": 0.9022273063723841, + "grad_norm": 1.5473000323872435, + "learning_rate": 2.47148943814417e-07, + "loss": 0.4125, + "step": 1676 + }, + { + "epoch": 0.9027656281542292, + "grad_norm": 1.504947787937997, + "learning_rate": 2.4444797240494533e-07, + "loss": 0.4328, + 
"step": 1677 + }, + { + "epoch": 0.9033039499360743, + "grad_norm": 1.8071042005817233, + "learning_rate": 2.4176147114927e-07, + "loss": 0.4429, + "step": 1678 + }, + { + "epoch": 0.9038422717179194, + "grad_norm": 1.5975781936612632, + "learning_rate": 2.3908944822185144e-07, + "loss": 0.4279, + "step": 1679 + }, + { + "epoch": 0.9043805934997645, + "grad_norm": 1.4408734852067904, + "learning_rate": 2.364319117531011e-07, + "loss": 0.404, + "step": 1680 + }, + { + "epoch": 0.9043805934997645, + "eval_loss": 0.42025431990623474, + "eval_runtime": 1550.3923, + "eval_samples_per_second": 16.131, + "eval_steps_per_second": 0.504, + "step": 1680 + }, + { + "epoch": 0.9049189152816096, + "grad_norm": 1.6629310324181896, + "learning_rate": 2.3378886982934778e-07, + "loss": 0.4876, + "step": 1681 + }, + { + "epoch": 0.9054572370634547, + "grad_norm": 1.5275509334845596, + "learning_rate": 2.311603304928173e-07, + "loss": 0.4428, + "step": 1682 + }, + { + "epoch": 0.9059955588452998, + "grad_norm": 1.6372832685609333, + "learning_rate": 2.285463017416073e-07, + "loss": 0.4815, + "step": 1683 + }, + { + "epoch": 0.9065338806271449, + "grad_norm": 1.846596894090347, + "learning_rate": 2.2594679152966258e-07, + "loss": 0.4724, + "step": 1684 + }, + { + "epoch": 0.90707220240899, + "grad_norm": 1.7091710123282846, + "learning_rate": 2.2336180776675154e-07, + "loss": 0.4447, + "step": 1685 + }, + { + "epoch": 0.9076105241908351, + "grad_norm": 1.4759554995733482, + "learning_rate": 2.2079135831843956e-07, + "loss": 0.4421, + "step": 1686 + }, + { + "epoch": 0.9081488459726802, + "grad_norm": 1.4044547819882969, + "learning_rate": 2.1823545100606914e-07, + "loss": 0.4438, + "step": 1687 + }, + { + "epoch": 0.9086871677545253, + "grad_norm": 1.6839786445608516, + "learning_rate": 2.1569409360673422e-07, + "loss": 0.4295, + "step": 1688 + }, + { + "epoch": 0.9092254895363704, + "grad_norm": 1.695687328944884, + "learning_rate": 2.131672938532553e-07, + "loss": 0.4001, + "step": 
1689 + }, + { + "epoch": 0.9097638113182155, + "grad_norm": 1.6064285368620497, + "learning_rate": 2.1065505943415775e-07, + "loss": 0.426, + "step": 1690 + }, + { + "epoch": 0.9103021331000606, + "grad_norm": 1.805677873651136, + "learning_rate": 2.0815739799364743e-07, + "loss": 0.4109, + "step": 1691 + }, + { + "epoch": 0.9108404548819057, + "grad_norm": 1.6393066274059234, + "learning_rate": 2.0567431713158726e-07, + "loss": 0.4377, + "step": 1692 + }, + { + "epoch": 0.9113787766637508, + "grad_norm": 1.6183131956225818, + "learning_rate": 2.032058244034757e-07, + "loss": 0.4412, + "step": 1693 + }, + { + "epoch": 0.9119170984455959, + "grad_norm": 1.5002695967364554, + "learning_rate": 2.007519273204206e-07, + "loss": 0.4437, + "step": 1694 + }, + { + "epoch": 0.912455420227441, + "grad_norm": 1.647362717510626, + "learning_rate": 1.9831263334911977e-07, + "loss": 0.4808, + "step": 1695 + }, + { + "epoch": 0.9129937420092861, + "grad_norm": 1.5964438963275278, + "learning_rate": 1.95887949911836e-07, + "loss": 0.4393, + "step": 1696 + }, + { + "epoch": 0.9135320637911312, + "grad_norm": 1.8713869106599383, + "learning_rate": 1.934778843863766e-07, + "loss": 0.434, + "step": 1697 + }, + { + "epoch": 0.9140703855729763, + "grad_norm": 1.9039547376831083, + "learning_rate": 1.9108244410606823e-07, + "loss": 0.4364, + "step": 1698 + }, + { + "epoch": 0.9146087073548214, + "grad_norm": 1.5450254177283191, + "learning_rate": 1.887016363597366e-07, + "loss": 0.4589, + "step": 1699 + }, + { + "epoch": 0.9151470291366665, + "grad_norm": 1.543879530191546, + "learning_rate": 1.8633546839168403e-07, + "loss": 0.4064, + "step": 1700 + }, + { + "epoch": 0.9156853509185116, + "grad_norm": 1.5304353330893454, + "learning_rate": 1.839839474016658e-07, + "loss": 0.442, + "step": 1701 + }, + { + "epoch": 0.9162236727003567, + "grad_norm": 2.3452574340826233, + "learning_rate": 1.8164708054487002e-07, + "loss": 0.422, + "step": 1702 + }, + { + "epoch": 0.9167619944822017, + 
"grad_norm": 1.9150867244566236, + "learning_rate": 1.7932487493189598e-07, + "loss": 0.4294, + "step": 1703 + }, + { + "epoch": 0.9173003162640468, + "grad_norm": 1.6124806051656038, + "learning_rate": 1.7701733762873152e-07, + "loss": 0.428, + "step": 1704 + }, + { + "epoch": 0.9178386380458919, + "grad_norm": 1.4187608860726189, + "learning_rate": 1.7472447565673177e-07, + "loss": 0.4038, + "step": 1705 + }, + { + "epoch": 0.918376959827737, + "grad_norm": 1.4661931221135862, + "learning_rate": 1.7244629599259767e-07, + "loss": 0.3848, + "step": 1706 + }, + { + "epoch": 0.9189152816095821, + "grad_norm": 1.6206434175751971, + "learning_rate": 1.7018280556835632e-07, + "loss": 0.3851, + "step": 1707 + }, + { + "epoch": 0.9194536033914272, + "grad_norm": 1.8423442465927384, + "learning_rate": 1.6793401127133513e-07, + "loss": 0.4079, + "step": 1708 + }, + { + "epoch": 0.9199919251732723, + "grad_norm": 1.3950233471823357, + "learning_rate": 1.6569991994414835e-07, + "loss": 0.3994, + "step": 1709 + }, + { + "epoch": 0.9205302469551174, + "grad_norm": 1.5142214065755961, + "learning_rate": 1.6348053838466937e-07, + "loss": 0.4189, + "step": 1710 + }, + { + "epoch": 0.9210685687369625, + "grad_norm": 1.5917351975615364, + "learning_rate": 1.6127587334601458e-07, + "loss": 0.4314, + "step": 1711 + }, + { + "epoch": 0.9216068905188076, + "grad_norm": 1.605064219083874, + "learning_rate": 1.5908593153651952e-07, + "loss": 0.4237, + "step": 1712 + }, + { + "epoch": 0.9221452123006527, + "grad_norm": 1.7341654884483175, + "learning_rate": 1.5691071961972116e-07, + "loss": 0.4131, + "step": 1713 + }, + { + "epoch": 0.9226835340824978, + "grad_norm": 1.6343186301580133, + "learning_rate": 1.547502442143356e-07, + "loss": 0.4233, + "step": 1714 + }, + { + "epoch": 0.9232218558643429, + "grad_norm": 1.5099995374537671, + "learning_rate": 1.526045118942404e-07, + "loss": 0.3982, + "step": 1715 + }, + { + "epoch": 0.923760177646188, + "grad_norm": 1.7958348974891065, + 
"learning_rate": 1.504735291884507e-07, + "loss": 0.4331, + "step": 1716 + }, + { + "epoch": 0.9242984994280331, + "grad_norm": 1.7356588334735397, + "learning_rate": 1.4835730258110303e-07, + "loss": 0.4357, + "step": 1717 + }, + { + "epoch": 0.9248368212098782, + "grad_norm": 2.500196744283525, + "learning_rate": 1.4625583851143432e-07, + "loss": 0.3799, + "step": 1718 + }, + { + "epoch": 0.9253751429917233, + "grad_norm": 1.3646453068750661, + "learning_rate": 1.4416914337376132e-07, + "loss": 0.4128, + "step": 1719 + }, + { + "epoch": 0.9259134647735684, + "grad_norm": 1.642640642870041, + "learning_rate": 1.420972235174628e-07, + "loss": 0.4506, + "step": 1720 + }, + { + "epoch": 0.9264517865554135, + "grad_norm": 1.592814733182936, + "learning_rate": 1.4004008524695912e-07, + "loss": 0.4296, + "step": 1721 + }, + { + "epoch": 0.9269901083372586, + "grad_norm": 1.4652552983592342, + "learning_rate": 1.3799773482169378e-07, + "loss": 0.4233, + "step": 1722 + }, + { + "epoch": 0.9275284301191037, + "grad_norm": 1.7410090898687602, + "learning_rate": 1.3597017845611181e-07, + "loss": 0.4594, + "step": 1723 + }, + { + "epoch": 0.9280667519009488, + "grad_norm": 1.559448064084867, + "learning_rate": 1.3395742231964658e-07, + "loss": 0.4336, + "step": 1724 + }, + { + "epoch": 0.9286050736827939, + "grad_norm": 1.9623398348887997, + "learning_rate": 1.3195947253669518e-07, + "loss": 0.4724, + "step": 1725 + }, + { + "epoch": 0.929143395464639, + "grad_norm": 1.4765323135961603, + "learning_rate": 1.2997633518660125e-07, + "loss": 0.4122, + "step": 1726 + }, + { + "epoch": 0.9296817172464841, + "grad_norm": 1.9030353185015407, + "learning_rate": 1.2800801630364013e-07, + "loss": 0.4414, + "step": 1727 + }, + { + "epoch": 0.9302200390283292, + "grad_norm": 1.3486307498615422, + "learning_rate": 1.2605452187699484e-07, + "loss": 0.4799, + "step": 1728 + }, + { + "epoch": 0.9307583608101743, + "grad_norm": 1.4474994381201687, + "learning_rate": 1.2411585785074232e-07, + 
"loss": 0.4353, + "step": 1729 + }, + { + "epoch": 0.9312966825920194, + "grad_norm": 1.460955137197927, + "learning_rate": 1.221920301238333e-07, + "loss": 0.4248, + "step": 1730 + }, + { + "epoch": 0.9318350043738645, + "grad_norm": 1.8140612572363009, + "learning_rate": 1.2028304455007412e-07, + "loss": 0.3888, + "step": 1731 + }, + { + "epoch": 0.9323733261557096, + "grad_norm": 1.4724419135884532, + "learning_rate": 1.1838890693811055e-07, + "loss": 0.3868, + "step": 1732 + }, + { + "epoch": 0.9329116479375547, + "grad_norm": 1.4562877473919869, + "learning_rate": 1.1650962305140845e-07, + "loss": 0.4305, + "step": 1733 + }, + { + "epoch": 0.9334499697193998, + "grad_norm": 2.0045234339432763, + "learning_rate": 1.1464519860823698e-07, + "loss": 0.5062, + "step": 1734 + }, + { + "epoch": 0.9339882915012448, + "grad_norm": 1.8962618785171959, + "learning_rate": 1.1279563928165094e-07, + "loss": 0.4049, + "step": 1735 + }, + { + "epoch": 0.93452661328309, + "grad_norm": 1.580337734175196, + "learning_rate": 1.1096095069947466e-07, + "loss": 0.4465, + "step": 1736 + }, + { + "epoch": 0.935064935064935, + "grad_norm": 1.6703156179249958, + "learning_rate": 1.091411384442831e-07, + "loss": 0.4174, + "step": 1737 + }, + { + "epoch": 0.9356032568467801, + "grad_norm": 1.4707795804039079, + "learning_rate": 1.0733620805338462e-07, + "loss": 0.3582, + "step": 1738 + }, + { + "epoch": 0.9361415786286252, + "grad_norm": 1.5443607495595517, + "learning_rate": 1.0554616501880722e-07, + "loss": 0.4322, + "step": 1739 + }, + { + "epoch": 0.9366799004104703, + "grad_norm": 1.647874029047969, + "learning_rate": 1.0377101478727835e-07, + "loss": 0.4465, + "step": 1740 + }, + { + "epoch": 0.9366799004104703, + "eval_loss": 0.41988879442214966, + "eval_runtime": 1559.0337, + "eval_samples_per_second": 16.042, + "eval_steps_per_second": 0.502, + "step": 1740 + }, + { + "epoch": 0.9372182221923154, + "grad_norm": 1.6210033117188805, + "learning_rate": 1.0201076276021072e-07, + 
"loss": 0.4432, + "step": 1741 + }, + { + "epoch": 0.9377565439741605, + "grad_norm": 1.9123170938822815, + "learning_rate": 1.0026541429368431e-07, + "loss": 0.4024, + "step": 1742 + }, + { + "epoch": 0.9382948657560056, + "grad_norm": 2.5680416907462864, + "learning_rate": 9.853497469843043e-08, + "loss": 0.3973, + "step": 1743 + }, + { + "epoch": 0.9388331875378507, + "grad_norm": 1.462242975230514, + "learning_rate": 9.681944923981724e-08, + "loss": 0.455, + "step": 1744 + }, + { + "epoch": 0.9393715093196958, + "grad_norm": 1.4330622858448745, + "learning_rate": 9.511884313782915e-08, + "loss": 0.409, + "step": 1745 + }, + { + "epoch": 0.9399098311015409, + "grad_norm": 1.5924131568344673, + "learning_rate": 9.343316156705751e-08, + "loss": 0.4709, + "step": 1746 + }, + { + "epoch": 0.940448152883386, + "grad_norm": 2.1748083360521, + "learning_rate": 9.176240965668049e-08, + "loss": 0.4975, + "step": 1747 + }, + { + "epoch": 0.9409864746652311, + "grad_norm": 2.240808535802813, + "learning_rate": 9.01065924904465e-08, + "loss": 0.4817, + "step": 1748 + }, + { + "epoch": 0.9415247964470762, + "grad_norm": 1.7231015704313604, + "learning_rate": 8.846571510666369e-08, + "loss": 0.4094, + "step": 1749 + }, + { + "epoch": 0.9420631182289213, + "grad_norm": 1.4693480082476622, + "learning_rate": 8.683978249817981e-08, + "loss": 0.4453, + "step": 1750 + }, + { + "epoch": 0.9426014400107664, + "grad_norm": 1.6509935540008158, + "learning_rate": 8.52287996123674e-08, + "loss": 0.4065, + "step": 1751 + }, + { + "epoch": 0.9431397617926115, + "grad_norm": 1.6701873629796138, + "learning_rate": 8.363277135111314e-08, + "loss": 0.3761, + "step": 1752 + }, + { + "epoch": 0.9436780835744566, + "grad_norm": 1.2809352240300242, + "learning_rate": 8.205170257079786e-08, + "loss": 0.4159, + "step": 1753 + }, + { + "epoch": 0.9442164053563017, + "grad_norm": 1.62872520153001, + "learning_rate": 8.048559808228496e-08, + "loss": 0.3973, + "step": 1754 + }, + { + "epoch": 
0.9447547271381468, + "grad_norm": 1.6888413344801536, + "learning_rate": 7.89344626509031e-08, + "loss": 0.4219, + "step": 1755 + }, + { + "epoch": 0.9452930489199919, + "grad_norm": 1.6223202323912347, + "learning_rate": 7.739830099643464e-08, + "loss": 0.4303, + "step": 1756 + }, + { + "epoch": 0.945831370701837, + "grad_norm": 1.2810729846885742, + "learning_rate": 7.587711779309947e-08, + "loss": 0.3868, + "step": 1757 + }, + { + "epoch": 0.9463696924836821, + "grad_norm": 1.6840497326805903, + "learning_rate": 7.437091766954119e-08, + "loss": 0.434, + "step": 1758 + }, + { + "epoch": 0.9469080142655272, + "grad_norm": 1.765752446431431, + "learning_rate": 7.287970520881205e-08, + "loss": 0.4461, + "step": 1759 + }, + { + "epoch": 0.9474463360473723, + "grad_norm": 1.4694297184744327, + "learning_rate": 7.140348494836191e-08, + "loss": 0.4374, + "step": 1760 + }, + { + "epoch": 0.9479846578292174, + "grad_norm": 1.456090878683348, + "learning_rate": 6.994226138002047e-08, + "loss": 0.4204, + "step": 1761 + }, + { + "epoch": 0.9485229796110625, + "grad_norm": 1.5114503786906142, + "learning_rate": 6.849603894998725e-08, + "loss": 0.4431, + "step": 1762 + }, + { + "epoch": 0.9490613013929076, + "grad_norm": 1.9303693867033398, + "learning_rate": 6.706482205881548e-08, + "loss": 0.4292, + "step": 1763 + }, + { + "epoch": 0.9495996231747527, + "grad_norm": 1.3436489528854563, + "learning_rate": 6.564861506139996e-08, + "loss": 0.3854, + "step": 1764 + }, + { + "epoch": 0.9501379449565978, + "grad_norm": 1.3843500014884988, + "learning_rate": 6.424742226696312e-08, + "loss": 0.3969, + "step": 1765 + }, + { + "epoch": 0.9506762667384429, + "grad_norm": 1.3401735876692071, + "learning_rate": 6.286124793904336e-08, + "loss": 0.4183, + "step": 1766 + }, + { + "epoch": 0.951214588520288, + "grad_norm": 1.685672633138118, + "learning_rate": 6.149009629547897e-08, + "loss": 0.4468, + "step": 1767 + }, + { + "epoch": 0.951752910302133, + "grad_norm": 1.8943339017606036, + 
"learning_rate": 6.013397150839983e-08, + "loss": 0.4361, + "step": 1768 + }, + { + "epoch": 0.9522912320839781, + "grad_norm": 1.7967244404705551, + "learning_rate": 5.8792877704211274e-08, + "loss": 0.4491, + "step": 1769 + }, + { + "epoch": 0.9528295538658232, + "grad_norm": 1.4606147240071112, + "learning_rate": 5.746681896358131e-08, + "loss": 0.4019, + "step": 1770 + }, + { + "epoch": 0.9533678756476683, + "grad_norm": 1.455938194249448, + "learning_rate": 5.615579932143067e-08, + "loss": 0.3948, + "step": 1771 + }, + { + "epoch": 0.9539061974295135, + "grad_norm": 1.2759206549407909, + "learning_rate": 5.485982276691892e-08, + "loss": 0.3949, + "step": 1772 + }, + { + "epoch": 0.9544445192113586, + "grad_norm": 1.5731889340664074, + "learning_rate": 5.35788932434328e-08, + "loss": 0.4422, + "step": 1773 + }, + { + "epoch": 0.9549828409932037, + "grad_norm": 1.4900834870938766, + "learning_rate": 5.2313014648573966e-08, + "loss": 0.3651, + "step": 1774 + }, + { + "epoch": 0.9555211627750488, + "grad_norm": 1.3653648358156305, + "learning_rate": 5.1062190834146875e-08, + "loss": 0.403, + "step": 1775 + }, + { + "epoch": 0.9560594845568939, + "grad_norm": 1.5012692588758656, + "learning_rate": 4.9826425606148145e-08, + "loss": 0.4056, + "step": 1776 + }, + { + "epoch": 0.956597806338739, + "grad_norm": 1.7114437223613954, + "learning_rate": 4.860572272475384e-08, + "loss": 0.4219, + "step": 1777 + }, + { + "epoch": 0.9571361281205841, + "grad_norm": 1.5710449681536929, + "learning_rate": 4.740008590430778e-08, + "loss": 0.4504, + "step": 1778 + }, + { + "epoch": 0.9576744499024292, + "grad_norm": 1.5334464777855485, + "learning_rate": 4.620951881331215e-08, + "loss": 0.4078, + "step": 1779 + }, + { + "epoch": 0.9582127716842743, + "grad_norm": 1.665311340751073, + "learning_rate": 4.5034025074414124e-08, + "loss": 0.388, + "step": 1780 + }, + { + "epoch": 0.9587510934661194, + "grad_norm": 1.6819133415223784, + "learning_rate": 4.3873608264397014e-08, + "loss": 
0.4318, + "step": 1781 + }, + { + "epoch": 0.9592894152479645, + "grad_norm": 2.1910803064926947, + "learning_rate": 4.272827191416584e-08, + "loss": 0.3862, + "step": 1782 + }, + { + "epoch": 0.9598277370298096, + "grad_norm": 1.3743310605178427, + "learning_rate": 4.159801950874176e-08, + "loss": 0.382, + "step": 1783 + }, + { + "epoch": 0.9603660588116547, + "grad_norm": 1.753291691489888, + "learning_rate": 4.048285448724709e-08, + "loss": 0.4677, + "step": 1784 + }, + { + "epoch": 0.9609043805934998, + "grad_norm": 1.4424214242693971, + "learning_rate": 3.938278024289644e-08, + "loss": 0.4012, + "step": 1785 + }, + { + "epoch": 0.9614427023753449, + "grad_norm": 1.4573151134275804, + "learning_rate": 3.829780012298612e-08, + "loss": 0.4058, + "step": 1786 + }, + { + "epoch": 0.96198102415719, + "grad_norm": 1.4245212432098524, + "learning_rate": 3.722791742888476e-08, + "loss": 0.3958, + "step": 1787 + }, + { + "epoch": 0.9625193459390351, + "grad_norm": 1.533496999870574, + "learning_rate": 3.617313541602274e-08, + "loss": 0.4195, + "step": 1788 + }, + { + "epoch": 0.9630576677208802, + "grad_norm": 1.854726516234056, + "learning_rate": 3.5133457293881626e-08, + "loss": 0.4376, + "step": 1789 + }, + { + "epoch": 0.9635959895027253, + "grad_norm": 1.9373159151394588, + "learning_rate": 3.410888622598585e-08, + "loss": 0.4312, + "step": 1790 + }, + { + "epoch": 0.9641343112845704, + "grad_norm": 2.153201724460075, + "learning_rate": 3.3099425329890525e-08, + "loss": 0.4494, + "step": 1791 + }, + { + "epoch": 0.9646726330664155, + "grad_norm": 1.4498518000265068, + "learning_rate": 3.210507767717586e-08, + "loss": 0.4199, + "step": 1792 + }, + { + "epoch": 0.9652109548482606, + "grad_norm": 1.6032986767797375, + "learning_rate": 3.1125846293433846e-08, + "loss": 0.3771, + "step": 1793 + }, + { + "epoch": 0.9657492766301057, + "grad_norm": 2.1622319654687057, + "learning_rate": 3.0161734158261625e-08, + "loss": 0.4214, + "step": 1794 + }, + { + "epoch": 
0.9662875984119508, + "grad_norm": 1.4345400536711836, + "learning_rate": 2.9212744205252553e-08, + "loss": 0.3797, + "step": 1795 + }, + { + "epoch": 0.9668259201937959, + "grad_norm": 1.6565073229021858, + "learning_rate": 2.8278879321983477e-08, + "loss": 0.3874, + "step": 1796 + }, + { + "epoch": 0.967364241975641, + "grad_norm": 2.0557097314570196, + "learning_rate": 2.736014235001194e-08, + "loss": 0.4341, + "step": 1797 + }, + { + "epoch": 0.9679025637574861, + "grad_norm": 1.64490095462292, + "learning_rate": 2.6456536084862872e-08, + "loss": 0.3979, + "step": 1798 + }, + { + "epoch": 0.9684408855393312, + "grad_norm": 1.6729564375619899, + "learning_rate": 2.5568063276021347e-08, + "loss": 0.397, + "step": 1799 + }, + { + "epoch": 0.9689792073211763, + "grad_norm": 1.5597222162662605, + "learning_rate": 2.4694726626925403e-08, + "loss": 0.432, + "step": 1800 + }, + { + "epoch": 0.9689792073211763, + "eval_loss": 0.4197918474674225, + "eval_runtime": 1571.0705, + "eval_samples_per_second": 15.919, + "eval_steps_per_second": 0.498, + "step": 1800 + }, + { + "epoch": 0.9695175291030214, + "grad_norm": 1.4076281710448164, + "learning_rate": 2.383652879495657e-08, + "loss": 0.3963, + "step": 1801 + }, + { + "epoch": 0.9700558508848665, + "grad_norm": 1.645367632025504, + "learning_rate": 2.299347239143157e-08, + "loss": 0.4272, + "step": 1802 + }, + { + "epoch": 0.9705941726667116, + "grad_norm": 1.3956889574044051, + "learning_rate": 2.2165559981595642e-08, + "loss": 0.429, + "step": 1803 + }, + { + "epoch": 0.9711324944485566, + "grad_norm": 1.4793349281728767, + "learning_rate": 2.1352794084613658e-08, + "loss": 0.4479, + "step": 1804 + }, + { + "epoch": 0.9716708162304017, + "grad_norm": 1.580535608856093, + "learning_rate": 2.0555177173562925e-08, + "loss": 0.431, + "step": 1805 + }, + { + "epoch": 0.9722091380122468, + "grad_norm": 1.7015563233283766, + "learning_rate": 1.9772711675425937e-08, + "loss": 0.3984, + "step": 1806 + }, + { + "epoch": 
0.9727474597940919, + "grad_norm": 1.5158636017258738, + "learning_rate": 1.9005399971080974e-08, + "loss": 0.4166, + "step": 1807 + }, + { + "epoch": 0.973285781575937, + "grad_norm": 1.4220838677616172, + "learning_rate": 1.8253244395298186e-08, + "loss": 0.3988, + "step": 1808 + }, + { + "epoch": 0.9738241033577821, + "grad_norm": 1.3963959999222404, + "learning_rate": 1.7516247236731288e-08, + "loss": 0.4224, + "step": 1809 + }, + { + "epoch": 0.9743624251396272, + "grad_norm": 1.7337278360138024, + "learning_rate": 1.679441073790755e-08, + "loss": 0.4738, + "step": 1810 + }, + { + "epoch": 0.9749007469214723, + "grad_norm": 1.4861221398216466, + "learning_rate": 1.6087737095225598e-08, + "loss": 0.4449, + "step": 1811 + }, + { + "epoch": 0.9754390687033174, + "grad_norm": 1.3145810749185178, + "learning_rate": 1.539622845894595e-08, + "loss": 0.3885, + "step": 1812 + }, + { + "epoch": 0.9759773904851625, + "grad_norm": 1.3176971825763986, + "learning_rate": 1.471988693318549e-08, + "loss": 0.4232, + "step": 1813 + }, + { + "epoch": 0.9765157122670076, + "grad_norm": 1.442309770679218, + "learning_rate": 1.4058714575910238e-08, + "loss": 0.4328, + "step": 1814 + }, + { + "epoch": 0.9770540340488527, + "grad_norm": 1.5157478456952573, + "learning_rate": 1.3412713398930355e-08, + "loss": 0.3911, + "step": 1815 + }, + { + "epoch": 0.9775923558306978, + "grad_norm": 1.779840899462066, + "learning_rate": 1.2781885367892377e-08, + "loss": 0.4179, + "step": 1816 + }, + { + "epoch": 0.9781306776125429, + "grad_norm": 1.6067561255260123, + "learning_rate": 1.2166232402275325e-08, + "loss": 0.3987, + "step": 1817 + }, + { + "epoch": 0.978668999394388, + "grad_norm": 1.4429159861518235, + "learning_rate": 1.156575637538182e-08, + "loss": 0.3752, + "step": 1818 + }, + { + "epoch": 0.9792073211762331, + "grad_norm": 1.6134101059886168, + "learning_rate": 1.0980459114335318e-08, + "loss": 0.4491, + "step": 1819 + }, + { + "epoch": 0.9797456429580782, + "grad_norm": 
1.3430032688894593, + "learning_rate": 1.0410342400073992e-08, + "loss": 0.4446, + "step": 1820 + }, + { + "epoch": 0.9802839647399233, + "grad_norm": 1.5854543749606242, + "learning_rate": 9.855407967344078e-09, + "loss": 0.4022, + "step": 1821 + }, + { + "epoch": 0.9808222865217684, + "grad_norm": 1.3429626400579588, + "learning_rate": 9.31565750469543e-09, + "loss": 0.4173, + "step": 1822 + }, + { + "epoch": 0.9813606083036135, + "grad_norm": 1.8181594324695687, + "learning_rate": 8.791092654476529e-09, + "loss": 0.4699, + "step": 1823 + }, + { + "epoch": 0.9818989300854586, + "grad_norm": 1.3189784151442827, + "learning_rate": 8.281715012827817e-09, + "loss": 0.3847, + "step": 1824 + }, + { + "epoch": 0.9824372518673037, + "grad_norm": 1.29942395236663, + "learning_rate": 7.78752612968059e-09, + "loss": 0.3989, + "step": 1825 + }, + { + "epoch": 0.9829755736491488, + "grad_norm": 1.6481398184837366, + "learning_rate": 7.3085275087475535e-09, + "loss": 0.385, + "step": 1826 + }, + { + "epoch": 0.9835138954309939, + "grad_norm": 1.2097016930732503, + "learning_rate": 6.844720607522282e-09, + "loss": 0.4635, + "step": 1827 + }, + { + "epoch": 0.984052217212839, + "grad_norm": 1.3353672523995217, + "learning_rate": 6.3961068372725425e-09, + "loss": 0.4659, + "step": 1828 + }, + { + "epoch": 0.9845905389946841, + "grad_norm": 1.6604758834668205, + "learning_rate": 5.962687563036418e-09, + "loss": 0.4182, + "step": 1829 + }, + { + "epoch": 0.9851288607765292, + "grad_norm": 1.365766973195823, + "learning_rate": 5.544464103618419e-09, + "loss": 0.4496, + "step": 1830 + }, + { + "epoch": 0.9856671825583743, + "grad_norm": 1.7311791534397065, + "learning_rate": 5.1414377315855965e-09, + "loss": 0.4091, + "step": 1831 + }, + { + "epoch": 0.9862055043402194, + "grad_norm": 1.6223056568910816, + "learning_rate": 4.753609673263104e-09, + "loss": 0.435, + "step": 1832 + }, + { + "epoch": 0.9867438261220645, + "grad_norm": 1.4811187708876057, + "learning_rate": 
4.380981108730309e-09, + "loss": 0.4229, + "step": 1833 + }, + { + "epoch": 0.9872821479039096, + "grad_norm": 1.5639619332709622, + "learning_rate": 4.023553171819128e-09, + "loss": 0.4434, + "step": 1834 + }, + { + "epoch": 0.9878204696857547, + "grad_norm": 1.4607336838401341, + "learning_rate": 3.681326950107922e-09, + "loss": 0.3892, + "step": 1835 + }, + { + "epoch": 0.9883587914675998, + "grad_norm": 1.4459818740856154, + "learning_rate": 3.3543034849192746e-09, + "loss": 0.4613, + "step": 1836 + }, + { + "epoch": 0.9888971132494448, + "grad_norm": 1.727956071768554, + "learning_rate": 3.0424837713188825e-09, + "loss": 0.4321, + "step": 1837 + }, + { + "epoch": 0.98943543503129, + "grad_norm": 1.4250494159267046, + "learning_rate": 2.7458687581072284e-09, + "loss": 0.4361, + "step": 1838 + }, + { + "epoch": 0.989973756813135, + "grad_norm": 1.6825614414547043, + "learning_rate": 2.4644593478240218e-09, + "loss": 0.4247, + "step": 1839 + }, + { + "epoch": 0.9905120785949801, + "grad_norm": 1.3394226647545722, + "learning_rate": 2.1982563967376525e-09, + "loss": 0.4224, + "step": 1840 + }, + { + "epoch": 0.9910504003768252, + "grad_norm": 1.3878090062249357, + "learning_rate": 1.9472607148490752e-09, + "loss": 0.4671, + "step": 1841 + }, + { + "epoch": 0.9915887221586703, + "grad_norm": 1.8045067084462034, + "learning_rate": 1.71147306588626e-09, + "loss": 0.4093, + "step": 1842 + }, + { + "epoch": 0.9921270439405154, + "grad_norm": 1.6487465697670387, + "learning_rate": 1.4908941673008604e-09, + "loss": 0.4768, + "step": 1843 + }, + { + "epoch": 0.9926653657223605, + "grad_norm": 1.3894142004683563, + "learning_rate": 1.2855246902693241e-09, + "loss": 0.4126, + "step": 1844 + }, + { + "epoch": 0.9932036875042056, + "grad_norm": 1.5382669595746958, + "learning_rate": 1.0953652596878972e-09, + "loss": 0.4662, + "step": 1845 + }, + { + "epoch": 0.9937420092860507, + "grad_norm": 1.5055759777025033, + "learning_rate": 9.204164541720683e-10, + "loss": 0.3911, + 
"step": 1846 + }, + { + "epoch": 0.9942803310678958, + "grad_norm": 1.4883627722190473, + "learning_rate": 7.606788060543491e-10, + "loss": 0.4005, + "step": 1847 + }, + { + "epoch": 0.9948186528497409, + "grad_norm": 1.7929841052447726, + "learning_rate": 6.16152801383163e-10, + "loss": 0.4239, + "step": 1848 + }, + { + "epoch": 0.995356974631586, + "grad_norm": 1.3514634100350202, + "learning_rate": 4.86838879921736e-10, + "loss": 0.4122, + "step": 1849 + }, + { + "epoch": 0.9958952964134311, + "grad_norm": 1.5688583282415778, + "learning_rate": 3.7273743514476544e-10, + "loss": 0.3613, + "step": 1850 + }, + { + "epoch": 0.9964336181952762, + "grad_norm": 1.3790895255701852, + "learning_rate": 2.73848814238975e-10, + "loss": 0.3974, + "step": 1851 + }, + { + "epoch": 0.9969719399771213, + "grad_norm": 1.4609310145673613, + "learning_rate": 1.9017331810256002e-10, + "loss": 0.4287, + "step": 1852 + }, + { + "epoch": 0.9975102617589664, + "grad_norm": 1.6915446904327818, + "learning_rate": 1.2171120134185643e-10, + "loss": 0.4238, + "step": 1853 + }, + { + "epoch": 0.9980485835408115, + "grad_norm": 1.636253995850887, + "learning_rate": 6.846267227356152e-11, + "loss": 0.4105, + "step": 1854 + }, + { + "epoch": 0.9985869053226566, + "grad_norm": 1.3210272324277625, + "learning_rate": 3.042789292140302e-11, + "loss": 0.3978, + "step": 1855 + }, + { + "epoch": 0.9991252271045017, + "grad_norm": 1.7798971238230394, + "learning_rate": 7.606979016694383e-12, + "loss": 0.4537, + "step": 1856 + }, + { + "epoch": 0.9996635488863468, + "grad_norm": 1.6132079869080023, + "learning_rate": 0.0, + "loss": 0.4395, + "step": 1857 + }, + { + "epoch": 0.9996635488863468, + "step": 1857, + "total_flos": 1.243798906601472e+16, + "train_loss": 0.0, + "train_runtime": 0.4818, + "train_samples_per_second": 987062.335, + "train_steps_per_second": 3854.561 + } + ], + "logging_steps": 1.0, + "max_steps": 1857, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 60, + 
"stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.243798906601472e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..715fb29 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5406678525d2d2e3ccaccd87229e64dd435334f091405159de955e7657b33804 +size 7096