diff --git a/.gitattributes b/.gitattributes
index 53d7257..21b3632 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -44,4 +44,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
\ No newline at end of file
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
\ No newline at end of file
diff --git a/README.md b/README.md
index e7f08f2..abd6bd0 100644
--- a/README.md
+++ b/README.md
@@ -1,47 +1,407 @@
---
-license: Apache License 2.0
-
-#model-type:
-##如 gpt、phi、llama、chatglm、baichuan 等
-#- gpt
-
-#domain:
-##如 nlp、cv、audio、multi-modal
-#- nlp
-
-#language:
-##语言代码列表 https://help.aliyun.com/document_detail/215387.html?spm=a2c4g.11186623.0.0.9f8d7467kni6Aa
-#- cn
-
-#metrics:
-##如 CIDEr、Blue、ROUGE 等
-#- CIDEr
-
-#tags:
-##各种自定义,包括 pretrained、fine-tuned、instruction-tuned、RL-tuned 等训练方法和其他
-#- pretrained
-
-#tools:
-##如 vllm、fastchat、llamacpp、AdaSeq 等
-#- vllm
+license: llama3.1
+base_model: meta-llama/Meta-Llama-3.1-8B
+tags:
+- generated_from_trainer
+datasets:
+- cognitivecomputations/Dolphin-2.9
+- m-a-p/CodeFeedback-Filtered-Instruction
+- cognitivecomputations/dolphin-coder
+- cognitivecomputations/samantha-data
+- microsoft/orca-math-word-problems-200k
+- mlabonne/FineTome-100k
+- arcee/agent_data
+- PawanKrd/math-gpt-4o-200k
+- cognitivecomputations/SystemChat-2.0
---
-### 当前模型的贡献者未提供更加详细的模型介绍。模型文件和权重,可浏览“模型文件”页面获取。
-#### 您可以通过如下git clone命令,或者ModelScope SDK来下载模型
-SDK下载
-```bash
-#安装ModelScope
-pip install modelscope
+# Dolphin 2.9.4 Llama 3.1 8b 🐬
+
+Curated and trained by Eric Hartford and Cognitive Computations
+
+[Join our Discord](https://discord.gg/h3K4XGj2RH)
+Discord: https://discord.gg/h3K4XGj2RH
+
+
+
+Our appreciation for the sponsors of Dolphin 2.9.4:
+- [Crusoe Cloud](https://crusoe.ai/) - provided excellent on-demand 8xL40S node
+
+This model is based on Meta Llama 3.1 8b, and is governed by the Llama 3.1 license.
+
+The base model has 128K context, and our finetuning used 8192 sequence length.
+
+Dolphin 2.9.4 uses ChatML prompt template format.
+
+example:
+
```
-```python
-#SDK模型下载
-from modelscope import snapshot_download
-model_dir = snapshot_download('dphn/dolphin-2.9.4-llama3.1-8b')
-```
-Git下载
-```
-#Git模型下载
-git clone https://www.modelscope.cn/dphn/dolphin-2.9.4-llama3.1-8b.git
+<|im_start|>system
+You are Dolphin, a helpful AI assistant.<|im_end|>
+<|im_start|>user
+{prompt}<|im_end|>
+<|im_start|>assistant
+
```
-
如果您是本模型的贡献者,我们邀请您根据模型贡献文档,及时完善模型卡片内容。
\ No newline at end of file
+Dolphin-2.9.4 has a variety of instruction following, conversational, and coding skills. It also has agentic abilities and supports function calling.
+It is especially trained to obey the system prompt, and follow instructions in many languages.
+
+Dolphin is uncensored. We have filtered the dataset to remove alignment and bias. This makes the model more compliant. You are advised to implement your own alignment layer before exposing the model as a service. It will be highly compliant with any requests, even unethical ones. Please read my blog post about uncensored models. https://erichartford.com/uncensored-models You are responsible for any content you create using this model. Enjoy responsibly.
+
+
+Evals
+
+```
+hf (pretrained=/workspace/axolotl/dolphin-2.9.4-llama3.1-8b-hf,dtype=bfloat16), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (4)
+| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
+|-----------------------------------------------------------|-------|------|-----:|-----------------------|---|-----:|---|------|
+|leaderboard |N/A |none | 0|acc |↑ |0.2926|± |0.0041|
+| | |none | 0|acc_norm |↑ |0.4513|± |0.0053|
+| | |none | 0|exact_match |↑ |0.0982|± |0.0079|
+| | |none | 0|inst_level_loose_acc |↑ |0.3825|± |N/A |
+| | |none | 0|inst_level_strict_acc |↑ |0.3597|± |N/A |
+| | |none | 0|prompt_level_loose_acc |↑ |0.2421|± |0.0184|
+| | |none | 0|prompt_level_strict_acc|↑ |0.2181|± |0.0178|
+| - leaderboard_bbh |N/A |none | 3|acc_norm |↑ |0.4931|± |0.0061|
+| - leaderboard_bbh_boolean_expressions | 0|none | 3|acc_norm |↑ |0.8000|± |0.0253|
+| - leaderboard_bbh_causal_judgement | 0|none | 3|acc_norm |↑ |0.5615|± |0.0364|
+| - leaderboard_bbh_date_understanding | 0|none | 3|acc_norm |↑ |0.4520|± |0.0315|
+| - leaderboard_bbh_disambiguation_qa | 0|none | 3|acc_norm |↑ |0.6640|± |0.0299|
+| - leaderboard_bbh_formal_fallacies | 0|none | 3|acc_norm |↑ |0.5600|± |0.0315|
+| - leaderboard_bbh_geometric_shapes | 0|none | 3|acc_norm |↑ |0.3640|± |0.0305|
+| - leaderboard_bbh_hyperbaton | 0|none | 3|acc_norm |↑ |0.6320|± |0.0306|
+| - leaderboard_bbh_logical_deduction_five_objects | 0|none | 3|acc_norm |↑ |0.4600|± |0.0316|
+| - leaderboard_bbh_logical_deduction_seven_objects | 0|none | 3|acc_norm |↑ |0.4360|± |0.0314|
+| - leaderboard_bbh_logical_deduction_three_objects | 0|none | 3|acc_norm |↑ |0.6160|± |0.0308|
+| - leaderboard_bbh_movie_recommendation | 0|none | 3|acc_norm |↑ |0.7880|± |0.0259|
+| - leaderboard_bbh_navigate | 0|none | 3|acc_norm |↑ |0.5200|± |0.0317|
+| - leaderboard_bbh_object_counting | 0|none | 3|acc_norm |↑ |0.4520|± |0.0315|
+| - leaderboard_bbh_penguins_in_a_table | 0|none | 3|acc_norm |↑ |0.5205|± |0.0415|
+| - leaderboard_bbh_reasoning_about_colored_objects | 0|none | 3|acc_norm |↑ |0.5120|± |0.0317|
+| - leaderboard_bbh_ruin_names | 0|none | 3|acc_norm |↑ |0.6320|± |0.0306|
+| - leaderboard_bbh_salient_translation_error_detection | 0|none | 3|acc_norm |↑ |0.4320|± |0.0314|
+| - leaderboard_bbh_snarks | 0|none | 3|acc_norm |↑ |0.5843|± |0.0370|
+| - leaderboard_bbh_sports_understanding | 0|none | 3|acc_norm |↑ |0.7040|± |0.0289|
+| - leaderboard_bbh_temporal_sequences | 0|none | 3|acc_norm |↑ |0.1440|± |0.0222|
+| - leaderboard_bbh_tracking_shuffled_objects_five_objects | 0|none | 3|acc_norm |↑ |0.1560|± |0.0230|
+| - leaderboard_bbh_tracking_shuffled_objects_seven_objects| 0|none | 3|acc_norm |↑ |0.1320|± |0.0215|
+| - leaderboard_bbh_tracking_shuffled_objects_three_objects| 0|none | 3|acc_norm |↑ |0.2840|± |0.0286|
+| - leaderboard_bbh_web_of_lies | 0|none | 3|acc_norm |↑ |0.4840|± |0.0317|
+| - leaderboard_gpqa |N/A |none | 0|acc_norm |↑ |0.2903|± |0.0132|
+| - leaderboard_gpqa_diamond | 1|none | 0|acc_norm |↑ |0.2980|± |0.0326|
+| - leaderboard_gpqa_extended | 1|none | 0|acc_norm |↑ |0.2839|± |0.0193|
+| - leaderboard_gpqa_main | 1|none | 0|acc_norm |↑ |0.2946|± |0.0216|
+| - leaderboard_ifeval | 2|none | 0|inst_level_loose_acc |↑ |0.3825|± |N/A |
+| | |none | 0|inst_level_strict_acc |↑ |0.3597|± |N/A |
+| | |none | 0|prompt_level_loose_acc |↑ |0.2421|± |0.0184|
+| | |none | 0|prompt_level_strict_acc|↑ |0.2181|± |0.0178|
+| - leaderboard_math_algebra_hard | 1|none | 4|exact_match |↑ |0.1596|± |0.0209|
+| - leaderboard_math_counting_and_prob_hard | 1|none | 4|exact_match |↑ |0.0488|± |0.0195|
+| - leaderboard_math_geometry_hard | 1|none | 4|exact_match |↑ |0.0530|± |0.0196|
+| - leaderboard_math_hard |N/A |none | 4|exact_match |↑ |0.0982|± |0.0079|
+| - leaderboard_math_intermediate_algebra_hard | 1|none | 4|exact_match |↑ |0.0143|± |0.0071|
+| - leaderboard_math_num_theory_hard | 1|none | 4|exact_match |↑ |0.0455|± |0.0168|
+| - leaderboard_math_prealgebra_hard | 1|none | 4|exact_match |↑ |0.2591|± |0.0316|
+| - leaderboard_math_precalculus_hard | 1|none | 4|exact_match |↑ |0.0519|± |0.0192|
+| - leaderboard_mmlu_pro | 0.1|none | 5|acc |↑ |0.2926|± |0.0041|
+| - leaderboard_musr |N/A |none | 0|acc_norm |↑ |0.3862|± |0.0173|
+| - leaderboard_musr_murder_mysteries | 1|none | 0|acc_norm |↑ |0.5280|± |0.0316|
+| - leaderboard_musr_object_placements | 1|none | 0|acc_norm |↑ |0.3594|± |0.0300|
+| - leaderboard_musr_team_allocation | 1|none | 0|acc_norm |↑ |0.2720|± |0.0282|
+
+| Groups |Version|Filter|n-shot| Metric | |Value | |Stderr|
+|------------------------|-------|------|-----:|-----------------------|---|-----:|---|------|
+|leaderboard |N/A |none | 0|acc |↑ |0.2926|± |0.0041|
+| | |none | 0|acc_norm |↑ |0.4513|± |0.0053|
+| | |none | 0|exact_match |↑ |0.0982|± |0.0079|
+| | |none | 0|inst_level_loose_acc |↑ |0.3825|± |N/A |
+| | |none | 0|inst_level_strict_acc |↑ |0.3597|± |N/A |
+| | |none | 0|prompt_level_loose_acc |↑ |0.2421|± |0.0184|
+| | |none | 0|prompt_level_strict_acc|↑ |0.2181|± |0.0178|
+| - leaderboard_bbh |N/A |none | 3|acc_norm |↑ |0.4931|± |0.0061|
+| - leaderboard_gpqa |N/A |none | 0|acc_norm |↑ |0.2903|± |0.0132|
+| - leaderboard_math_hard|N/A |none | 4|exact_match |↑ |0.0982|± |0.0079|
+| - leaderboard_musr |N/A |none | 0|acc_norm |↑ |0.3862|± |0.0173|
+```
+
+
+
+[
](https://github.com/axolotl-ai-cloud/axolotl)
+See axolotl config
+
+axolotl version: `0.4.1`
+```yaml
+base_model: meta-llama/Meta-Llama-3.1-8B
+model_type: LlamaForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: false
+# load_in_4bit: true
+strict: false
+
+datasets:
+ - path: /workspace/datasets/dolphin-2.9.4/dolphin201-sharegpt2.jsonl
+ type: sharegpt
+ conversation: chatml
+
+chat_template: chatml
+# adapter: qlora
+# lora_r: 128
+# lora_alpha: 16
+# lora_modules_to_save: [embed_tokens, lm_head]
+# lora_dropout: 0.05
+# lora_target_linear: true
+
+unfrozen_parameters:
+- input_layernorm
+- model.norm
+- post_attention_layernorm
+- self_attn.rotary_emb
+- ^lm_head.weight$
+- ^model.embed_tokens.weight$
+# mlp.down_proj layers
+- model.layers.1.mlp.down_proj
+- model.layers.0.mlp.down_proj
+- model.layers.30.mlp.down_proj
+- model.layers.2.mlp.down_proj
+- model.layers.21.mlp.down_proj
+- model.layers.22.mlp.down_proj
+- model.layers.29.mlp.down_proj
+- model.layers.5.mlp.down_proj
+- model.layers.4.mlp.down_proj
+- model.layers.20.mlp.down_proj
+- model.layers.23.mlp.down_proj
+- model.layers.19.mlp.down_proj
+- model.layers.3.mlp.down_proj
+- model.layers.17.mlp.down_proj
+- model.layers.6.mlp.down_proj
+- model.layers.31.mlp.down_proj
+# mlp.up_proj layers
+- model.layers.4.mlp.up_proj
+- model.layers.3.mlp.up_proj
+- model.layers.0.mlp.up_proj
+- model.layers.5.mlp.up_proj
+- model.layers.7.mlp.up_proj
+- model.layers.6.mlp.up_proj
+- model.layers.2.mlp.up_proj
+- model.layers.1.mlp.up_proj
+- model.layers.8.mlp.up_proj
+- model.layers.12.mlp.up_proj
+- model.layers.14.mlp.up_proj
+- model.layers.9.mlp.up_proj
+- model.layers.15.mlp.up_proj
+- model.layers.17.mlp.up_proj
+- model.layers.13.mlp.up_proj
+- model.layers.19.mlp.up_proj
+# self_attn.k_proj layers
+- model.layers.29.self_attn.k_proj
+- model.layers.25.self_attn.k_proj
+- model.layers.23.self_attn.k_proj
+- model.layers.28.self_attn.k_proj
+- model.layers.21.self_attn.k_proj
+- model.layers.19.self_attn.k_proj
+- model.layers.22.self_attn.k_proj
+- model.layers.20.self_attn.k_proj
+- model.layers.24.self_attn.k_proj
+- model.layers.31.self_attn.k_proj
+- model.layers.27.self_attn.k_proj
+- model.layers.26.self_attn.k_proj
+- model.layers.17.self_attn.k_proj
+- model.layers.11.self_attn.k_proj
+- model.layers.18.self_attn.k_proj
+- model.layers.14.self_attn.k_proj
+# self_attn.o_proj layers
+- model.layers.14.self_attn.o_proj
+- model.layers.7.self_attn.o_proj
+- model.layers.5.self_attn.o_proj
+- model.layers.11.self_attn.o_proj
+- model.layers.6.self_attn.o_proj
+- model.layers.24.self_attn.o_proj
+- model.layers.9.self_attn.o_proj
+- model.layers.13.self_attn.o_proj
+- model.layers.10.self_attn.o_proj
+- model.layers.12.self_attn.o_proj
+- model.layers.8.self_attn.o_proj
+- model.layers.25.self_attn.o_proj
+- model.layers.21.self_attn.o_proj
+- model.layers.23.self_attn.o_proj
+- model.layers.15.self_attn.o_proj
+- model.layers.16.self_attn.o_proj
+# self_attn.q_proj layers
+- model.layers.8.self_attn.q_proj
+- model.layers.13.self_attn.q_proj
+- model.layers.9.self_attn.q_proj
+- model.layers.14.self_attn.q_proj
+- model.layers.10.self_attn.q_proj
+- model.layers.11.self_attn.q_proj
+- model.layers.0.self_attn.q_proj
+- model.layers.15.self_attn.q_proj
+- model.layers.1.self_attn.q_proj
+- model.layers.6.self_attn.q_proj
+- model.layers.5.self_attn.q_proj
+- model.layers.7.self_attn.q_proj
+- model.layers.12.self_attn.q_proj
+- model.layers.16.self_attn.q_proj
+- model.layers.17.self_attn.q_proj
+- model.layers.26.self_attn.q_proj
+# self_attn.v_proj layers
+- model.layers.26.self_attn.v_proj
+- model.layers.17.self_attn.v_proj
+- model.layers.3.self_attn.v_proj
+- model.layers.28.self_attn.v_proj
+- model.layers.29.self_attn.v_proj
+- model.layers.21.self_attn.v_proj
+- model.layers.15.self_attn.v_proj
+- model.layers.16.self_attn.v_proj
+- model.layers.20.self_attn.v_proj
+- model.layers.25.self_attn.v_proj
+- model.layers.6.self_attn.v_proj
+- model.layers.23.self_attn.v_proj
+- model.layers.4.self_attn.v_proj
+- model.layers.1.self_attn.v_proj
+- model.layers.22.self_attn.v_proj
+- model.layers.14.self_attn.v_proj
+# mlp.gate_proj layers
+- model.layers.1.mlp.gate_proj
+- model.layers.2.mlp.gate_proj
+- model.layers.3.mlp.gate_proj
+- model.layers.4.mlp.gate_proj
+- model.layers.0.mlp.gate_proj
+- model.layers.25.mlp.gate_proj
+- model.layers.26.mlp.gate_proj
+- model.layers.5.mlp.gate_proj
+- model.layers.24.mlp.gate_proj
+- model.layers.28.mlp.gate_proj
+- model.layers.23.mlp.gate_proj
+- model.layers.27.mlp.gate_proj
+- model.layers.21.mlp.gate_proj
+- model.layers.22.mlp.gate_proj
+- model.layers.29.mlp.gate_proj
+- model.layers.20.mlp.gate_proj
+
+
+
+
+dataset_prepared_path: /workspace/axolotl/dolph-2.9.4-nemo-prepared
+val_set_size: 0.01
+output_dir: /workspace/axolotl/dolphin-2.9.4-llama3.1-8b
+
+sequence_len: 8192
+sample_packing: true
+pad_to_sequence_len: true
+
+wandb_project: dolphin-2.9.4-llama3.1-8b
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 16
+micro_batch_size: 2
+num_epochs: 3
+optimizer: adamw_torch
+lr_scheduler: cosine
+learning_rate: 5e-6
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32:
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 100
+# evals_per_epoch: 4
+eval_table_size:
+saves_per_epoch: 1
+save_total_limit: 2
+save_steps:
+debug:
+deepspeed: deepspeed_configs/zero3_bf16.json
+weight_decay: 0.1
+special_tokens:
+ eos_token: "<|im_end|>"
+ bos_token: "<|begin_of_text|>"
+ pad_token: "<|finetune_right_pad_id|>"
+tokens:
+ - "<|im_start|>"
+
+
+# fsdp:
+# - full_shard
+# - auto_wrap
+# fsdp_config:
+# fsdp_limit_all_gathers: true
+# fsdp_sync_module_states: true
+# fsdp_offload_params: true
+# fsdp_use_orig_params: false
+# fsdp_cpu_ram_efficient_loading: true
+# fsdp_transformer_layer_cls_to_wrap: MixtralSparseMoeBlock
+# fsdp_state_dict_type: FULL_STATE_DICT
+# fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+# fsdp_sharding_strategy: FULL_SHARD
+# fsdp_forward_prefetch: false
+# fsdp_backward_prefetch: BACKWARD_PRE
+```
+
+
+
+# workspace/axolotl/dolphin-2.9.4-llama3.1-8b
+
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the datasets listed in the model card metadata above.
+It achieves the following results on the evaluation set:
+- Loss: 0.5655
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 5e-06
+- train_batch_size: 2
+- eval_batch_size: 2
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- gradient_accumulation_steps: 16
+- total_train_batch_size: 256
+- total_eval_batch_size: 16
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 100
+- num_epochs: 3
+
+### Training results
+
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 0.5837 | 1.0180 | 1161 | 0.5814 |
+| 0.5525 | 2.0179 | 2322 | 0.5671 |
+| 0.5514 | 2.9624 | 3420 | 0.5655 |
+
+
+### Framework versions
+
+- Transformers 4.44.0.dev0
+- Pytorch 2.4.0+cu121
+- Datasets 2.19.1
+- Tokenizers 0.19.1
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..e860e11
--- /dev/null
+++ b/config.json
@@ -0,0 +1,35 @@
+{
+ "_name_or_path": "meta-llama/Meta-Llama-3.1-8B",
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 128000,
+ "eos_token_id": [128001, 128008, 128009, 128256],
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 14336,
+ "max_position_embeddings": 131072,
+ "mlp_bias": false,
+ "model_type": "llama",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
+ "num_key_value_heads": 8,
+ "pretraining_tp": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": {
+ "factor": 8.0,
+ "high_freq_factor": 4.0,
+ "low_freq_factor": 1.0,
+ "original_max_position_embeddings": 8192,
+ "rope_type": "llama3"
+ },
+ "rope_theta": 500000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.44.0.dev0",
+ "use_cache": false,
+ "vocab_size": 128258
+}
diff --git a/configuration.json b/configuration.json
new file mode 100644
index 0000000..159097f
--- /dev/null
+++ b/configuration.json
@@ -0,0 +1 @@
+{"framework": "pytorch", "task": "others", "allow_remote": true}
\ No newline at end of file
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000..3cfa120
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,9 @@
+{
+ "_from_model_config": true,
+ "bos_token_id": 128000,
+ "do_sample": true,
+ "eos_token_id": [128001, 128008, 128009, 128256],
+ "temperature": 0.6,
+ "top_p": 0.9,
+ "transformers_version": "4.44.0.dev0"
+}
diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors
new file mode 100644
index 0000000..ff42c2e
--- /dev/null
+++ b/model-00001-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79c3df60bcc0c481f166d65e685f5a1e5c6e9b846f67d7cb476b2b267c17f3b9
+size 135
diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors
new file mode 100644
index 0000000..e457368
--- /dev/null
+++ b/model-00002-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57152fab7ff8c285b7d50c70941f139ebc32f00a26b1b7b6e4ffcf6bbc0ba4a1
+size 135
diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors
new file mode 100644
index 0000000..fe7038b
--- /dev/null
+++ b/model-00003-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7943a969ca2cafa86020e6be056955430e68117ac070309c8070e9d25c6ab612
+size 135
diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors
new file mode 100644
index 0000000..7ead629
--- /dev/null
+++ b/model-00004-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11a99634b76a25de6eacd96e8419e7a4ea0a1087f85db089f0d8c419813d6253
+size 135
diff --git a/model.safetensors.index.json b/model.safetensors.index.json
new file mode 100644
index 0000000..8d8b4ac
--- /dev/null
+++ b/model.safetensors.index.json
@@ -0,0 +1,298 @@
+{
+ "metadata": {
+ "total_size": 16060555264
+ },
+ "weight_map": {
+ "lm_head.weight": "model-00004-of-00004.safetensors",
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
+ "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.norm.weight": "model-00004-of-00004.safetensors"
+ }
+}
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000..9414f9e
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000..8e137c7
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9c5cd789d40eb50c3e811660a41958e2a3e5521876f9037d15326e516b14e49
+size 9086034
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000..8ffbd5f
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,2079 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128256": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128257": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|im_end|>",
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|finetune_right_pad_id|>",
+ "tokenizer_class": "PreTrainedTokenizerFast"
+}
diff --git a/trainer_state.json b/trainer_state.json
new file mode 100644
index 0000000..303fecd
--- /dev/null
+++ b/trainer_state.json
@@ -0,0 +1,8013 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.9996163752945689,
+ "eval_steps": 500,
+ "global_step": 1140,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0008768564695566394,
+ "grad_norm": 3.8354088038954104,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 0.8827,
+ "step": 1
+ },
+ {
+ "epoch": 0.0017537129391132788,
+ "grad_norm": 3.854484535409196,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 0.8816,
+ "step": 2
+ },
+ {
+ "epoch": 0.0026305694086699184,
+ "grad_norm": 3.871894613191576,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 0.8801,
+ "step": 3
+ },
+ {
+ "epoch": 0.0035074258782265577,
+ "grad_norm": 4.015192807591418,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 0.8778,
+ "step": 4
+ },
+ {
+ "epoch": 0.004384282347783197,
+ "grad_norm": 3.8093684146898625,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 0.8711,
+ "step": 5
+ },
+ {
+ "epoch": 0.005261138817339837,
+ "grad_norm": 3.8610474891808035,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 0.8774,
+ "step": 6
+ },
+ {
+ "epoch": 0.0061379952868964765,
+ "grad_norm": 3.7967273935876027,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 0.8669,
+ "step": 7
+ },
+ {
+ "epoch": 0.007014851756453115,
+ "grad_norm": 3.6775126026184703,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 0.8605,
+ "step": 8
+ },
+ {
+ "epoch": 0.007891708226009755,
+ "grad_norm": 3.8340713786963674,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 0.8735,
+ "step": 9
+ },
+ {
+ "epoch": 0.008768564695566394,
+ "grad_norm": 3.7479501504503463,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 0.8843,
+ "step": 10
+ },
+ {
+ "epoch": 0.009645421165123035,
+ "grad_norm": 3.6317203672346734,
+ "learning_rate": 5.5e-07,
+ "loss": 0.8637,
+ "step": 11
+ },
+ {
+ "epoch": 0.010522277634679673,
+ "grad_norm": 3.512911808429478,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 0.8649,
+ "step": 12
+ },
+ {
+ "epoch": 0.011399134104236312,
+ "grad_norm": 3.5056527507086486,
+ "learning_rate": 6.5e-07,
+ "loss": 0.8514,
+ "step": 13
+ },
+ {
+ "epoch": 0.012275990573792953,
+ "grad_norm": 3.150666271402955,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 0.844,
+ "step": 14
+ },
+ {
+ "epoch": 0.013152847043349592,
+ "grad_norm": 2.92608322776606,
+ "learning_rate": 7.5e-07,
+ "loss": 0.8382,
+ "step": 15
+ },
+ {
+ "epoch": 0.01402970351290623,
+ "grad_norm": 3.0202821236842246,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 0.8419,
+ "step": 16
+ },
+ {
+ "epoch": 0.014906559982462871,
+ "grad_norm": 2.9419098502173515,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 0.8362,
+ "step": 17
+ },
+ {
+ "epoch": 0.01578341645201951,
+ "grad_norm": 2.7926753613205433,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 0.825,
+ "step": 18
+ },
+ {
+ "epoch": 0.01666027292157615,
+ "grad_norm": 2.4471605086654096,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 0.7904,
+ "step": 19
+ },
+ {
+ "epoch": 0.017537129391132788,
+ "grad_norm": 1.8918627793518321,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 0.7968,
+ "step": 20
+ },
+ {
+ "epoch": 0.018413985860689427,
+ "grad_norm": 1.713937144355921,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 0.7828,
+ "step": 21
+ },
+ {
+ "epoch": 0.01929084233024607,
+ "grad_norm": 1.4451729443975803,
+ "learning_rate": 1.1e-06,
+ "loss": 0.78,
+ "step": 22
+ },
+ {
+ "epoch": 0.020167698799802708,
+ "grad_norm": 1.0866085026095695,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 0.7807,
+ "step": 23
+ },
+ {
+ "epoch": 0.021044555269359347,
+ "grad_norm": 1.022948274017058,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 0.758,
+ "step": 24
+ },
+ {
+ "epoch": 0.021921411738915986,
+ "grad_norm": 0.976807823206357,
+ "learning_rate": 1.25e-06,
+ "loss": 0.7783,
+ "step": 25
+ },
+ {
+ "epoch": 0.022798268208472625,
+ "grad_norm": 2.5562950715507275,
+ "learning_rate": 1.3e-06,
+ "loss": 0.7815,
+ "step": 26
+ },
+ {
+ "epoch": 0.023675124678029263,
+ "grad_norm": 1.7956421603987698,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 0.759,
+ "step": 27
+ },
+ {
+ "epoch": 0.024551981147585906,
+ "grad_norm": 1.3622207205502601,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 0.7551,
+ "step": 28
+ },
+ {
+ "epoch": 0.025428837617142545,
+ "grad_norm": 0.9842354354215974,
+ "learning_rate": 1.45e-06,
+ "loss": 0.7625,
+ "step": 29
+ },
+ {
+ "epoch": 0.026305694086699184,
+ "grad_norm": 0.7679059075291825,
+ "learning_rate": 1.5e-06,
+ "loss": 0.7513,
+ "step": 30
+ },
+ {
+ "epoch": 0.027182550556255822,
+ "grad_norm": 0.709914193704945,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 0.7309,
+ "step": 31
+ },
+ {
+ "epoch": 0.02805940702581246,
+ "grad_norm": 0.5711165082308596,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 0.7358,
+ "step": 32
+ },
+ {
+ "epoch": 0.0289362634953691,
+ "grad_norm": 0.6732600160748007,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 0.746,
+ "step": 33
+ },
+ {
+ "epoch": 0.029813119964925743,
+ "grad_norm": 0.519623223105866,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 0.7408,
+ "step": 34
+ },
+ {
+ "epoch": 0.03068997643448238,
+ "grad_norm": 0.4967853550459734,
+ "learning_rate": 1.75e-06,
+ "loss": 0.7284,
+ "step": 35
+ },
+ {
+ "epoch": 0.03156683290403902,
+ "grad_norm": 0.4558474579400771,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 0.7337,
+ "step": 36
+ },
+ {
+ "epoch": 0.03244368937359566,
+ "grad_norm": 0.5187940265183988,
+ "learning_rate": 1.85e-06,
+ "loss": 0.7459,
+ "step": 37
+ },
+ {
+ "epoch": 0.0333205458431523,
+ "grad_norm": 0.46649520265418404,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 0.7238,
+ "step": 38
+ },
+ {
+ "epoch": 0.03419740231270894,
+ "grad_norm": 0.4621107554297482,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 0.7243,
+ "step": 39
+ },
+ {
+ "epoch": 0.035074258782265576,
+ "grad_norm": 0.4493723053379801,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 0.7395,
+ "step": 40
+ },
+ {
+ "epoch": 0.035951115251822215,
+ "grad_norm": 0.4196555282378131,
+ "learning_rate": 2.05e-06,
+ "loss": 0.7371,
+ "step": 41
+ },
+ {
+ "epoch": 0.036827971721378853,
+ "grad_norm": 0.3836269605839978,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.7172,
+ "step": 42
+ },
+ {
+ "epoch": 0.0377048281909355,
+ "grad_norm": 0.38056806308372326,
+ "learning_rate": 2.15e-06,
+ "loss": 0.7163,
+ "step": 43
+ },
+ {
+ "epoch": 0.03858168466049214,
+ "grad_norm": 0.3561457145290273,
+ "learning_rate": 2.2e-06,
+ "loss": 0.6986,
+ "step": 44
+ },
+ {
+ "epoch": 0.03945854113004878,
+ "grad_norm": 0.3723153937166507,
+ "learning_rate": 2.25e-06,
+ "loss": 0.7154,
+ "step": 45
+ },
+ {
+ "epoch": 0.040335397599605416,
+ "grad_norm": 0.36630666691552083,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 0.7201,
+ "step": 46
+ },
+ {
+ "epoch": 0.041212254069162055,
+ "grad_norm": 0.3482645877468935,
+ "learning_rate": 2.35e-06,
+ "loss": 0.7213,
+ "step": 47
+ },
+ {
+ "epoch": 0.042089110538718694,
+ "grad_norm": 0.35892687942862245,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 0.7167,
+ "step": 48
+ },
+ {
+ "epoch": 0.04296596700827533,
+ "grad_norm": 0.3353339246028489,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 0.7154,
+ "step": 49
+ },
+ {
+ "epoch": 0.04384282347783197,
+ "grad_norm": 0.3327601533732165,
+ "learning_rate": 2.5e-06,
+ "loss": 0.7149,
+ "step": 50
+ },
+ {
+ "epoch": 0.04471967994738861,
+ "grad_norm": 0.31047839521651305,
+ "learning_rate": 2.55e-06,
+ "loss": 0.7022,
+ "step": 51
+ },
+ {
+ "epoch": 0.04559653641694525,
+ "grad_norm": 0.3140715368302216,
+ "learning_rate": 2.6e-06,
+ "loss": 0.7024,
+ "step": 52
+ },
+ {
+ "epoch": 0.04647339288650189,
+ "grad_norm": 0.3070088967685052,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 0.7116,
+ "step": 53
+ },
+ {
+ "epoch": 0.04735024935605853,
+ "grad_norm": 0.29688015435603987,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.7068,
+ "step": 54
+ },
+ {
+ "epoch": 0.04822710582561517,
+ "grad_norm": 0.312569173156887,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 0.708,
+ "step": 55
+ },
+ {
+ "epoch": 0.04910396229517181,
+ "grad_norm": 0.3212155084231398,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.6895,
+ "step": 56
+ },
+ {
+ "epoch": 0.04998081876472845,
+ "grad_norm": 0.30141336197411556,
+ "learning_rate": 2.85e-06,
+ "loss": 0.714,
+ "step": 57
+ },
+ {
+ "epoch": 0.05085767523428509,
+ "grad_norm": 0.2678799864293998,
+ "learning_rate": 2.9e-06,
+ "loss": 0.6864,
+ "step": 58
+ },
+ {
+ "epoch": 0.05173453170384173,
+ "grad_norm": 0.2763602360222888,
+ "learning_rate": 2.95e-06,
+ "loss": 0.6955,
+ "step": 59
+ },
+ {
+ "epoch": 0.05261138817339837,
+ "grad_norm": 0.2960116429627635,
+ "learning_rate": 3e-06,
+ "loss": 0.69,
+ "step": 60
+ },
+ {
+ "epoch": 0.053488244642955006,
+ "grad_norm": 0.3126860845251708,
+ "learning_rate": 3.05e-06,
+ "loss": 0.7008,
+ "step": 61
+ },
+ {
+ "epoch": 0.054365101112511645,
+ "grad_norm": 0.2684477743603555,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 0.7065,
+ "step": 62
+ },
+ {
+ "epoch": 0.055241957582068284,
+ "grad_norm": 0.2831279869843839,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.6908,
+ "step": 63
+ },
+ {
+ "epoch": 0.05611881405162492,
+ "grad_norm": 0.28914936357131454,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.6847,
+ "step": 64
+ },
+ {
+ "epoch": 0.05699567052118156,
+ "grad_norm": 0.2664694092243829,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.6975,
+ "step": 65
+ },
+ {
+ "epoch": 0.0578725269907382,
+ "grad_norm": 0.2670931319561963,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.6957,
+ "step": 66
+ },
+ {
+ "epoch": 0.058749383460294846,
+ "grad_norm": 0.25481964712146327,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 0.6907,
+ "step": 67
+ },
+ {
+ "epoch": 0.059626239929851485,
+ "grad_norm": 0.2917224006438053,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.6889,
+ "step": 68
+ },
+ {
+ "epoch": 0.060503096399408124,
+ "grad_norm": 0.27794604488949715,
+ "learning_rate": 3.45e-06,
+ "loss": 0.6815,
+ "step": 69
+ },
+ {
+ "epoch": 0.06137995286896476,
+ "grad_norm": 0.24963117175569036,
+ "learning_rate": 3.5e-06,
+ "loss": 0.6883,
+ "step": 70
+ },
+ {
+ "epoch": 0.0622568093385214,
+ "grad_norm": 0.2893133633641976,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.6792,
+ "step": 71
+ },
+ {
+ "epoch": 0.06313366580807804,
+ "grad_norm": 0.2826308836822568,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.7028,
+ "step": 72
+ },
+ {
+ "epoch": 0.06401052227763468,
+ "grad_norm": 0.2640935466003184,
+ "learning_rate": 3.65e-06,
+ "loss": 0.6916,
+ "step": 73
+ },
+ {
+ "epoch": 0.06488737874719132,
+ "grad_norm": 0.24415033172628944,
+ "learning_rate": 3.7e-06,
+ "loss": 0.6839,
+ "step": 74
+ },
+ {
+ "epoch": 0.06576423521674796,
+ "grad_norm": 0.3112401087242733,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.7021,
+ "step": 75
+ },
+ {
+ "epoch": 0.0666410916863046,
+ "grad_norm": 0.2875281112172732,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.6929,
+ "step": 76
+ },
+ {
+ "epoch": 0.06751794815586123,
+ "grad_norm": 0.2874092373703745,
+ "learning_rate": 3.85e-06,
+ "loss": 0.6788,
+ "step": 77
+ },
+ {
+ "epoch": 0.06839480462541787,
+ "grad_norm": 0.26681007920352356,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.6881,
+ "step": 78
+ },
+ {
+ "epoch": 0.06927166109497451,
+ "grad_norm": 0.25207102904583284,
+ "learning_rate": 3.95e-06,
+ "loss": 0.6852,
+ "step": 79
+ },
+ {
+ "epoch": 0.07014851756453115,
+ "grad_norm": 0.2747607135538642,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.6864,
+ "step": 80
+ },
+ {
+ "epoch": 0.07102537403408779,
+ "grad_norm": 0.26361955079133653,
+ "learning_rate": 4.05e-06,
+ "loss": 0.685,
+ "step": 81
+ },
+ {
+ "epoch": 0.07190223050364443,
+ "grad_norm": 0.33310729956901713,
+ "learning_rate": 4.1e-06,
+ "loss": 0.6803,
+ "step": 82
+ },
+ {
+ "epoch": 0.07277908697320107,
+ "grad_norm": 0.2453664087918243,
+ "learning_rate": 4.15e-06,
+ "loss": 0.6761,
+ "step": 83
+ },
+ {
+ "epoch": 0.07365594344275771,
+ "grad_norm": 0.2908734202511105,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.6931,
+ "step": 84
+ },
+ {
+ "epoch": 0.07453279991231436,
+ "grad_norm": 0.2786719287704165,
+ "learning_rate": 4.25e-06,
+ "loss": 0.6874,
+ "step": 85
+ },
+ {
+ "epoch": 0.075409656381871,
+ "grad_norm": 0.271512101257661,
+ "learning_rate": 4.3e-06,
+ "loss": 0.6775,
+ "step": 86
+ },
+ {
+ "epoch": 0.07628651285142764,
+ "grad_norm": 0.2947304767213564,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.6865,
+ "step": 87
+ },
+ {
+ "epoch": 0.07716336932098428,
+ "grad_norm": 0.25160176616217883,
+ "learning_rate": 4.4e-06,
+ "loss": 0.6785,
+ "step": 88
+ },
+ {
+ "epoch": 0.07804022579054092,
+ "grad_norm": 0.32459153781403244,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.6773,
+ "step": 89
+ },
+ {
+ "epoch": 0.07891708226009755,
+ "grad_norm": 0.2487028104553641,
+ "learning_rate": 4.5e-06,
+ "loss": 0.6812,
+ "step": 90
+ },
+ {
+ "epoch": 0.07979393872965419,
+ "grad_norm": 0.2925038544983962,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.6791,
+ "step": 91
+ },
+ {
+ "epoch": 0.08067079519921083,
+ "grad_norm": 0.28005649996035475,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.6704,
+ "step": 92
+ },
+ {
+ "epoch": 0.08154765166876747,
+ "grad_norm": 0.3264776457957641,
+ "learning_rate": 4.65e-06,
+ "loss": 0.6772,
+ "step": 93
+ },
+ {
+ "epoch": 0.08242450813832411,
+ "grad_norm": 0.2533079586966528,
+ "learning_rate": 4.7e-06,
+ "loss": 0.6792,
+ "step": 94
+ },
+ {
+ "epoch": 0.08330136460788075,
+ "grad_norm": 0.25651763696878965,
+ "learning_rate": 4.75e-06,
+ "loss": 0.6607,
+ "step": 95
+ },
+ {
+ "epoch": 0.08417822107743739,
+ "grad_norm": 0.2546288408258964,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.6669,
+ "step": 96
+ },
+ {
+ "epoch": 0.08505507754699403,
+ "grad_norm": 0.25215356470309513,
+ "learning_rate": 4.85e-06,
+ "loss": 0.6846,
+ "step": 97
+ },
+ {
+ "epoch": 0.08593193401655067,
+ "grad_norm": 0.28631928221309494,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.6717,
+ "step": 98
+ },
+ {
+ "epoch": 0.0868087904861073,
+ "grad_norm": 0.27212851090592044,
+ "learning_rate": 4.95e-06,
+ "loss": 0.6804,
+ "step": 99
+ },
+ {
+ "epoch": 0.08768564695566394,
+ "grad_norm": 0.29348118762199116,
+ "learning_rate": 5e-06,
+ "loss": 0.669,
+ "step": 100
+ },
+ {
+ "epoch": 0.08856250342522058,
+ "grad_norm": 0.30678288402779474,
+ "learning_rate": 4.999998880733363e-06,
+ "loss": 0.6631,
+ "step": 101
+ },
+ {
+ "epoch": 0.08943935989477722,
+ "grad_norm": 0.3011120934546324,
+ "learning_rate": 4.999995522934454e-06,
+ "loss": 0.679,
+ "step": 102
+ },
+ {
+ "epoch": 0.09031621636433386,
+ "grad_norm": 0.31706623056013666,
+ "learning_rate": 4.9999899266062804e-06,
+ "loss": 0.6723,
+ "step": 103
+ },
+ {
+ "epoch": 0.0911930728338905,
+ "grad_norm": 0.3120471729111099,
+ "learning_rate": 4.999982091753851e-06,
+ "loss": 0.6613,
+ "step": 104
+ },
+ {
+ "epoch": 0.09206992930344714,
+ "grad_norm": 0.2905613969012575,
+ "learning_rate": 4.999972018384183e-06,
+ "loss": 0.6611,
+ "step": 105
+ },
+ {
+ "epoch": 0.09294678577300378,
+ "grad_norm": 0.28925318733211003,
+ "learning_rate": 4.999959706506297e-06,
+ "loss": 0.6695,
+ "step": 106
+ },
+ {
+ "epoch": 0.09382364224256041,
+ "grad_norm": 0.28085987028825943,
+ "learning_rate": 4.999945156131215e-06,
+ "loss": 0.6502,
+ "step": 107
+ },
+ {
+ "epoch": 0.09470049871211705,
+ "grad_norm": 0.30971852568333075,
+ "learning_rate": 4.9999283672719665e-06,
+ "loss": 0.672,
+ "step": 108
+ },
+ {
+ "epoch": 0.0955773551816737,
+ "grad_norm": 0.32363303577963826,
+ "learning_rate": 4.999909339943585e-06,
+ "loss": 0.673,
+ "step": 109
+ },
+ {
+ "epoch": 0.09645421165123035,
+ "grad_norm": 0.29549042512555623,
+ "learning_rate": 4.999888074163108e-06,
+ "loss": 0.6591,
+ "step": 110
+ },
+ {
+ "epoch": 0.09733106812078698,
+ "grad_norm": 0.33514032815726946,
+ "learning_rate": 4.999864569949576e-06,
+ "loss": 0.6673,
+ "step": 111
+ },
+ {
+ "epoch": 0.09820792459034362,
+ "grad_norm": 0.3092438114721304,
+ "learning_rate": 4.999838827324036e-06,
+ "loss": 0.6641,
+ "step": 112
+ },
+ {
+ "epoch": 0.09908478105990026,
+ "grad_norm": 0.35403209993563217,
+ "learning_rate": 4.999810846309539e-06,
+ "loss": 0.6597,
+ "step": 113
+ },
+ {
+ "epoch": 0.0999616375294569,
+ "grad_norm": 0.2964896689419525,
+ "learning_rate": 4.999780626931136e-06,
+ "loss": 0.67,
+ "step": 114
+ },
+ {
+ "epoch": 0.10083849399901354,
+ "grad_norm": 0.3484706075226941,
+ "learning_rate": 4.999748169215891e-06,
+ "loss": 0.6745,
+ "step": 115
+ },
+ {
+ "epoch": 0.10171535046857018,
+ "grad_norm": 0.33505074735981694,
+ "learning_rate": 4.999713473192863e-06,
+ "loss": 0.6591,
+ "step": 116
+ },
+ {
+ "epoch": 0.10259220693812682,
+ "grad_norm": 0.27082614750107925,
+ "learning_rate": 4.999676538893121e-06,
+ "loss": 0.6621,
+ "step": 117
+ },
+ {
+ "epoch": 0.10346906340768346,
+ "grad_norm": 0.3506965847465109,
+ "learning_rate": 4.999637366349736e-06,
+ "loss": 0.6733,
+ "step": 118
+ },
+ {
+ "epoch": 0.1043459198772401,
+ "grad_norm": 0.27422374937685745,
+ "learning_rate": 4.999595955597784e-06,
+ "loss": 0.655,
+ "step": 119
+ },
+ {
+ "epoch": 0.10522277634679673,
+ "grad_norm": 0.33620430443399,
+ "learning_rate": 4.999552306674345e-06,
+ "loss": 0.6755,
+ "step": 120
+ },
+ {
+ "epoch": 0.10609963281635337,
+ "grad_norm": 0.2837804889330797,
+ "learning_rate": 4.999506419618502e-06,
+ "loss": 0.6579,
+ "step": 121
+ },
+ {
+ "epoch": 0.10697648928591001,
+ "grad_norm": 0.37952040871876175,
+ "learning_rate": 4.999458294471342e-06,
+ "loss": 0.6692,
+ "step": 122
+ },
+ {
+ "epoch": 0.10785334575546665,
+ "grad_norm": 0.2690864525050558,
+ "learning_rate": 4.99940793127596e-06,
+ "loss": 0.6494,
+ "step": 123
+ },
+ {
+ "epoch": 0.10873020222502329,
+ "grad_norm": 0.3635002166658454,
+ "learning_rate": 4.999355330077449e-06,
+ "loss": 0.6611,
+ "step": 124
+ },
+ {
+ "epoch": 0.10960705869457993,
+ "grad_norm": 0.29302462194523843,
+ "learning_rate": 4.999300490922911e-06,
+ "loss": 0.6526,
+ "step": 125
+ },
+ {
+ "epoch": 0.11048391516413657,
+ "grad_norm": 0.3058787861740299,
+ "learning_rate": 4.999243413861447e-06,
+ "loss": 0.659,
+ "step": 126
+ },
+ {
+ "epoch": 0.1113607716336932,
+ "grad_norm": 0.332548080761125,
+ "learning_rate": 4.9991840989441665e-06,
+ "loss": 0.6659,
+ "step": 127
+ },
+ {
+ "epoch": 0.11223762810324985,
+ "grad_norm": 0.29432766212441813,
+ "learning_rate": 4.999122546224181e-06,
+ "loss": 0.6447,
+ "step": 128
+ },
+ {
+ "epoch": 0.11311448457280648,
+ "grad_norm": 0.29523416391879537,
+ "learning_rate": 4.999058755756605e-06,
+ "loss": 0.6587,
+ "step": 129
+ },
+ {
+ "epoch": 0.11399134104236312,
+ "grad_norm": 0.32423165831626255,
+ "learning_rate": 4.998992727598557e-06,
+ "loss": 0.6564,
+ "step": 130
+ },
+ {
+ "epoch": 0.11486819751191976,
+ "grad_norm": 0.34859884756639065,
+ "learning_rate": 4.99892446180916e-06,
+ "loss": 0.653,
+ "step": 131
+ },
+ {
+ "epoch": 0.1157450539814764,
+ "grad_norm": 0.30133447855543133,
+ "learning_rate": 4.99885395844954e-06,
+ "loss": 0.647,
+ "step": 132
+ },
+ {
+ "epoch": 0.11662191045103305,
+ "grad_norm": 0.3600942516700186,
+ "learning_rate": 4.998781217582827e-06,
+ "loss": 0.6581,
+ "step": 133
+ },
+ {
+ "epoch": 0.11749876692058969,
+ "grad_norm": 0.29960571448156953,
+ "learning_rate": 4.998706239274153e-06,
+ "loss": 0.6623,
+ "step": 134
+ },
+ {
+ "epoch": 0.11837562339014633,
+ "grad_norm": 0.2992208264370026,
+ "learning_rate": 4.998629023590656e-06,
+ "loss": 0.6538,
+ "step": 135
+ },
+ {
+ "epoch": 0.11925247985970297,
+ "grad_norm": 0.36522912538035174,
+ "learning_rate": 4.998549570601475e-06,
+ "loss": 0.6566,
+ "step": 136
+ },
+ {
+ "epoch": 0.12012933632925961,
+ "grad_norm": 0.2988448634710597,
+ "learning_rate": 4.998467880377754e-06,
+ "loss": 0.673,
+ "step": 137
+ },
+ {
+ "epoch": 0.12100619279881625,
+ "grad_norm": 0.32912250244162505,
+ "learning_rate": 4.998383952992639e-06,
+ "loss": 0.6482,
+ "step": 138
+ },
+ {
+ "epoch": 0.12188304926837289,
+ "grad_norm": 0.37178534793553225,
+ "learning_rate": 4.998297788521279e-06,
+ "loss": 0.6546,
+ "step": 139
+ },
+ {
+ "epoch": 0.12275990573792953,
+ "grad_norm": 0.28062782891296695,
+ "learning_rate": 4.998209387040829e-06,
+ "loss": 0.6527,
+ "step": 140
+ },
+ {
+ "epoch": 0.12363676220748616,
+ "grad_norm": 0.33723394797540485,
+ "learning_rate": 4.998118748630443e-06,
+ "loss": 0.6391,
+ "step": 141
+ },
+ {
+ "epoch": 0.1245136186770428,
+ "grad_norm": 0.2834572318610097,
+ "learning_rate": 4.99802587337128e-06,
+ "loss": 0.6443,
+ "step": 142
+ },
+ {
+ "epoch": 0.12539047514659943,
+ "grad_norm": 0.321495289367043,
+ "learning_rate": 4.997930761346502e-06,
+ "loss": 0.6507,
+ "step": 143
+ },
+ {
+ "epoch": 0.12626733161615608,
+ "grad_norm": 0.3419910878952078,
+ "learning_rate": 4.997833412641274e-06,
+ "loss": 0.6543,
+ "step": 144
+ },
+ {
+ "epoch": 0.1271441880857127,
+ "grad_norm": 0.28772221770446305,
+ "learning_rate": 4.9977338273427625e-06,
+ "loss": 0.6522,
+ "step": 145
+ },
+ {
+ "epoch": 0.12802104455526936,
+ "grad_norm": 0.29706932671928316,
+ "learning_rate": 4.997632005540139e-06,
+ "loss": 0.6677,
+ "step": 146
+ },
+ {
+ "epoch": 0.128897901024826,
+ "grad_norm": 0.29918610448467253,
+ "learning_rate": 4.997527947324573e-06,
+ "loss": 0.6475,
+ "step": 147
+ },
+ {
+ "epoch": 0.12977475749438264,
+ "grad_norm": 0.33103419851925103,
+ "learning_rate": 4.997421652789243e-06,
+ "loss": 0.67,
+ "step": 148
+ },
+ {
+ "epoch": 0.1306516139639393,
+ "grad_norm": 0.27012500247528487,
+ "learning_rate": 4.9973131220293255e-06,
+ "loss": 0.647,
+ "step": 149
+ },
+ {
+ "epoch": 0.13152847043349591,
+ "grad_norm": 0.297677443804652,
+ "learning_rate": 4.9972023551419995e-06,
+ "loss": 0.6519,
+ "step": 150
+ },
+ {
+ "epoch": 0.13240532690305257,
+ "grad_norm": 0.27386600476743567,
+ "learning_rate": 4.997089352226448e-06,
+ "loss": 0.6562,
+ "step": 151
+ },
+ {
+ "epoch": 0.1332821833726092,
+ "grad_norm": 0.3025435071675535,
+ "learning_rate": 4.996974113383854e-06,
+ "loss": 0.6485,
+ "step": 152
+ },
+ {
+ "epoch": 0.13415903984216584,
+ "grad_norm": 0.2928572797854547,
+ "learning_rate": 4.996856638717406e-06,
+ "loss": 0.641,
+ "step": 153
+ },
+ {
+ "epoch": 0.13503589631172247,
+ "grad_norm": 0.28232417223789874,
+ "learning_rate": 4.996736928332292e-06,
+ "loss": 0.6358,
+ "step": 154
+ },
+ {
+ "epoch": 0.13591275278127912,
+ "grad_norm": 0.33877806926878856,
+ "learning_rate": 4.9966149823357e-06,
+ "loss": 0.6558,
+ "step": 155
+ },
+ {
+ "epoch": 0.13678960925083575,
+ "grad_norm": 0.27274924720742,
+ "learning_rate": 4.996490800836825e-06,
+ "loss": 0.6553,
+ "step": 156
+ },
+ {
+ "epoch": 0.1376664657203924,
+ "grad_norm": 0.3145522020468823,
+ "learning_rate": 4.996364383946859e-06,
+ "loss": 0.6458,
+ "step": 157
+ },
+ {
+ "epoch": 0.13854332218994903,
+ "grad_norm": 0.28298098932682264,
+ "learning_rate": 4.996235731778997e-06,
+ "loss": 0.6467,
+ "step": 158
+ },
+ {
+ "epoch": 0.13942017865950568,
+ "grad_norm": 0.3289393703740858,
+ "learning_rate": 4.996104844448438e-06,
+ "loss": 0.6522,
+ "step": 159
+ },
+ {
+ "epoch": 0.1402970351290623,
+ "grad_norm": 0.3242491154179804,
+ "learning_rate": 4.995971722072379e-06,
+ "loss": 0.6579,
+ "step": 160
+ },
+ {
+ "epoch": 0.14117389159861896,
+ "grad_norm": 0.350063023556927,
+ "learning_rate": 4.995836364770018e-06,
+ "loss": 0.6639,
+ "step": 161
+ },
+ {
+ "epoch": 0.14205074806817558,
+ "grad_norm": 0.26800977502782475,
+ "learning_rate": 4.995698772662558e-06,
+ "loss": 0.6564,
+ "step": 162
+ },
+ {
+ "epoch": 0.14292760453773223,
+ "grad_norm": 0.37123972908338404,
+ "learning_rate": 4.9955589458732e-06,
+ "loss": 0.6521,
+ "step": 163
+ },
+ {
+ "epoch": 0.14380446100728886,
+ "grad_norm": 0.25568101611736427,
+ "learning_rate": 4.995416884527147e-06,
+ "loss": 0.6489,
+ "step": 164
+ },
+ {
+ "epoch": 0.1446813174768455,
+ "grad_norm": 0.3502739955437778,
+ "learning_rate": 4.9952725887516015e-06,
+ "loss": 0.6389,
+ "step": 165
+ },
+ {
+ "epoch": 0.14555817394640214,
+ "grad_norm": 0.2695951493086468,
+ "learning_rate": 4.99512605867577e-06,
+ "loss": 0.6409,
+ "step": 166
+ },
+ {
+ "epoch": 0.1464350304159588,
+ "grad_norm": 0.33224546665642934,
+ "learning_rate": 4.994977294430856e-06,
+ "loss": 0.6478,
+ "step": 167
+ },
+ {
+ "epoch": 0.14731188688551541,
+ "grad_norm": 0.26336591640433304,
+ "learning_rate": 4.994826296150064e-06,
+ "loss": 0.6416,
+ "step": 168
+ },
+ {
+ "epoch": 0.14818874335507207,
+ "grad_norm": 0.3158628283831438,
+ "learning_rate": 4.9946730639686025e-06,
+ "loss": 0.6397,
+ "step": 169
+ },
+ {
+ "epoch": 0.14906559982462872,
+ "grad_norm": 0.29572803602407627,
+ "learning_rate": 4.9945175980236745e-06,
+ "loss": 0.6356,
+ "step": 170
+ },
+ {
+ "epoch": 0.14994245629418534,
+ "grad_norm": 0.3344536076519792,
+ "learning_rate": 4.99435989845449e-06,
+ "loss": 0.6494,
+ "step": 171
+ },
+ {
+ "epoch": 0.150819312763742,
+ "grad_norm": 0.2811402499936693,
+ "learning_rate": 4.994199965402252e-06,
+ "loss": 0.6472,
+ "step": 172
+ },
+ {
+ "epoch": 0.15169616923329862,
+ "grad_norm": 0.30351530565920815,
+ "learning_rate": 4.994037799010168e-06,
+ "loss": 0.6514,
+ "step": 173
+ },
+ {
+ "epoch": 0.15257302570285527,
+ "grad_norm": 0.2667020904201129,
+ "learning_rate": 4.993873399423445e-06,
+ "loss": 0.642,
+ "step": 174
+ },
+ {
+ "epoch": 0.1534498821724119,
+ "grad_norm": 0.3062654941965369,
+ "learning_rate": 4.993706766789287e-06,
+ "loss": 0.6398,
+ "step": 175
+ },
+ {
+ "epoch": 0.15432673864196855,
+ "grad_norm": 0.28228507467929365,
+ "learning_rate": 4.993537901256898e-06,
+ "loss": 0.6446,
+ "step": 176
+ },
+ {
+ "epoch": 0.15520359511152518,
+ "grad_norm": 0.3157908119401443,
+ "learning_rate": 4.993366802977486e-06,
+ "loss": 0.645,
+ "step": 177
+ },
+ {
+ "epoch": 0.15608045158108183,
+ "grad_norm": 0.29612114085869035,
+ "learning_rate": 4.993193472104253e-06,
+ "loss": 0.6379,
+ "step": 178
+ },
+ {
+ "epoch": 0.15695730805063846,
+ "grad_norm": 0.31715005105530436,
+ "learning_rate": 4.9930179087924e-06,
+ "loss": 0.6446,
+ "step": 179
+ },
+ {
+ "epoch": 0.1578341645201951,
+ "grad_norm": 0.3010974405602859,
+ "learning_rate": 4.992840113199131e-06,
+ "loss": 0.6273,
+ "step": 180
+ },
+ {
+ "epoch": 0.15871102098975173,
+ "grad_norm": 0.3097310667014726,
+ "learning_rate": 4.992660085483645e-06,
+ "loss": 0.6477,
+ "step": 181
+ },
+ {
+ "epoch": 0.15958787745930839,
+ "grad_norm": 0.25428924204211556,
+ "learning_rate": 4.992477825807142e-06,
+ "loss": 0.6562,
+ "step": 182
+ },
+ {
+ "epoch": 0.160464733928865,
+ "grad_norm": 0.30870425916577926,
+ "learning_rate": 4.992293334332821e-06,
+ "loss": 0.6528,
+ "step": 183
+ },
+ {
+ "epoch": 0.16134159039842166,
+ "grad_norm": 0.2915653234864446,
+ "learning_rate": 4.992106611225875e-06,
+ "loss": 0.6491,
+ "step": 184
+ },
+ {
+ "epoch": 0.1622184468679783,
+ "grad_norm": 0.3032380988277513,
+ "learning_rate": 4.991917656653501e-06,
+ "loss": 0.6523,
+ "step": 185
+ },
+ {
+ "epoch": 0.16309530333753494,
+ "grad_norm": 0.2986663700583823,
+ "learning_rate": 4.991726470784891e-06,
+ "loss": 0.6333,
+ "step": 186
+ },
+ {
+ "epoch": 0.16397215980709157,
+ "grad_norm": 0.28321065505069615,
+ "learning_rate": 4.9915330537912346e-06,
+ "loss": 0.6411,
+ "step": 187
+ },
+ {
+ "epoch": 0.16484901627664822,
+ "grad_norm": 0.358610834369166,
+ "learning_rate": 4.99133740584572e-06,
+ "loss": 0.6404,
+ "step": 188
+ },
+ {
+ "epoch": 0.16572587274620484,
+ "grad_norm": 0.30976208589225795,
+ "learning_rate": 4.991139527123534e-06,
+ "loss": 0.6405,
+ "step": 189
+ },
+ {
+ "epoch": 0.1666027292157615,
+ "grad_norm": 0.34149502314365515,
+ "learning_rate": 4.990939417801859e-06,
+ "loss": 0.6384,
+ "step": 190
+ },
+ {
+ "epoch": 0.16747958568531812,
+ "grad_norm": 0.2959951500432587,
+ "learning_rate": 4.9907370780598754e-06,
+ "loss": 0.6469,
+ "step": 191
+ },
+ {
+ "epoch": 0.16835644215487477,
+ "grad_norm": 0.3302476980977895,
+ "learning_rate": 4.990532508078761e-06,
+ "loss": 0.6359,
+ "step": 192
+ },
+ {
+ "epoch": 0.1692332986244314,
+ "grad_norm": 0.3944297035939378,
+ "learning_rate": 4.990325708041691e-06,
+ "loss": 0.6502,
+ "step": 193
+ },
+ {
+ "epoch": 0.17011015509398805,
+ "grad_norm": 0.360231124267091,
+ "learning_rate": 4.990116678133836e-06,
+ "loss": 0.6424,
+ "step": 194
+ },
+ {
+ "epoch": 0.1709870115635447,
+ "grad_norm": 0.33832741778437936,
+ "learning_rate": 4.989905418542366e-06,
+ "loss": 0.6352,
+ "step": 195
+ },
+ {
+ "epoch": 0.17186386803310133,
+ "grad_norm": 0.36238295597291414,
+ "learning_rate": 4.989691929456443e-06,
+ "loss": 0.6499,
+ "step": 196
+ },
+ {
+ "epoch": 0.17274072450265798,
+ "grad_norm": 0.32684488652867627,
+ "learning_rate": 4.98947621106723e-06,
+ "loss": 0.6475,
+ "step": 197
+ },
+ {
+ "epoch": 0.1736175809722146,
+ "grad_norm": 0.2757346118610075,
+ "learning_rate": 4.989258263567884e-06,
+ "loss": 0.6355,
+ "step": 198
+ },
+ {
+ "epoch": 0.17449443744177126,
+ "grad_norm": 0.29755713041423115,
+ "learning_rate": 4.989038087153556e-06,
+ "loss": 0.6336,
+ "step": 199
+ },
+ {
+ "epoch": 0.17537129391132789,
+ "grad_norm": 0.29151765698243737,
+ "learning_rate": 4.988815682021398e-06,
+ "loss": 0.6471,
+ "step": 200
+ },
+ {
+ "epoch": 0.17624815038088454,
+ "grad_norm": 0.28111823253643253,
+ "learning_rate": 4.988591048370552e-06,
+ "loss": 0.6407,
+ "step": 201
+ },
+ {
+ "epoch": 0.17712500685044116,
+ "grad_norm": 0.2656165957748681,
+ "learning_rate": 4.988364186402159e-06,
+ "loss": 0.6326,
+ "step": 202
+ },
+ {
+ "epoch": 0.17800186331999782,
+ "grad_norm": 0.3028986715129606,
+ "learning_rate": 4.988135096319355e-06,
+ "loss": 0.6348,
+ "step": 203
+ },
+ {
+ "epoch": 0.17887871978955444,
+ "grad_norm": 0.29924585956112065,
+ "learning_rate": 4.987903778327269e-06,
+ "loss": 0.6488,
+ "step": 204
+ },
+ {
+ "epoch": 0.1797555762591111,
+ "grad_norm": 0.2747438588784908,
+ "learning_rate": 4.987670232633027e-06,
+ "loss": 0.6353,
+ "step": 205
+ },
+ {
+ "epoch": 0.18063243272866772,
+ "grad_norm": 0.30887265845064044,
+ "learning_rate": 4.987434459445748e-06,
+ "loss": 0.6428,
+ "step": 206
+ },
+ {
+ "epoch": 0.18150928919822437,
+ "grad_norm": 0.3193061834187564,
+ "learning_rate": 4.987196458976548e-06,
+ "loss": 0.6467,
+ "step": 207
+ },
+ {
+ "epoch": 0.182386145667781,
+ "grad_norm": 0.2769424032566695,
+ "learning_rate": 4.9869562314385335e-06,
+ "loss": 0.6407,
+ "step": 208
+ },
+ {
+ "epoch": 0.18326300213733765,
+ "grad_norm": 0.3406015148633883,
+ "learning_rate": 4.986713777046809e-06,
+ "loss": 0.6443,
+ "step": 209
+ },
+ {
+ "epoch": 0.18413985860689427,
+ "grad_norm": 0.271878066659463,
+ "learning_rate": 4.986469096018472e-06,
+ "loss": 0.6328,
+ "step": 210
+ },
+ {
+ "epoch": 0.18501671507645093,
+ "grad_norm": 0.2987491049335003,
+ "learning_rate": 4.9862221885726115e-06,
+ "loss": 0.6478,
+ "step": 211
+ },
+ {
+ "epoch": 0.18589357154600755,
+ "grad_norm": 0.3087618217189243,
+ "learning_rate": 4.985973054930313e-06,
+ "loss": 0.6363,
+ "step": 212
+ },
+ {
+ "epoch": 0.1867704280155642,
+ "grad_norm": 0.28612704652497223,
+ "learning_rate": 4.985721695314653e-06,
+ "loss": 0.6409,
+ "step": 213
+ },
+ {
+ "epoch": 0.18764728448512083,
+ "grad_norm": 0.26033127989473615,
+ "learning_rate": 4.985468109950704e-06,
+ "loss": 0.6495,
+ "step": 214
+ },
+ {
+ "epoch": 0.18852414095467748,
+ "grad_norm": 0.29345494621139656,
+ "learning_rate": 4.985212299065528e-06,
+ "loss": 0.648,
+ "step": 215
+ },
+ {
+ "epoch": 0.1894009974242341,
+ "grad_norm": 0.30811406203792147,
+ "learning_rate": 4.984954262888182e-06,
+ "loss": 0.639,
+ "step": 216
+ },
+ {
+ "epoch": 0.19027785389379076,
+ "grad_norm": 0.3312828084167346,
+ "learning_rate": 4.9846940016497146e-06,
+ "loss": 0.6403,
+ "step": 217
+ },
+ {
+ "epoch": 0.1911547103633474,
+ "grad_norm": 0.29106752415257064,
+ "learning_rate": 4.984431515583169e-06,
+ "loss": 0.6457,
+ "step": 218
+ },
+ {
+ "epoch": 0.19203156683290404,
+ "grad_norm": 0.2950307203873666,
+ "learning_rate": 4.984166804923576e-06,
+ "loss": 0.6366,
+ "step": 219
+ },
+ {
+ "epoch": 0.1929084233024607,
+ "grad_norm": 0.33001978484003053,
+ "learning_rate": 4.983899869907963e-06,
+ "loss": 0.6519,
+ "step": 220
+ },
+ {
+ "epoch": 0.19378527977201732,
+ "grad_norm": 0.25712182858786903,
+ "learning_rate": 4.983630710775346e-06,
+ "loss": 0.6302,
+ "step": 221
+ },
+ {
+ "epoch": 0.19466213624157397,
+ "grad_norm": 0.33700258932320354,
+ "learning_rate": 4.983359327766735e-06,
+ "loss": 0.6382,
+ "step": 222
+ },
+ {
+ "epoch": 0.1955389927111306,
+ "grad_norm": 0.3195952299259763,
+ "learning_rate": 4.983085721125128e-06,
+ "loss": 0.6408,
+ "step": 223
+ },
+ {
+ "epoch": 0.19641584918068725,
+ "grad_norm": 0.2820582636542398,
+ "learning_rate": 4.982809891095519e-06,
+ "loss": 0.6196,
+ "step": 224
+ },
+ {
+ "epoch": 0.19729270565024387,
+ "grad_norm": 0.30343326038998625,
+ "learning_rate": 4.982531837924887e-06,
+ "loss": 0.6361,
+ "step": 225
+ },
+ {
+ "epoch": 0.19816956211980052,
+ "grad_norm": 0.2724213298701267,
+ "learning_rate": 4.9822515618622055e-06,
+ "loss": 0.6455,
+ "step": 226
+ },
+ {
+ "epoch": 0.19904641858935715,
+ "grad_norm": 0.28433275446155476,
+ "learning_rate": 4.9819690631584375e-06,
+ "loss": 0.6329,
+ "step": 227
+ },
+ {
+ "epoch": 0.1999232750589138,
+ "grad_norm": 0.2641523923467397,
+ "learning_rate": 4.981684342066536e-06,
+ "loss": 0.6301,
+ "step": 228
+ },
+ {
+ "epoch": 0.20080013152847043,
+ "grad_norm": 0.29243768749633176,
+ "learning_rate": 4.9813973988414454e-06,
+ "loss": 0.6369,
+ "step": 229
+ },
+ {
+ "epoch": 0.20167698799802708,
+ "grad_norm": 0.27139535071517695,
+ "learning_rate": 4.981108233740096e-06,
+ "loss": 0.6279,
+ "step": 230
+ },
+ {
+ "epoch": 0.2025538444675837,
+ "grad_norm": 0.27525475223350887,
+ "learning_rate": 4.980816847021412e-06,
+ "loss": 0.6429,
+ "step": 231
+ },
+ {
+ "epoch": 0.20343070093714036,
+ "grad_norm": 0.3427701449667448,
+ "learning_rate": 4.980523238946304e-06,
+ "loss": 0.6438,
+ "step": 232
+ },
+ {
+ "epoch": 0.20430755740669698,
+ "grad_norm": 0.2574596630900604,
+ "learning_rate": 4.980227409777673e-06,
+ "loss": 0.6278,
+ "step": 233
+ },
+ {
+ "epoch": 0.20518441387625364,
+ "grad_norm": 0.3069435432493287,
+ "learning_rate": 4.9799293597804086e-06,
+ "loss": 0.645,
+ "step": 234
+ },
+ {
+ "epoch": 0.20606127034581026,
+ "grad_norm": 0.2861360169316533,
+ "learning_rate": 4.979629089221387e-06,
+ "loss": 0.646,
+ "step": 235
+ },
+ {
+ "epoch": 0.2069381268153669,
+ "grad_norm": 0.258606470239814,
+ "learning_rate": 4.9793265983694775e-06,
+ "loss": 0.638,
+ "step": 236
+ },
+ {
+ "epoch": 0.20781498328492354,
+ "grad_norm": 0.2852233202848665,
+ "learning_rate": 4.9790218874955325e-06,
+ "loss": 0.6233,
+ "step": 237
+ },
+ {
+ "epoch": 0.2086918397544802,
+ "grad_norm": 0.27593128237727194,
+ "learning_rate": 4.978714956872394e-06,
+ "loss": 0.64,
+ "step": 238
+ },
+ {
+ "epoch": 0.20956869622403682,
+ "grad_norm": 0.2721892419938629,
+ "learning_rate": 4.978405806774892e-06,
+ "loss": 0.6242,
+ "step": 239
+ },
+ {
+ "epoch": 0.21044555269359347,
+ "grad_norm": 0.26477694173686633,
+ "learning_rate": 4.978094437479843e-06,
+ "loss": 0.6409,
+ "step": 240
+ },
+ {
+ "epoch": 0.2113224091631501,
+ "grad_norm": 0.29511740452877416,
+ "learning_rate": 4.977780849266054e-06,
+ "loss": 0.6397,
+ "step": 241
+ },
+ {
+ "epoch": 0.21219926563270675,
+ "grad_norm": 0.3137075106480887,
+ "learning_rate": 4.977465042414314e-06,
+ "loss": 0.6185,
+ "step": 242
+ },
+ {
+ "epoch": 0.2130761221022634,
+ "grad_norm": 0.2841757272525764,
+ "learning_rate": 4.9771470172073985e-06,
+ "loss": 0.6394,
+ "step": 243
+ },
+ {
+ "epoch": 0.21395297857182002,
+ "grad_norm": 0.289636229771129,
+ "learning_rate": 4.976826773930076e-06,
+ "loss": 0.6314,
+ "step": 244
+ },
+ {
+ "epoch": 0.21482983504137668,
+ "grad_norm": 0.30163996035868273,
+ "learning_rate": 4.976504312869093e-06,
+ "loss": 0.6347,
+ "step": 245
+ },
+ {
+ "epoch": 0.2157066915109333,
+ "grad_norm": 0.261372963985366,
+ "learning_rate": 4.976179634313187e-06,
+ "loss": 0.6378,
+ "step": 246
+ },
+ {
+ "epoch": 0.21658354798048995,
+ "grad_norm": 0.3277256326536918,
+ "learning_rate": 4.97585273855308e-06,
+ "loss": 0.6326,
+ "step": 247
+ },
+ {
+ "epoch": 0.21746040445004658,
+ "grad_norm": 0.2609300415027874,
+ "learning_rate": 4.975523625881478e-06,
+ "loss": 0.643,
+ "step": 248
+ },
+ {
+ "epoch": 0.21833726091960323,
+ "grad_norm": 0.360435554160976,
+ "learning_rate": 4.975192296593072e-06,
+ "loss": 0.6301,
+ "step": 249
+ },
+ {
+ "epoch": 0.21921411738915986,
+ "grad_norm": 0.33545569496984357,
+ "learning_rate": 4.97485875098454e-06,
+ "loss": 0.6263,
+ "step": 250
+ },
+ {
+ "epoch": 0.2200909738587165,
+ "grad_norm": 0.3109257543138659,
+ "learning_rate": 4.974522989354544e-06,
+ "loss": 0.6409,
+ "step": 251
+ },
+ {
+ "epoch": 0.22096783032827313,
+ "grad_norm": 0.324992218124581,
+ "learning_rate": 4.974185012003727e-06,
+ "loss": 0.634,
+ "step": 252
+ },
+ {
+ "epoch": 0.2218446867978298,
+ "grad_norm": 0.32486130027399085,
+ "learning_rate": 4.97384481923472e-06,
+ "loss": 0.6164,
+ "step": 253
+ },
+ {
+ "epoch": 0.2227215432673864,
+ "grad_norm": 0.37258515700556377,
+ "learning_rate": 4.973502411352136e-06,
+ "loss": 0.6387,
+ "step": 254
+ },
+ {
+ "epoch": 0.22359839973694307,
+ "grad_norm": 0.29043553996012594,
+ "learning_rate": 4.97315778866257e-06,
+ "loss": 0.6287,
+ "step": 255
+ },
+ {
+ "epoch": 0.2244752562064997,
+ "grad_norm": 0.36257038619483317,
+ "learning_rate": 4.972810951474605e-06,
+ "loss": 0.6343,
+ "step": 256
+ },
+ {
+ "epoch": 0.22535211267605634,
+ "grad_norm": 0.2772793728031826,
+ "learning_rate": 4.972461900098801e-06,
+ "loss": 0.6289,
+ "step": 257
+ },
+ {
+ "epoch": 0.22622896914561297,
+ "grad_norm": 0.35920004083908574,
+ "learning_rate": 4.972110634847703e-06,
+ "loss": 0.6532,
+ "step": 258
+ },
+ {
+ "epoch": 0.22710582561516962,
+ "grad_norm": 0.29471007707943336,
+ "learning_rate": 4.97175715603584e-06,
+ "loss": 0.6431,
+ "step": 259
+ },
+ {
+ "epoch": 0.22798268208472625,
+ "grad_norm": 0.3052965075835166,
+ "learning_rate": 4.971401463979722e-06,
+ "loss": 0.6373,
+ "step": 260
+ },
+ {
+ "epoch": 0.2288595385542829,
+ "grad_norm": 0.27702925326859024,
+ "learning_rate": 4.971043558997839e-06,
+ "loss": 0.6254,
+ "step": 261
+ },
+ {
+ "epoch": 0.22973639502383952,
+ "grad_norm": 0.30905022457424325,
+ "learning_rate": 4.9706834414106645e-06,
+ "loss": 0.6377,
+ "step": 262
+ },
+ {
+ "epoch": 0.23061325149339618,
+ "grad_norm": 0.2820956276882666,
+ "learning_rate": 4.970321111540652e-06,
+ "loss": 0.6303,
+ "step": 263
+ },
+ {
+ "epoch": 0.2314901079629528,
+ "grad_norm": 0.3394900289735489,
+ "learning_rate": 4.969956569712238e-06,
+ "loss": 0.6394,
+ "step": 264
+ },
+ {
+ "epoch": 0.23236696443250945,
+ "grad_norm": 0.26647926556067275,
+ "learning_rate": 4.969589816251837e-06,
+ "loss": 0.6202,
+ "step": 265
+ },
+ {
+ "epoch": 0.2332438209020661,
+ "grad_norm": 0.3281231898594553,
+ "learning_rate": 4.9692208514878445e-06,
+ "loss": 0.6343,
+ "step": 266
+ },
+ {
+ "epoch": 0.23412067737162273,
+ "grad_norm": 0.32675488207496506,
+ "learning_rate": 4.968849675750638e-06,
+ "loss": 0.6106,
+ "step": 267
+ },
+ {
+ "epoch": 0.23499753384117938,
+ "grad_norm": 0.28838375524590465,
+ "learning_rate": 4.9684762893725715e-06,
+ "loss": 0.6191,
+ "step": 268
+ },
+ {
+ "epoch": 0.235874390310736,
+ "grad_norm": 0.3568027126734991,
+ "learning_rate": 4.968100692687981e-06,
+ "loss": 0.6492,
+ "step": 269
+ },
+ {
+ "epoch": 0.23675124678029266,
+ "grad_norm": 0.28443576918161984,
+ "learning_rate": 4.967722886033181e-06,
+ "loss": 0.6332,
+ "step": 270
+ },
+ {
+ "epoch": 0.2376281032498493,
+ "grad_norm": 0.34347891151295074,
+ "learning_rate": 4.967342869746463e-06,
+ "loss": 0.6302,
+ "step": 271
+ },
+ {
+ "epoch": 0.23850495971940594,
+ "grad_norm": 0.26856199334324765,
+ "learning_rate": 4.9669606441681005e-06,
+ "loss": 0.6253,
+ "step": 272
+ },
+ {
+ "epoch": 0.23938181618896257,
+ "grad_norm": 0.28792821400673596,
+ "learning_rate": 4.966576209640344e-06,
+ "loss": 0.617,
+ "step": 273
+ },
+ {
+ "epoch": 0.24025867265851922,
+ "grad_norm": 0.2749481611356667,
+ "learning_rate": 4.966189566507418e-06,
+ "loss": 0.6386,
+ "step": 274
+ },
+ {
+ "epoch": 0.24113552912807584,
+ "grad_norm": 0.2499995559979677,
+ "learning_rate": 4.965800715115531e-06,
+ "loss": 0.6281,
+ "step": 275
+ },
+ {
+ "epoch": 0.2420123855976325,
+ "grad_norm": 0.2802197876098476,
+ "learning_rate": 4.965409655812865e-06,
+ "loss": 0.6356,
+ "step": 276
+ },
+ {
+ "epoch": 0.24288924206718912,
+ "grad_norm": 0.27112050232805884,
+ "learning_rate": 4.965016388949579e-06,
+ "loss": 0.6366,
+ "step": 277
+ },
+ {
+ "epoch": 0.24376609853674577,
+ "grad_norm": 0.28745747065199806,
+ "learning_rate": 4.96462091487781e-06,
+ "loss": 0.6245,
+ "step": 278
+ },
+ {
+ "epoch": 0.2446429550063024,
+ "grad_norm": 0.29635776688822807,
+ "learning_rate": 4.96422323395167e-06,
+ "loss": 0.6413,
+ "step": 279
+ },
+ {
+ "epoch": 0.24551981147585905,
+ "grad_norm": 0.3376283192201481,
+ "learning_rate": 4.963823346527249e-06,
+ "loss": 0.6322,
+ "step": 280
+ },
+ {
+ "epoch": 0.24639666794541568,
+ "grad_norm": 0.30520044326595835,
+ "learning_rate": 4.96342125296261e-06,
+ "loss": 0.6173,
+ "step": 281
+ },
+ {
+ "epoch": 0.24727352441497233,
+ "grad_norm": 0.34476437566601653,
+ "learning_rate": 4.963016953617794e-06,
+ "loss": 0.6172,
+ "step": 282
+ },
+ {
+ "epoch": 0.24815038088452895,
+ "grad_norm": 0.2611205789369605,
+ "learning_rate": 4.962610448854816e-06,
+ "loss": 0.6246,
+ "step": 283
+ },
+ {
+ "epoch": 0.2490272373540856,
+ "grad_norm": 0.3294938430549001,
+ "learning_rate": 4.962201739037665e-06,
+ "loss": 0.632,
+ "step": 284
+ },
+ {
+ "epoch": 0.24990409382364223,
+ "grad_norm": 0.2716869569081184,
+ "learning_rate": 4.961790824532306e-06,
+ "loss": 0.6285,
+ "step": 285
+ },
+ {
+ "epoch": 0.25078095029319886,
+ "grad_norm": 0.33415021484488,
+ "learning_rate": 4.961377705706677e-06,
+ "loss": 0.6295,
+ "step": 286
+ },
+ {
+ "epoch": 0.2516578067627555,
+ "grad_norm": 0.3077857421614378,
+ "learning_rate": 4.960962382930691e-06,
+ "loss": 0.6273,
+ "step": 287
+ },
+ {
+ "epoch": 0.25253466323231216,
+ "grad_norm": 0.3027918805177667,
+ "learning_rate": 4.960544856576232e-06,
+ "loss": 0.629,
+ "step": 288
+ },
+ {
+ "epoch": 0.2534115197018688,
+ "grad_norm": 0.2916258020649895,
+ "learning_rate": 4.960125127017159e-06,
+ "loss": 0.6427,
+ "step": 289
+ },
+ {
+ "epoch": 0.2542883761714254,
+ "grad_norm": 0.3152484231550671,
+ "learning_rate": 4.959703194629304e-06,
+ "loss": 0.6348,
+ "step": 290
+ },
+ {
+ "epoch": 0.25516523264098206,
+ "grad_norm": 0.32915709407999866,
+ "learning_rate": 4.959279059790471e-06,
+ "loss": 0.632,
+ "step": 291
+ },
+ {
+ "epoch": 0.2560420891105387,
+ "grad_norm": 0.2817567268029023,
+ "learning_rate": 4.958852722880435e-06,
+ "loss": 0.6112,
+ "step": 292
+ },
+ {
+ "epoch": 0.25691894558009537,
+ "grad_norm": 0.3538236182060425,
+ "learning_rate": 4.958424184280946e-06,
+ "loss": 0.6241,
+ "step": 293
+ },
+ {
+ "epoch": 0.257795802049652,
+ "grad_norm": 0.2864183700965389,
+ "learning_rate": 4.957993444375719e-06,
+ "loss": 0.6277,
+ "step": 294
+ },
+ {
+ "epoch": 0.2586726585192086,
+ "grad_norm": 0.33515303575483923,
+ "learning_rate": 4.95756050355045e-06,
+ "loss": 0.6277,
+ "step": 295
+ },
+ {
+ "epoch": 0.2595495149887653,
+ "grad_norm": 0.31975746198582533,
+ "learning_rate": 4.957125362192794e-06,
+ "loss": 0.6114,
+ "step": 296
+ },
+ {
+ "epoch": 0.2604263714583219,
+ "grad_norm": 0.34329553758734277,
+ "learning_rate": 4.956688020692386e-06,
+ "loss": 0.6457,
+ "step": 297
+ },
+ {
+ "epoch": 0.2613032279278786,
+ "grad_norm": 0.3122307785419701,
+ "learning_rate": 4.956248479440827e-06,
+ "loss": 0.6272,
+ "step": 298
+ },
+ {
+ "epoch": 0.2621800843974352,
+ "grad_norm": 0.3126439049869492,
+ "learning_rate": 4.955806738831687e-06,
+ "loss": 0.634,
+ "step": 299
+ },
+ {
+ "epoch": 0.26305694086699183,
+ "grad_norm": 0.30725526373905826,
+ "learning_rate": 4.955362799260507e-06,
+ "loss": 0.6269,
+ "step": 300
+ },
+ {
+ "epoch": 0.2639337973365485,
+ "grad_norm": 0.2952615284346605,
+ "learning_rate": 4.954916661124797e-06,
+ "loss": 0.6129,
+ "step": 301
+ },
+ {
+ "epoch": 0.26481065380610513,
+ "grad_norm": 0.3284069744839045,
+ "learning_rate": 4.954468324824035e-06,
+ "loss": 0.613,
+ "step": 302
+ },
+ {
+ "epoch": 0.26568751027566173,
+ "grad_norm": 0.34051928196991404,
+ "learning_rate": 4.954017790759666e-06,
+ "loss": 0.6192,
+ "step": 303
+ },
+ {
+ "epoch": 0.2665643667452184,
+ "grad_norm": 0.30608255552211977,
+ "learning_rate": 4.953565059335104e-06,
+ "loss": 0.6244,
+ "step": 304
+ },
+ {
+ "epoch": 0.26744122321477504,
+ "grad_norm": 0.31501722301988566,
+ "learning_rate": 4.953110130955733e-06,
+ "loss": 0.6236,
+ "step": 305
+ },
+ {
+ "epoch": 0.2683180796843317,
+ "grad_norm": 0.2978345978834651,
+ "learning_rate": 4.9526530060289e-06,
+ "loss": 0.6254,
+ "step": 306
+ },
+ {
+ "epoch": 0.2691949361538883,
+ "grad_norm": 0.2935986604058687,
+ "learning_rate": 4.952193684963922e-06,
+ "loss": 0.6113,
+ "step": 307
+ },
+ {
+ "epoch": 0.27007179262344494,
+ "grad_norm": 0.294670736028252,
+ "learning_rate": 4.95173216817208e-06,
+ "loss": 0.6335,
+ "step": 308
+ },
+ {
+ "epoch": 0.2709486490930016,
+ "grad_norm": 0.2746280487759909,
+ "learning_rate": 4.951268456066623e-06,
+ "loss": 0.6211,
+ "step": 309
+ },
+ {
+ "epoch": 0.27182550556255825,
+ "grad_norm": 0.2823209312944346,
+ "learning_rate": 4.950802549062764e-06,
+ "loss": 0.621,
+ "step": 310
+ },
+ {
+ "epoch": 0.27270236203211484,
+ "grad_norm": 0.2811005060766513,
+ "learning_rate": 4.950334447577685e-06,
+ "loss": 0.6291,
+ "step": 311
+ },
+ {
+ "epoch": 0.2735792185016715,
+ "grad_norm": 0.31377780747479117,
+ "learning_rate": 4.9498641520305264e-06,
+ "loss": 0.6308,
+ "step": 312
+ },
+ {
+ "epoch": 0.27445607497122815,
+ "grad_norm": 0.263859895152384,
+ "learning_rate": 4.949391662842401e-06,
+ "loss": 0.6238,
+ "step": 313
+ },
+ {
+ "epoch": 0.2753329314407848,
+ "grad_norm": 0.3124591272767995,
+ "learning_rate": 4.948916980436379e-06,
+ "loss": 0.6254,
+ "step": 314
+ },
+ {
+ "epoch": 0.27620978791034145,
+ "grad_norm": 0.2762091249470148,
+ "learning_rate": 4.948440105237499e-06,
+ "loss": 0.6297,
+ "step": 315
+ },
+ {
+ "epoch": 0.27708664437989805,
+ "grad_norm": 0.30510467983773004,
+ "learning_rate": 4.947961037672761e-06,
+ "loss": 0.6301,
+ "step": 316
+ },
+ {
+ "epoch": 0.2779635008494547,
+ "grad_norm": 0.2894218681866538,
+ "learning_rate": 4.947479778171127e-06,
+ "loss": 0.6215,
+ "step": 317
+ },
+ {
+ "epoch": 0.27884035731901136,
+ "grad_norm": 0.278604444379188,
+ "learning_rate": 4.946996327163526e-06,
+ "loss": 0.6193,
+ "step": 318
+ },
+ {
+ "epoch": 0.279717213788568,
+ "grad_norm": 0.29226196825962947,
+ "learning_rate": 4.946510685082844e-06,
+ "loss": 0.6205,
+ "step": 319
+ },
+ {
+ "epoch": 0.2805940702581246,
+ "grad_norm": 0.2956824922950759,
+ "learning_rate": 4.946022852363932e-06,
+ "loss": 0.6238,
+ "step": 320
+ },
+ {
+ "epoch": 0.28147092672768126,
+ "grad_norm": 0.28796938907697983,
+ "learning_rate": 4.945532829443604e-06,
+ "loss": 0.6176,
+ "step": 321
+ },
+ {
+ "epoch": 0.2823477831972379,
+ "grad_norm": 0.2688847498978228,
+ "learning_rate": 4.945040616760629e-06,
+ "loss": 0.6178,
+ "step": 322
+ },
+ {
+ "epoch": 0.28322463966679456,
+ "grad_norm": 0.3167327299209847,
+ "learning_rate": 4.944546214755744e-06,
+ "loss": 0.6315,
+ "step": 323
+ },
+ {
+ "epoch": 0.28410149613635116,
+ "grad_norm": 0.28346482132020456,
+ "learning_rate": 4.9440496238716415e-06,
+ "loss": 0.6281,
+ "step": 324
+ },
+ {
+ "epoch": 0.2849783526059078,
+ "grad_norm": 0.2862108698161924,
+ "learning_rate": 4.943550844552978e-06,
+ "loss": 0.6445,
+ "step": 325
+ },
+ {
+ "epoch": 0.28585520907546447,
+ "grad_norm": 0.3168994194030117,
+ "learning_rate": 4.943049877246363e-06,
+ "loss": 0.6336,
+ "step": 326
+ },
+ {
+ "epoch": 0.2867320655450211,
+ "grad_norm": 0.3098419113094991,
+ "learning_rate": 4.942546722400373e-06,
+ "loss": 0.6194,
+ "step": 327
+ },
+ {
+ "epoch": 0.2876089220145777,
+ "grad_norm": 0.3076330226750193,
+ "learning_rate": 4.942041380465539e-06,
+ "loss": 0.6332,
+ "step": 328
+ },
+ {
+ "epoch": 0.28848577848413437,
+ "grad_norm": 0.3073675940253473,
+ "learning_rate": 4.941533851894349e-06,
+ "loss": 0.6329,
+ "step": 329
+ },
+ {
+ "epoch": 0.289362634953691,
+ "grad_norm": 0.27407015238515836,
+ "learning_rate": 4.9410241371412525e-06,
+ "loss": 0.6292,
+ "step": 330
+ },
+ {
+ "epoch": 0.2902394914232477,
+ "grad_norm": 0.3233677059379673,
+ "learning_rate": 4.9405122366626545e-06,
+ "loss": 0.6407,
+ "step": 331
+ },
+ {
+ "epoch": 0.2911163478928043,
+ "grad_norm": 0.3056326849325438,
+ "learning_rate": 4.939998150916917e-06,
+ "loss": 0.6314,
+ "step": 332
+ },
+ {
+ "epoch": 0.2919932043623609,
+ "grad_norm": 0.3140138519054107,
+ "learning_rate": 4.93948188036436e-06,
+ "loss": 0.6583,
+ "step": 333
+ },
+ {
+ "epoch": 0.2928700608319176,
+ "grad_norm": 0.2967689552064628,
+ "learning_rate": 4.938963425467258e-06,
+ "loss": 0.6349,
+ "step": 334
+ },
+ {
+ "epoch": 0.29374691730147423,
+ "grad_norm": 0.35320572702474673,
+ "learning_rate": 4.938442786689843e-06,
+ "loss": 0.6248,
+ "step": 335
+ },
+ {
+ "epoch": 0.29462377377103083,
+ "grad_norm": 0.2958836632865014,
+ "learning_rate": 4.9379199644983025e-06,
+ "loss": 0.6255,
+ "step": 336
+ },
+ {
+ "epoch": 0.2955006302405875,
+ "grad_norm": 0.3054952399371344,
+ "learning_rate": 4.937394959360777e-06,
+ "loss": 0.6119,
+ "step": 337
+ },
+ {
+ "epoch": 0.29637748671014413,
+ "grad_norm": 0.34308383177638463,
+ "learning_rate": 4.9368677717473645e-06,
+ "loss": 0.6468,
+ "step": 338
+ },
+ {
+ "epoch": 0.2972543431797008,
+ "grad_norm": 0.2648620374237178,
+ "learning_rate": 4.936338402130115e-06,
+ "loss": 0.6203,
+ "step": 339
+ },
+ {
+ "epoch": 0.29813119964925744,
+ "grad_norm": 0.2976099930186866,
+ "learning_rate": 4.935806850983034e-06,
+ "loss": 0.6348,
+ "step": 340
+ },
+ {
+ "epoch": 0.29900805611881404,
+ "grad_norm": 0.285144357181017,
+ "learning_rate": 4.935273118782078e-06,
+ "loss": 0.6115,
+ "step": 341
+ },
+ {
+ "epoch": 0.2998849125883707,
+ "grad_norm": 0.3079688238524965,
+ "learning_rate": 4.934737206005159e-06,
+ "loss": 0.6254,
+ "step": 342
+ },
+ {
+ "epoch": 0.30076176905792734,
+ "grad_norm": 0.27719094781494596,
+ "learning_rate": 4.93419911313214e-06,
+ "loss": 0.6386,
+ "step": 343
+ },
+ {
+ "epoch": 0.301638625527484,
+ "grad_norm": 0.29796636665366355,
+ "learning_rate": 4.933658840644837e-06,
+ "loss": 0.6268,
+ "step": 344
+ },
+ {
+ "epoch": 0.3025154819970406,
+ "grad_norm": 0.27509893042636935,
+ "learning_rate": 4.933116389027017e-06,
+ "loss": 0.621,
+ "step": 345
+ },
+ {
+ "epoch": 0.30339233846659724,
+ "grad_norm": 0.31224342373584874,
+ "learning_rate": 4.932571758764398e-06,
+ "loss": 0.6312,
+ "step": 346
+ },
+ {
+ "epoch": 0.3042691949361539,
+ "grad_norm": 0.2689144896057607,
+ "learning_rate": 4.93202495034465e-06,
+ "loss": 0.6115,
+ "step": 347
+ },
+ {
+ "epoch": 0.30514605140571055,
+ "grad_norm": 0.2558266510993566,
+ "learning_rate": 4.931475964257391e-06,
+ "loss": 0.6245,
+ "step": 348
+ },
+ {
+ "epoch": 0.30602290787526715,
+ "grad_norm": 0.25500762407211314,
+ "learning_rate": 4.930924800994192e-06,
+ "loss": 0.6091,
+ "step": 349
+ },
+ {
+ "epoch": 0.3068997643448238,
+ "grad_norm": 0.2717131638453367,
+ "learning_rate": 4.9303714610485705e-06,
+ "loss": 0.6281,
+ "step": 350
+ },
+ {
+ "epoch": 0.30777662081438045,
+ "grad_norm": 0.2729400616989181,
+ "learning_rate": 4.929815944915997e-06,
+ "loss": 0.6083,
+ "step": 351
+ },
+ {
+ "epoch": 0.3086534772839371,
+ "grad_norm": 0.26000631857019024,
+ "learning_rate": 4.929258253093885e-06,
+ "loss": 0.6198,
+ "step": 352
+ },
+ {
+ "epoch": 0.3095303337534937,
+ "grad_norm": 0.2740884453189882,
+ "learning_rate": 4.9286983860816e-06,
+ "loss": 0.6338,
+ "step": 353
+ },
+ {
+ "epoch": 0.31040719022305036,
+ "grad_norm": 0.27150990388252366,
+ "learning_rate": 4.928136344380457e-06,
+ "loss": 0.6162,
+ "step": 354
+ },
+ {
+ "epoch": 0.311284046692607,
+ "grad_norm": 0.26286571771385,
+ "learning_rate": 4.9275721284937115e-06,
+ "loss": 0.629,
+ "step": 355
+ },
+ {
+ "epoch": 0.31216090316216366,
+ "grad_norm": 0.27510252961865267,
+ "learning_rate": 4.9270057389265734e-06,
+ "loss": 0.633,
+ "step": 356
+ },
+ {
+ "epoch": 0.31303775963172026,
+ "grad_norm": 0.2825214790660817,
+ "learning_rate": 4.926437176186193e-06,
+ "loss": 0.6263,
+ "step": 357
+ },
+ {
+ "epoch": 0.3139146161012769,
+ "grad_norm": 0.29292375908331497,
+ "learning_rate": 4.92586644078167e-06,
+ "loss": 0.6313,
+ "step": 358
+ },
+ {
+ "epoch": 0.31479147257083356,
+ "grad_norm": 0.2760563004495057,
+ "learning_rate": 4.925293533224049e-06,
+ "loss": 0.6174,
+ "step": 359
+ },
+ {
+ "epoch": 0.3156683290403902,
+ "grad_norm": 0.29078508943452525,
+ "learning_rate": 4.924718454026318e-06,
+ "loss": 0.6156,
+ "step": 360
+ },
+ {
+ "epoch": 0.3165451855099468,
+ "grad_norm": 0.2878769173523044,
+ "learning_rate": 4.924141203703412e-06,
+ "loss": 0.6047,
+ "step": 361
+ },
+ {
+ "epoch": 0.31742204197950347,
+ "grad_norm": 0.27485843884417593,
+ "learning_rate": 4.923561782772206e-06,
+ "loss": 0.6293,
+ "step": 362
+ },
+ {
+ "epoch": 0.3182988984490601,
+ "grad_norm": 0.2865164028316351,
+ "learning_rate": 4.922980191751524e-06,
+ "loss": 0.6269,
+ "step": 363
+ },
+ {
+ "epoch": 0.31917575491861677,
+ "grad_norm": 0.27991173694279825,
+ "learning_rate": 4.922396431162129e-06,
+ "loss": 0.6143,
+ "step": 364
+ },
+ {
+ "epoch": 0.3200526113881734,
+ "grad_norm": 0.279639353480309,
+ "learning_rate": 4.921810501526728e-06,
+ "loss": 0.635,
+ "step": 365
+ },
+ {
+ "epoch": 0.32092946785773,
+ "grad_norm": 0.2830142803081013,
+ "learning_rate": 4.921222403369971e-06,
+ "loss": 0.6157,
+ "step": 366
+ },
+ {
+ "epoch": 0.3218063243272867,
+ "grad_norm": 0.2684155306717856,
+ "learning_rate": 4.920632137218447e-06,
+ "loss": 0.6294,
+ "step": 367
+ },
+ {
+ "epoch": 0.3226831807968433,
+ "grad_norm": 0.2983455576981931,
+ "learning_rate": 4.920039703600691e-06,
+ "loss": 0.624,
+ "step": 368
+ },
+ {
+ "epoch": 0.3235600372664,
+ "grad_norm": 0.2948947231333358,
+ "learning_rate": 4.9194451030471735e-06,
+ "loss": 0.6102,
+ "step": 369
+ },
+ {
+ "epoch": 0.3244368937359566,
+ "grad_norm": 0.2826890911442374,
+ "learning_rate": 4.918848336090309e-06,
+ "loss": 0.6236,
+ "step": 370
+ },
+ {
+ "epoch": 0.32531375020551323,
+ "grad_norm": 0.32269493597939386,
+ "learning_rate": 4.91824940326445e-06,
+ "loss": 0.6139,
+ "step": 371
+ },
+ {
+ "epoch": 0.3261906066750699,
+ "grad_norm": 0.2734983777513044,
+ "learning_rate": 4.91764830510589e-06,
+ "loss": 0.6166,
+ "step": 372
+ },
+ {
+ "epoch": 0.32706746314462654,
+ "grad_norm": 0.36983262498880637,
+ "learning_rate": 4.917045042152858e-06,
+ "loss": 0.6186,
+ "step": 373
+ },
+ {
+ "epoch": 0.32794431961418313,
+ "grad_norm": 0.2751996219950251,
+ "learning_rate": 4.916439614945527e-06,
+ "loss": 0.6412,
+ "step": 374
+ },
+ {
+ "epoch": 0.3288211760837398,
+ "grad_norm": 0.319865198714037,
+ "learning_rate": 4.915832024026002e-06,
+ "loss": 0.627,
+ "step": 375
+ },
+ {
+ "epoch": 0.32969803255329644,
+ "grad_norm": 0.29823421688781576,
+ "learning_rate": 4.915222269938328e-06,
+ "loss": 0.6181,
+ "step": 376
+ },
+ {
+ "epoch": 0.3305748890228531,
+ "grad_norm": 0.27335542421500575,
+ "learning_rate": 4.914610353228488e-06,
+ "loss": 0.6202,
+ "step": 377
+ },
+ {
+ "epoch": 0.3314517454924097,
+ "grad_norm": 0.3824213724235341,
+ "learning_rate": 4.913996274444401e-06,
+ "loss": 0.608,
+ "step": 378
+ },
+ {
+ "epoch": 0.33232860196196634,
+ "grad_norm": 0.3269271239671324,
+ "learning_rate": 4.913380034135919e-06,
+ "loss": 0.6229,
+ "step": 379
+ },
+ {
+ "epoch": 0.333205458431523,
+ "grad_norm": 0.2832871290462529,
+ "learning_rate": 4.912761632854834e-06,
+ "loss": 0.618,
+ "step": 380
+ },
+ {
+ "epoch": 0.33408231490107965,
+ "grad_norm": 0.329936751234759,
+ "learning_rate": 4.912141071154869e-06,
+ "loss": 0.6231,
+ "step": 381
+ },
+ {
+ "epoch": 0.33495917137063624,
+ "grad_norm": 0.2752693680315103,
+ "learning_rate": 4.911518349591685e-06,
+ "loss": 0.6234,
+ "step": 382
+ },
+ {
+ "epoch": 0.3358360278401929,
+ "grad_norm": 0.3136704903953731,
+ "learning_rate": 4.9108934687228735e-06,
+ "loss": 0.6248,
+ "step": 383
+ },
+ {
+ "epoch": 0.33671288430974955,
+ "grad_norm": 0.2947450161853734,
+ "learning_rate": 4.910266429107962e-06,
+ "loss": 0.6291,
+ "step": 384
+ },
+ {
+ "epoch": 0.3375897407793062,
+ "grad_norm": 0.27963622109645897,
+ "learning_rate": 4.90963723130841e-06,
+ "loss": 0.6168,
+ "step": 385
+ },
+ {
+ "epoch": 0.3384665972488628,
+ "grad_norm": 0.2755048673546131,
+ "learning_rate": 4.90900587588761e-06,
+ "loss": 0.6022,
+ "step": 386
+ },
+ {
+ "epoch": 0.33934345371841945,
+ "grad_norm": 0.28857281828902753,
+ "learning_rate": 4.908372363410886e-06,
+ "loss": 0.6254,
+ "step": 387
+ },
+ {
+ "epoch": 0.3402203101879761,
+ "grad_norm": 0.28648556573019374,
+ "learning_rate": 4.907736694445492e-06,
+ "loss": 0.6175,
+ "step": 388
+ },
+ {
+ "epoch": 0.34109716665753276,
+ "grad_norm": 0.26925532018377424,
+ "learning_rate": 4.9070988695606156e-06,
+ "loss": 0.6176,
+ "step": 389
+ },
+ {
+ "epoch": 0.3419740231270894,
+ "grad_norm": 0.2832182299890066,
+ "learning_rate": 4.906458889327375e-06,
+ "loss": 0.6291,
+ "step": 390
+ },
+ {
+ "epoch": 0.342850879596646,
+ "grad_norm": 0.24545023229724808,
+ "learning_rate": 4.905816754318815e-06,
+ "loss": 0.621,
+ "step": 391
+ },
+ {
+ "epoch": 0.34372773606620266,
+ "grad_norm": 0.27071805276574584,
+ "learning_rate": 4.905172465109912e-06,
+ "loss": 0.6235,
+ "step": 392
+ },
+ {
+ "epoch": 0.3446045925357593,
+ "grad_norm": 0.2686211222363871,
+ "learning_rate": 4.904526022277572e-06,
+ "loss": 0.6259,
+ "step": 393
+ },
+ {
+ "epoch": 0.34548144900531597,
+ "grad_norm": 0.2788582786567745,
+ "learning_rate": 4.903877426400629e-06,
+ "loss": 0.6113,
+ "step": 394
+ },
+ {
+ "epoch": 0.34635830547487256,
+ "grad_norm": 0.2882303517807228,
+ "learning_rate": 4.903226678059842e-06,
+ "loss": 0.6325,
+ "step": 395
+ },
+ {
+ "epoch": 0.3472351619444292,
+ "grad_norm": 0.26417391198725343,
+ "learning_rate": 4.902573777837902e-06,
+ "loss": 0.6171,
+ "step": 396
+ },
+ {
+ "epoch": 0.34811201841398587,
+ "grad_norm": 0.27931172516771346,
+ "learning_rate": 4.901918726319424e-06,
+ "loss": 0.6041,
+ "step": 397
+ },
+ {
+ "epoch": 0.3489888748835425,
+ "grad_norm": 0.24713049818043734,
+ "learning_rate": 4.901261524090949e-06,
+ "loss": 0.6099,
+ "step": 398
+ },
+ {
+ "epoch": 0.3498657313530991,
+ "grad_norm": 0.29086241382146505,
+ "learning_rate": 4.900602171740946e-06,
+ "loss": 0.6258,
+ "step": 399
+ },
+ {
+ "epoch": 0.35074258782265577,
+ "grad_norm": 0.26291418203363,
+ "learning_rate": 4.899940669859807e-06,
+ "loss": 0.6117,
+ "step": 400
+ },
+ {
+ "epoch": 0.3516194442922124,
+ "grad_norm": 0.3216617316096804,
+ "learning_rate": 4.89927701903985e-06,
+ "loss": 0.6187,
+ "step": 401
+ },
+ {
+ "epoch": 0.3524963007617691,
+ "grad_norm": 0.27295463776878537,
+ "learning_rate": 4.898611219875316e-06,
+ "loss": 0.6132,
+ "step": 402
+ },
+ {
+ "epoch": 0.3533731572313257,
+ "grad_norm": 0.2853334578601736,
+ "learning_rate": 4.897943272962372e-06,
+ "loss": 0.6148,
+ "step": 403
+ },
+ {
+ "epoch": 0.3542500137008823,
+ "grad_norm": 0.31932832747253076,
+ "learning_rate": 4.897273178899105e-06,
+ "loss": 0.6187,
+ "step": 404
+ },
+ {
+ "epoch": 0.355126870170439,
+ "grad_norm": 0.28031643219296354,
+ "learning_rate": 4.896600938285526e-06,
+ "loss": 0.6236,
+ "step": 405
+ },
+ {
+ "epoch": 0.35600372663999563,
+ "grad_norm": 0.26831626886851945,
+ "learning_rate": 4.89592655172357e-06,
+ "loss": 0.6102,
+ "step": 406
+ },
+ {
+ "epoch": 0.35688058310955223,
+ "grad_norm": 0.2951228212133584,
+ "learning_rate": 4.895250019817089e-06,
+ "loss": 0.6164,
+ "step": 407
+ },
+ {
+ "epoch": 0.3577574395791089,
+ "grad_norm": 0.27330142007513136,
+ "learning_rate": 4.894571343171862e-06,
+ "loss": 0.6023,
+ "step": 408
+ },
+ {
+ "epoch": 0.35863429604866554,
+ "grad_norm": 0.3204620119402923,
+ "learning_rate": 4.893890522395582e-06,
+ "loss": 0.62,
+ "step": 409
+ },
+ {
+ "epoch": 0.3595111525182222,
+ "grad_norm": 0.261478566125417,
+ "learning_rate": 4.893207558097867e-06,
+ "loss": 0.6294,
+ "step": 410
+ },
+ {
+ "epoch": 0.36038800898777884,
+ "grad_norm": 0.250895473885103,
+ "learning_rate": 4.892522450890251e-06,
+ "loss": 0.6152,
+ "step": 411
+ },
+ {
+ "epoch": 0.36126486545733544,
+ "grad_norm": 0.2634865561040139,
+ "learning_rate": 4.89183520138619e-06,
+ "loss": 0.6157,
+ "step": 412
+ },
+ {
+ "epoch": 0.3621417219268921,
+ "grad_norm": 0.26459491662331874,
+ "learning_rate": 4.891145810201054e-06,
+ "loss": 0.609,
+ "step": 413
+ },
+ {
+ "epoch": 0.36301857839644874,
+ "grad_norm": 0.24301745655990745,
+ "learning_rate": 4.8904542779521346e-06,
+ "loss": 0.6082,
+ "step": 414
+ },
+ {
+ "epoch": 0.3638954348660054,
+ "grad_norm": 0.2692643109083729,
+ "learning_rate": 4.8897606052586384e-06,
+ "loss": 0.6226,
+ "step": 415
+ },
+ {
+ "epoch": 0.364772291335562,
+ "grad_norm": 0.24024671108707563,
+ "learning_rate": 4.889064792741689e-06,
+ "loss": 0.6153,
+ "step": 416
+ },
+ {
+ "epoch": 0.36564914780511865,
+ "grad_norm": 0.273288282597359,
+ "learning_rate": 4.888366841024327e-06,
+ "loss": 0.6334,
+ "step": 417
+ },
+ {
+ "epoch": 0.3665260042746753,
+ "grad_norm": 0.2713735341001686,
+ "learning_rate": 4.887666750731507e-06,
+ "loss": 0.6204,
+ "step": 418
+ },
+ {
+ "epoch": 0.36740286074423195,
+ "grad_norm": 0.2749014394381958,
+ "learning_rate": 4.8869645224901e-06,
+ "loss": 0.6017,
+ "step": 419
+ },
+ {
+ "epoch": 0.36827971721378855,
+ "grad_norm": 0.27621114898765087,
+ "learning_rate": 4.8862601569288885e-06,
+ "loss": 0.6193,
+ "step": 420
+ },
+ {
+ "epoch": 0.3691565736833452,
+ "grad_norm": 0.25931507650511326,
+ "learning_rate": 4.885553654678573e-06,
+ "loss": 0.6233,
+ "step": 421
+ },
+ {
+ "epoch": 0.37003343015290185,
+ "grad_norm": 0.28686169175433923,
+ "learning_rate": 4.884845016371763e-06,
+ "loss": 0.6197,
+ "step": 422
+ },
+ {
+ "epoch": 0.3709102866224585,
+ "grad_norm": 0.27025382919889446,
+ "learning_rate": 4.884134242642985e-06,
+ "loss": 0.6033,
+ "step": 423
+ },
+ {
+ "epoch": 0.3717871430920151,
+ "grad_norm": 0.275669477293775,
+ "learning_rate": 4.883421334128674e-06,
+ "loss": 0.6172,
+ "step": 424
+ },
+ {
+ "epoch": 0.37266399956157176,
+ "grad_norm": 0.26014021950194516,
+ "learning_rate": 4.8827062914671775e-06,
+ "loss": 0.6207,
+ "step": 425
+ },
+ {
+ "epoch": 0.3735408560311284,
+ "grad_norm": 0.2986829920255015,
+ "learning_rate": 4.881989115298755e-06,
+ "loss": 0.6034,
+ "step": 426
+ },
+ {
+ "epoch": 0.37441771250068506,
+ "grad_norm": 0.28151692244357057,
+ "learning_rate": 4.881269806265575e-06,
+ "loss": 0.6133,
+ "step": 427
+ },
+ {
+ "epoch": 0.37529456897024166,
+ "grad_norm": 0.2932206682237993,
+ "learning_rate": 4.8805483650117154e-06,
+ "loss": 0.6132,
+ "step": 428
+ },
+ {
+ "epoch": 0.3761714254397983,
+ "grad_norm": 0.3164265338412961,
+ "learning_rate": 4.879824792183166e-06,
+ "loss": 0.6077,
+ "step": 429
+ },
+ {
+ "epoch": 0.37704828190935497,
+ "grad_norm": 0.3636164115457003,
+ "learning_rate": 4.879099088427824e-06,
+ "loss": 0.6179,
+ "step": 430
+ },
+ {
+ "epoch": 0.3779251383789116,
+ "grad_norm": 0.2891875334309757,
+ "learning_rate": 4.878371254395492e-06,
+ "loss": 0.6197,
+ "step": 431
+ },
+ {
+ "epoch": 0.3788019948484682,
+ "grad_norm": 0.3816104662619605,
+ "learning_rate": 4.8776412907378845e-06,
+ "loss": 0.6197,
+ "step": 432
+ },
+ {
+ "epoch": 0.37967885131802487,
+ "grad_norm": 0.29131497715708005,
+ "learning_rate": 4.876909198108619e-06,
+ "loss": 0.6159,
+ "step": 433
+ },
+ {
+ "epoch": 0.3805557077875815,
+ "grad_norm": 0.3138520265609416,
+ "learning_rate": 4.876174977163222e-06,
+ "loss": 0.6139,
+ "step": 434
+ },
+ {
+ "epoch": 0.3814325642571382,
+ "grad_norm": 0.28035852092093033,
+ "learning_rate": 4.875438628559124e-06,
+ "loss": 0.6183,
+ "step": 435
+ },
+ {
+ "epoch": 0.3823094207266948,
+ "grad_norm": 0.3120106817898386,
+ "learning_rate": 4.874700152955661e-06,
+ "loss": 0.6052,
+ "step": 436
+ },
+ {
+ "epoch": 0.3831862771962514,
+ "grad_norm": 0.29139666929908226,
+ "learning_rate": 4.873959551014075e-06,
+ "loss": 0.6058,
+ "step": 437
+ },
+ {
+ "epoch": 0.3840631336658081,
+ "grad_norm": 0.31305383154436955,
+ "learning_rate": 4.873216823397511e-06,
+ "loss": 0.6094,
+ "step": 438
+ },
+ {
+ "epoch": 0.38493999013536473,
+ "grad_norm": 0.3052879988977325,
+ "learning_rate": 4.872471970771015e-06,
+ "loss": 0.6063,
+ "step": 439
+ },
+ {
+ "epoch": 0.3858168466049214,
+ "grad_norm": 0.2965934350138861,
+ "learning_rate": 4.871724993801541e-06,
+ "loss": 0.6054,
+ "step": 440
+ },
+ {
+ "epoch": 0.386693703074478,
+ "grad_norm": 0.26339362714008424,
+ "learning_rate": 4.870975893157941e-06,
+ "loss": 0.6152,
+ "step": 441
+ },
+ {
+ "epoch": 0.38757055954403463,
+ "grad_norm": 0.27556079714679943,
+ "learning_rate": 4.870224669510968e-06,
+ "loss": 0.6158,
+ "step": 442
+ },
+ {
+ "epoch": 0.3884474160135913,
+ "grad_norm": 0.29125701036171053,
+ "learning_rate": 4.86947132353328e-06,
+ "loss": 0.6202,
+ "step": 443
+ },
+ {
+ "epoch": 0.38932427248314794,
+ "grad_norm": 0.2966406156980298,
+ "learning_rate": 4.868715855899432e-06,
+ "loss": 0.6265,
+ "step": 444
+ },
+ {
+ "epoch": 0.39020112895270453,
+ "grad_norm": 0.27733217518457043,
+ "learning_rate": 4.867958267285879e-06,
+ "loss": 0.6068,
+ "step": 445
+ },
+ {
+ "epoch": 0.3910779854222612,
+ "grad_norm": 0.2919788828093281,
+ "learning_rate": 4.8671985583709765e-06,
+ "loss": 0.6208,
+ "step": 446
+ },
+ {
+ "epoch": 0.39195484189181784,
+ "grad_norm": 0.29327731039840055,
+ "learning_rate": 4.866436729834979e-06,
+ "loss": 0.6175,
+ "step": 447
+ },
+ {
+ "epoch": 0.3928316983613745,
+ "grad_norm": 0.2568832744529454,
+ "learning_rate": 4.865672782360037e-06,
+ "loss": 0.6177,
+ "step": 448
+ },
+ {
+ "epoch": 0.3937085548309311,
+ "grad_norm": 0.283654204460893,
+ "learning_rate": 4.8649067166301985e-06,
+ "loss": 0.6203,
+ "step": 449
+ },
+ {
+ "epoch": 0.39458541130048774,
+ "grad_norm": 0.26828805221375346,
+ "learning_rate": 4.864138533331411e-06,
+ "loss": 0.6118,
+ "step": 450
+ },
+ {
+ "epoch": 0.3954622677700444,
+ "grad_norm": 0.2597158618871073,
+ "learning_rate": 4.863368233151514e-06,
+ "loss": 0.6169,
+ "step": 451
+ },
+ {
+ "epoch": 0.39633912423960105,
+ "grad_norm": 0.28436035142498156,
+ "learning_rate": 4.862595816780246e-06,
+ "loss": 0.632,
+ "step": 452
+ },
+ {
+ "epoch": 0.39721598070915765,
+ "grad_norm": 0.2652505819829089,
+ "learning_rate": 4.861821284909238e-06,
+ "loss": 0.6289,
+ "step": 453
+ },
+ {
+ "epoch": 0.3980928371787143,
+ "grad_norm": 0.29252031992594624,
+ "learning_rate": 4.861044638232016e-06,
+ "loss": 0.6328,
+ "step": 454
+ },
+ {
+ "epoch": 0.39896969364827095,
+ "grad_norm": 0.2994469365008051,
+ "learning_rate": 4.860265877444001e-06,
+ "loss": 0.617,
+ "step": 455
+ },
+ {
+ "epoch": 0.3998465501178276,
+ "grad_norm": 0.2776900829822044,
+ "learning_rate": 4.8594850032425036e-06,
+ "loss": 0.608,
+ "step": 456
+ },
+ {
+ "epoch": 0.4007234065873842,
+ "grad_norm": 0.2753322141436327,
+ "learning_rate": 4.858702016326731e-06,
+ "loss": 0.607,
+ "step": 457
+ },
+ {
+ "epoch": 0.40160026305694085,
+ "grad_norm": 0.2738219915396828,
+ "learning_rate": 4.857916917397779e-06,
+ "loss": 0.6043,
+ "step": 458
+ },
+ {
+ "epoch": 0.4024771195264975,
+ "grad_norm": 0.27192665887665013,
+ "learning_rate": 4.857129707158637e-06,
+ "loss": 0.6376,
+ "step": 459
+ },
+ {
+ "epoch": 0.40335397599605416,
+ "grad_norm": 0.27689826150792163,
+ "learning_rate": 4.8563403863141825e-06,
+ "loss": 0.6172,
+ "step": 460
+ },
+ {
+ "epoch": 0.4042308324656108,
+ "grad_norm": 0.311644665297658,
+ "learning_rate": 4.855548955571183e-06,
+ "loss": 0.6106,
+ "step": 461
+ },
+ {
+ "epoch": 0.4051076889351674,
+ "grad_norm": 0.2912453467934098,
+ "learning_rate": 4.854755415638298e-06,
+ "loss": 0.6129,
+ "step": 462
+ },
+ {
+ "epoch": 0.40598454540472406,
+ "grad_norm": 0.302939167109194,
+ "learning_rate": 4.853959767226072e-06,
+ "loss": 0.6301,
+ "step": 463
+ },
+ {
+ "epoch": 0.4068614018742807,
+ "grad_norm": 0.261297831693092,
+ "learning_rate": 4.85316201104694e-06,
+ "loss": 0.6136,
+ "step": 464
+ },
+ {
+ "epoch": 0.40773825834383737,
+ "grad_norm": 0.3154856081824323,
+ "learning_rate": 4.852362147815225e-06,
+ "loss": 0.6171,
+ "step": 465
+ },
+ {
+ "epoch": 0.40861511481339396,
+ "grad_norm": 0.29411022742744497,
+ "learning_rate": 4.8515601782471325e-06,
+ "loss": 0.6085,
+ "step": 466
+ },
+ {
+ "epoch": 0.4094919712829506,
+ "grad_norm": 0.3027595832299397,
+ "learning_rate": 4.8507561030607576e-06,
+ "loss": 0.6151,
+ "step": 467
+ },
+ {
+ "epoch": 0.41036882775250727,
+ "grad_norm": 0.3003092813187261,
+ "learning_rate": 4.84994992297608e-06,
+ "loss": 0.6071,
+ "step": 468
+ },
+ {
+ "epoch": 0.4112456842220639,
+ "grad_norm": 0.27374249219050456,
+ "learning_rate": 4.849141638714965e-06,
+ "loss": 0.6166,
+ "step": 469
+ },
+ {
+ "epoch": 0.4121225406916205,
+ "grad_norm": 0.3064667255626573,
+ "learning_rate": 4.84833125100116e-06,
+ "loss": 0.6024,
+ "step": 470
+ },
+ {
+ "epoch": 0.4129993971611772,
+ "grad_norm": 0.28188617697439766,
+ "learning_rate": 4.847518760560297e-06,
+ "loss": 0.6134,
+ "step": 471
+ },
+ {
+ "epoch": 0.4138762536307338,
+ "grad_norm": 0.27693005272362925,
+ "learning_rate": 4.846704168119892e-06,
+ "loss": 0.5984,
+ "step": 472
+ },
+ {
+ "epoch": 0.4147531101002905,
+ "grad_norm": 0.3011450154809493,
+ "learning_rate": 4.84588747440934e-06,
+ "loss": 0.5932,
+ "step": 473
+ },
+ {
+ "epoch": 0.4156299665698471,
+ "grad_norm": 0.25715138595393167,
+ "learning_rate": 4.845068680159921e-06,
+ "loss": 0.6101,
+ "step": 474
+ },
+ {
+ "epoch": 0.41650682303940373,
+ "grad_norm": 0.2963493163477849,
+ "learning_rate": 4.844247786104794e-06,
+ "loss": 0.6081,
+ "step": 475
+ },
+ {
+ "epoch": 0.4173836795089604,
+ "grad_norm": 0.29399759702492007,
+ "learning_rate": 4.8434247929789975e-06,
+ "loss": 0.6046,
+ "step": 476
+ },
+ {
+ "epoch": 0.41826053597851703,
+ "grad_norm": 0.3126535237916745,
+ "learning_rate": 4.842599701519451e-06,
+ "loss": 0.6304,
+ "step": 477
+ },
+ {
+ "epoch": 0.41913739244807363,
+ "grad_norm": 0.29299694878032745,
+ "learning_rate": 4.841772512464953e-06,
+ "loss": 0.6168,
+ "step": 478
+ },
+ {
+ "epoch": 0.4200142489176303,
+ "grad_norm": 0.289486342187316,
+ "learning_rate": 4.840943226556178e-06,
+ "loss": 0.6031,
+ "step": 479
+ },
+ {
+ "epoch": 0.42089110538718694,
+ "grad_norm": 0.30359254383613277,
+ "learning_rate": 4.840111844535682e-06,
+ "loss": 0.5994,
+ "step": 480
+ },
+ {
+ "epoch": 0.4217679618567436,
+ "grad_norm": 0.2641793447534652,
+ "learning_rate": 4.839278367147894e-06,
+ "loss": 0.6036,
+ "step": 481
+ },
+ {
+ "epoch": 0.4226448183263002,
+ "grad_norm": 0.29968320834098117,
+ "learning_rate": 4.838442795139121e-06,
+ "loss": 0.6193,
+ "step": 482
+ },
+ {
+ "epoch": 0.42352167479585684,
+ "grad_norm": 0.30614554761610074,
+ "learning_rate": 4.837605129257546e-06,
+ "loss": 0.6115,
+ "step": 483
+ },
+ {
+ "epoch": 0.4243985312654135,
+ "grad_norm": 0.29316129861054724,
+ "learning_rate": 4.836765370253223e-06,
+ "loss": 0.6039,
+ "step": 484
+ },
+ {
+ "epoch": 0.42527538773497015,
+ "grad_norm": 0.35388210389950725,
+ "learning_rate": 4.835923518878088e-06,
+ "loss": 0.6089,
+ "step": 485
+ },
+ {
+ "epoch": 0.4261522442045268,
+ "grad_norm": 0.27541931694811506,
+ "learning_rate": 4.835079575885944e-06,
+ "loss": 0.6129,
+ "step": 486
+ },
+ {
+ "epoch": 0.4270291006740834,
+ "grad_norm": 0.3408256598988536,
+ "learning_rate": 4.834233542032468e-06,
+ "loss": 0.6165,
+ "step": 487
+ },
+ {
+ "epoch": 0.42790595714364005,
+ "grad_norm": 0.30259946435062773,
+ "learning_rate": 4.83338541807521e-06,
+ "loss": 0.6111,
+ "step": 488
+ },
+ {
+ "epoch": 0.4287828136131967,
+ "grad_norm": 0.2871132966743198,
+ "learning_rate": 4.832535204773593e-06,
+ "loss": 0.6273,
+ "step": 489
+ },
+ {
+ "epoch": 0.42965967008275335,
+ "grad_norm": 0.3457337315321895,
+ "learning_rate": 4.8316829028889076e-06,
+ "loss": 0.6005,
+ "step": 490
+ },
+ {
+ "epoch": 0.43053652655230995,
+ "grad_norm": 0.2668696078107318,
+ "learning_rate": 4.830828513184317e-06,
+ "loss": 0.6122,
+ "step": 491
+ },
+ {
+ "epoch": 0.4314133830218666,
+ "grad_norm": 0.321068645111551,
+ "learning_rate": 4.829972036424854e-06,
+ "loss": 0.6058,
+ "step": 492
+ },
+ {
+ "epoch": 0.43229023949142326,
+ "grad_norm": 0.26125737492647644,
+ "learning_rate": 4.829113473377417e-06,
+ "loss": 0.6143,
+ "step": 493
+ },
+ {
+ "epoch": 0.4331670959609799,
+ "grad_norm": 0.32002755047063874,
+ "learning_rate": 4.828252824810777e-06,
+ "loss": 0.6061,
+ "step": 494
+ },
+ {
+ "epoch": 0.4340439524305365,
+ "grad_norm": 0.2863878470189295,
+ "learning_rate": 4.82739009149557e-06,
+ "loss": 0.5977,
+ "step": 495
+ },
+ {
+ "epoch": 0.43492080890009316,
+ "grad_norm": 0.31874371835878795,
+ "learning_rate": 4.826525274204297e-06,
+ "loss": 0.608,
+ "step": 496
+ },
+ {
+ "epoch": 0.4357976653696498,
+ "grad_norm": 0.2956391151217163,
+ "learning_rate": 4.825658373711328e-06,
+ "loss": 0.6107,
+ "step": 497
+ },
+ {
+ "epoch": 0.43667452183920646,
+ "grad_norm": 0.288406786632812,
+ "learning_rate": 4.824789390792899e-06,
+ "loss": 0.6094,
+ "step": 498
+ },
+ {
+ "epoch": 0.43755137830876306,
+ "grad_norm": 0.33737182032602686,
+ "learning_rate": 4.823918326227106e-06,
+ "loss": 0.5971,
+ "step": 499
+ },
+ {
+ "epoch": 0.4384282347783197,
+ "grad_norm": 0.25632117321609454,
+ "learning_rate": 4.823045180793914e-06,
+ "loss": 0.6044,
+ "step": 500
+ },
+ {
+ "epoch": 0.43930509124787637,
+ "grad_norm": 0.2978956835348055,
+ "learning_rate": 4.8221699552751465e-06,
+ "loss": 0.6009,
+ "step": 501
+ },
+ {
+ "epoch": 0.440181947717433,
+ "grad_norm": 0.30339339194561,
+ "learning_rate": 4.821292650454495e-06,
+ "loss": 0.6113,
+ "step": 502
+ },
+ {
+ "epoch": 0.4410588041869896,
+ "grad_norm": 0.3083549716587437,
+ "learning_rate": 4.8204132671175085e-06,
+ "loss": 0.6074,
+ "step": 503
+ },
+ {
+ "epoch": 0.44193566065654627,
+ "grad_norm": 0.291272682255802,
+ "learning_rate": 4.819531806051599e-06,
+ "loss": 0.606,
+ "step": 504
+ },
+ {
+ "epoch": 0.4428125171261029,
+ "grad_norm": 0.3183233272727026,
+ "learning_rate": 4.818648268046038e-06,
+ "loss": 0.6145,
+ "step": 505
+ },
+ {
+ "epoch": 0.4436893735956596,
+ "grad_norm": 0.27989457450916727,
+ "learning_rate": 4.817762653891957e-06,
+ "loss": 0.6095,
+ "step": 506
+ },
+ {
+ "epoch": 0.4445662300652162,
+ "grad_norm": 0.32106502207942483,
+ "learning_rate": 4.816874964382346e-06,
+ "loss": 0.6096,
+ "step": 507
+ },
+ {
+ "epoch": 0.4454430865347728,
+ "grad_norm": 0.2690675603747584,
+ "learning_rate": 4.815985200312057e-06,
+ "loss": 0.5986,
+ "step": 508
+ },
+ {
+ "epoch": 0.4463199430043295,
+ "grad_norm": 0.2818980909126885,
+ "learning_rate": 4.815093362477793e-06,
+ "loss": 0.6136,
+ "step": 509
+ },
+ {
+ "epoch": 0.44719679947388613,
+ "grad_norm": 0.29748447845455983,
+ "learning_rate": 4.8141994516781196e-06,
+ "loss": 0.6162,
+ "step": 510
+ },
+ {
+ "epoch": 0.4480736559434428,
+ "grad_norm": 0.3107094817046459,
+ "learning_rate": 4.813303468713456e-06,
+ "loss": 0.5939,
+ "step": 511
+ },
+ {
+ "epoch": 0.4489505124129994,
+ "grad_norm": 0.27493905192543294,
+ "learning_rate": 4.812405414386078e-06,
+ "loss": 0.6054,
+ "step": 512
+ },
+ {
+ "epoch": 0.44982736888255603,
+ "grad_norm": 0.28885594119974684,
+ "learning_rate": 4.811505289500113e-06,
+ "loss": 0.611,
+ "step": 513
+ },
+ {
+ "epoch": 0.4507042253521127,
+ "grad_norm": 0.2724458036095346,
+ "learning_rate": 4.810603094861548e-06,
+ "loss": 0.6296,
+ "step": 514
+ },
+ {
+ "epoch": 0.45158108182166934,
+ "grad_norm": 0.3171235548951884,
+ "learning_rate": 4.809698831278217e-06,
+ "loss": 0.6137,
+ "step": 515
+ },
+ {
+ "epoch": 0.45245793829122594,
+ "grad_norm": 0.2975607228468226,
+ "learning_rate": 4.808792499559812e-06,
+ "loss": 0.6081,
+ "step": 516
+ },
+ {
+ "epoch": 0.4533347947607826,
+ "grad_norm": 0.29553804453973653,
+ "learning_rate": 4.807884100517873e-06,
+ "loss": 0.6106,
+ "step": 517
+ },
+ {
+ "epoch": 0.45421165123033924,
+ "grad_norm": 0.29283068458115197,
+ "learning_rate": 4.8069736349657935e-06,
+ "loss": 0.6144,
+ "step": 518
+ },
+ {
+ "epoch": 0.4550885076998959,
+ "grad_norm": 0.3123674697628625,
+ "learning_rate": 4.806061103718816e-06,
+ "loss": 0.6024,
+ "step": 519
+ },
+ {
+ "epoch": 0.4559653641694525,
+ "grad_norm": 0.3185535504257689,
+ "learning_rate": 4.805146507594034e-06,
+ "loss": 0.6031,
+ "step": 520
+ },
+ {
+ "epoch": 0.45684222063900914,
+ "grad_norm": 0.32719458735857726,
+ "learning_rate": 4.804229847410388e-06,
+ "loss": 0.614,
+ "step": 521
+ },
+ {
+ "epoch": 0.4577190771085658,
+ "grad_norm": 0.2756686412179773,
+ "learning_rate": 4.803311123988668e-06,
+ "loss": 0.6143,
+ "step": 522
+ },
+ {
+ "epoch": 0.45859593357812245,
+ "grad_norm": 0.3193363571929515,
+ "learning_rate": 4.802390338151512e-06,
+ "loss": 0.5962,
+ "step": 523
+ },
+ {
+ "epoch": 0.45947279004767905,
+ "grad_norm": 0.27470129307670516,
+ "learning_rate": 4.801467490723402e-06,
+ "loss": 0.6118,
+ "step": 524
+ },
+ {
+ "epoch": 0.4603496465172357,
+ "grad_norm": 0.3268257836594815,
+ "learning_rate": 4.800542582530668e-06,
+ "loss": 0.6091,
+ "step": 525
+ },
+ {
+ "epoch": 0.46122650298679235,
+ "grad_norm": 0.2636715015821582,
+ "learning_rate": 4.799615614401488e-06,
+ "loss": 0.6113,
+ "step": 526
+ },
+ {
+ "epoch": 0.462103359456349,
+ "grad_norm": 0.3309929173426789,
+ "learning_rate": 4.79868658716588e-06,
+ "loss": 0.6063,
+ "step": 527
+ },
+ {
+ "epoch": 0.4629802159259056,
+ "grad_norm": 0.2705433155095911,
+ "learning_rate": 4.7977555016557054e-06,
+ "loss": 0.6115,
+ "step": 528
+ },
+ {
+ "epoch": 0.46385707239546226,
+ "grad_norm": 0.2986983107432822,
+ "learning_rate": 4.796822358704673e-06,
+ "loss": 0.624,
+ "step": 529
+ },
+ {
+ "epoch": 0.4647339288650189,
+ "grad_norm": 0.27153673858142124,
+ "learning_rate": 4.7958871591483305e-06,
+ "loss": 0.6144,
+ "step": 530
+ },
+ {
+ "epoch": 0.46561078533457556,
+ "grad_norm": 0.2774095045069063,
+ "learning_rate": 4.794949903824069e-06,
+ "loss": 0.6082,
+ "step": 531
+ },
+ {
+ "epoch": 0.4664876418041322,
+ "grad_norm": 0.28167525290961587,
+ "learning_rate": 4.794010593571118e-06,
+ "loss": 0.6106,
+ "step": 532
+ },
+ {
+ "epoch": 0.4673644982736888,
+ "grad_norm": 0.2626835693504621,
+ "learning_rate": 4.793069229230548e-06,
+ "loss": 0.6142,
+ "step": 533
+ },
+ {
+ "epoch": 0.46824135474324546,
+ "grad_norm": 0.27619948959341917,
+ "learning_rate": 4.792125811645271e-06,
+ "loss": 0.6073,
+ "step": 534
+ },
+ {
+ "epoch": 0.4691182112128021,
+ "grad_norm": 0.2913249262978291,
+ "learning_rate": 4.791180341660035e-06,
+ "loss": 0.6034,
+ "step": 535
+ },
+ {
+ "epoch": 0.46999506768235877,
+ "grad_norm": 0.2792318560656134,
+ "learning_rate": 4.790232820121426e-06,
+ "loss": 0.6002,
+ "step": 536
+ },
+ {
+ "epoch": 0.47087192415191537,
+ "grad_norm": 0.2690237732263836,
+ "learning_rate": 4.789283247877867e-06,
+ "loss": 0.6128,
+ "step": 537
+ },
+ {
+ "epoch": 0.471748780621472,
+ "grad_norm": 0.2875784864108413,
+ "learning_rate": 4.7883316257796195e-06,
+ "loss": 0.6125,
+ "step": 538
+ },
+ {
+ "epoch": 0.47262563709102867,
+ "grad_norm": 0.3494280106540881,
+ "learning_rate": 4.787377954678776e-06,
+ "loss": 0.6079,
+ "step": 539
+ },
+ {
+ "epoch": 0.4735024935605853,
+ "grad_norm": 0.27811345732659243,
+ "learning_rate": 4.786422235429269e-06,
+ "loss": 0.6118,
+ "step": 540
+ },
+ {
+ "epoch": 0.4743793500301419,
+ "grad_norm": 0.33921109846320074,
+ "learning_rate": 4.785464468886859e-06,
+ "loss": 0.6176,
+ "step": 541
+ },
+ {
+ "epoch": 0.4752562064996986,
+ "grad_norm": 0.29592545517880114,
+ "learning_rate": 4.784504655909146e-06,
+ "loss": 0.6131,
+ "step": 542
+ },
+ {
+ "epoch": 0.4761330629692552,
+ "grad_norm": 0.29373530511374163,
+ "learning_rate": 4.783542797355558e-06,
+ "loss": 0.6082,
+ "step": 543
+ },
+ {
+ "epoch": 0.4770099194388119,
+ "grad_norm": 0.2999691792256973,
+ "learning_rate": 4.782578894087357e-06,
+ "loss": 0.5981,
+ "step": 544
+ },
+ {
+ "epoch": 0.4778867759083685,
+ "grad_norm": 0.2694268894908227,
+ "learning_rate": 4.781612946967632e-06,
+ "loss": 0.6055,
+ "step": 545
+ },
+ {
+ "epoch": 0.47876363237792513,
+ "grad_norm": 0.2970836241532985,
+ "learning_rate": 4.780644956861307e-06,
+ "loss": 0.6002,
+ "step": 546
+ },
+ {
+ "epoch": 0.4796404888474818,
+ "grad_norm": 0.3413332201519291,
+ "learning_rate": 4.7796749246351335e-06,
+ "loss": 0.6103,
+ "step": 547
+ },
+ {
+ "epoch": 0.48051734531703844,
+ "grad_norm": 0.27732196553749033,
+ "learning_rate": 4.77870285115769e-06,
+ "loss": 0.5972,
+ "step": 548
+ },
+ {
+ "epoch": 0.48139420178659503,
+ "grad_norm": 0.32594912225980904,
+ "learning_rate": 4.777728737299387e-06,
+ "loss": 0.6275,
+ "step": 549
+ },
+ {
+ "epoch": 0.4822710582561517,
+ "grad_norm": 0.28158230943213153,
+ "learning_rate": 4.776752583932455e-06,
+ "loss": 0.6215,
+ "step": 550
+ },
+ {
+ "epoch": 0.48314791472570834,
+ "grad_norm": 0.3244722564822324,
+ "learning_rate": 4.775774391930956e-06,
+ "loss": 0.5947,
+ "step": 551
+ },
+ {
+ "epoch": 0.484024771195265,
+ "grad_norm": 0.26397208532030864,
+ "learning_rate": 4.774794162170777e-06,
+ "loss": 0.611,
+ "step": 552
+ },
+ {
+ "epoch": 0.4849016276648216,
+ "grad_norm": 0.2816890422555255,
+ "learning_rate": 4.773811895529629e-06,
+ "loss": 0.5942,
+ "step": 553
+ },
+ {
+ "epoch": 0.48577848413437824,
+ "grad_norm": 0.28224512879430635,
+ "learning_rate": 4.772827592887046e-06,
+ "loss": 0.5918,
+ "step": 554
+ },
+ {
+ "epoch": 0.4866553406039349,
+ "grad_norm": 0.2978578883597439,
+ "learning_rate": 4.771841255124385e-06,
+ "loss": 0.6031,
+ "step": 555
+ },
+ {
+ "epoch": 0.48753219707349155,
+ "grad_norm": 0.3212067488646109,
+ "learning_rate": 4.770852883124827e-06,
+ "loss": 0.6066,
+ "step": 556
+ },
+ {
+ "epoch": 0.4884090535430482,
+ "grad_norm": 0.3047898856904216,
+ "learning_rate": 4.769862477773374e-06,
+ "loss": 0.6097,
+ "step": 557
+ },
+ {
+ "epoch": 0.4892859100126048,
+ "grad_norm": 0.32816575436148626,
+ "learning_rate": 4.768870039956846e-06,
+ "loss": 0.6078,
+ "step": 558
+ },
+ {
+ "epoch": 0.49016276648216145,
+ "grad_norm": 0.30333447423661625,
+ "learning_rate": 4.767875570563887e-06,
+ "loss": 0.6103,
+ "step": 559
+ },
+ {
+ "epoch": 0.4910396229517181,
+ "grad_norm": 0.32463487013229164,
+ "learning_rate": 4.766879070484957e-06,
+ "loss": 0.5925,
+ "step": 560
+ },
+ {
+ "epoch": 0.49191647942127475,
+ "grad_norm": 0.27125555349656966,
+ "learning_rate": 4.765880540612336e-06,
+ "loss": 0.6095,
+ "step": 561
+ },
+ {
+ "epoch": 0.49279333589083135,
+ "grad_norm": 0.29571340419933284,
+ "learning_rate": 4.764879981840121e-06,
+ "loss": 0.6061,
+ "step": 562
+ },
+ {
+ "epoch": 0.493670192360388,
+ "grad_norm": 0.28779220439984465,
+ "learning_rate": 4.763877395064225e-06,
+ "loss": 0.6164,
+ "step": 563
+ },
+ {
+ "epoch": 0.49454704882994466,
+ "grad_norm": 0.3023002461106019,
+ "learning_rate": 4.762872781182378e-06,
+ "loss": 0.6099,
+ "step": 564
+ },
+ {
+ "epoch": 0.4954239052995013,
+ "grad_norm": 0.2852998688047179,
+ "learning_rate": 4.761866141094126e-06,
+ "loss": 0.6151,
+ "step": 565
+ },
+ {
+ "epoch": 0.4963007617690579,
+ "grad_norm": 0.27004415072990756,
+ "learning_rate": 4.7608574757008245e-06,
+ "loss": 0.6056,
+ "step": 566
+ },
+ {
+ "epoch": 0.49717761823861456,
+ "grad_norm": 0.26583697629837466,
+ "learning_rate": 4.759846785905649e-06,
+ "loss": 0.6073,
+ "step": 567
+ },
+ {
+ "epoch": 0.4980544747081712,
+ "grad_norm": 0.29963137609858226,
+ "learning_rate": 4.758834072613583e-06,
+ "loss": 0.6175,
+ "step": 568
+ },
+ {
+ "epoch": 0.49893133117772787,
+ "grad_norm": 0.2777428291092147,
+ "learning_rate": 4.757819336731424e-06,
+ "loss": 0.6084,
+ "step": 569
+ },
+ {
+ "epoch": 0.49980818764728446,
+ "grad_norm": 0.286537576055084,
+ "learning_rate": 4.756802579167781e-06,
+ "loss": 0.6122,
+ "step": 570
+ },
+ {
+ "epoch": 0.5006850441168411,
+ "grad_norm": 0.2900434750609322,
+ "learning_rate": 4.755783800833071e-06,
+ "loss": 0.61,
+ "step": 571
+ },
+ {
+ "epoch": 0.5015619005863977,
+ "grad_norm": 0.29602981997833644,
+ "learning_rate": 4.754763002639522e-06,
+ "loss": 0.5979,
+ "step": 572
+ },
+ {
+ "epoch": 0.5024387570559544,
+ "grad_norm": 0.2850500950921633,
+ "learning_rate": 4.75374018550117e-06,
+ "loss": 0.616,
+ "step": 573
+ },
+ {
+ "epoch": 0.503315613525511,
+ "grad_norm": 0.2747595431255721,
+ "learning_rate": 4.752715350333858e-06,
+ "loss": 0.6082,
+ "step": 574
+ },
+ {
+ "epoch": 0.5041924699950677,
+ "grad_norm": 0.30963433949041175,
+ "learning_rate": 4.75168849805524e-06,
+ "loss": 0.6062,
+ "step": 575
+ },
+ {
+ "epoch": 0.5050693264646243,
+ "grad_norm": 0.28817154630491854,
+ "learning_rate": 4.750659629584772e-06,
+ "loss": 0.615,
+ "step": 576
+ },
+ {
+ "epoch": 0.5059461829341809,
+ "grad_norm": 0.29777143797501865,
+ "learning_rate": 4.749628745843715e-06,
+ "loss": 0.6093,
+ "step": 577
+ },
+ {
+ "epoch": 0.5068230394037376,
+ "grad_norm": 0.2761328411528336,
+ "learning_rate": 4.748595847755137e-06,
+ "loss": 0.5949,
+ "step": 578
+ },
+ {
+ "epoch": 0.5076998958732942,
+ "grad_norm": 0.27941749417554973,
+ "learning_rate": 4.74756093624391e-06,
+ "loss": 0.6165,
+ "step": 579
+ },
+ {
+ "epoch": 0.5085767523428508,
+ "grad_norm": 0.28883681834919644,
+ "learning_rate": 4.746524012236706e-06,
+ "loss": 0.6012,
+ "step": 580
+ },
+ {
+ "epoch": 0.5094536088124075,
+ "grad_norm": 0.2712633209555587,
+ "learning_rate": 4.7454850766620005e-06,
+ "loss": 0.5898,
+ "step": 581
+ },
+ {
+ "epoch": 0.5103304652819641,
+ "grad_norm": 0.29386364789948854,
+ "learning_rate": 4.7444441304500714e-06,
+ "loss": 0.6057,
+ "step": 582
+ },
+ {
+ "epoch": 0.5112073217515208,
+ "grad_norm": 0.27998562308750735,
+ "learning_rate": 4.743401174532994e-06,
+ "loss": 0.597,
+ "step": 583
+ },
+ {
+ "epoch": 0.5120841782210774,
+ "grad_norm": 0.2944531079667381,
+ "learning_rate": 4.742356209844646e-06,
+ "loss": 0.5915,
+ "step": 584
+ },
+ {
+ "epoch": 0.512961034690634,
+ "grad_norm": 0.29506045387008756,
+ "learning_rate": 4.741309237320703e-06,
+ "loss": 0.6178,
+ "step": 585
+ },
+ {
+ "epoch": 0.5138378911601907,
+ "grad_norm": 0.299236621784075,
+ "learning_rate": 4.740260257898638e-06,
+ "loss": 0.6121,
+ "step": 586
+ },
+ {
+ "epoch": 0.5147147476297473,
+ "grad_norm": 0.303688650889379,
+ "learning_rate": 4.739209272517721e-06,
+ "loss": 0.5982,
+ "step": 587
+ },
+ {
+ "epoch": 0.515591604099304,
+ "grad_norm": 0.2925779066404172,
+ "learning_rate": 4.738156282119018e-06,
+ "loss": 0.5936,
+ "step": 588
+ },
+ {
+ "epoch": 0.5164684605688606,
+ "grad_norm": 0.3374725318718031,
+ "learning_rate": 4.73710128764539e-06,
+ "loss": 0.6001,
+ "step": 589
+ },
+ {
+ "epoch": 0.5173453170384172,
+ "grad_norm": 0.28811046561615106,
+ "learning_rate": 4.736044290041496e-06,
+ "loss": 0.61,
+ "step": 590
+ },
+ {
+ "epoch": 0.518222173507974,
+ "grad_norm": 0.32139851009391945,
+ "learning_rate": 4.7349852902537814e-06,
+ "loss": 0.5931,
+ "step": 591
+ },
+ {
+ "epoch": 0.5190990299775305,
+ "grad_norm": 0.27307295767087736,
+ "learning_rate": 4.733924289230493e-06,
+ "loss": 0.6035,
+ "step": 592
+ },
+ {
+ "epoch": 0.5199758864470871,
+ "grad_norm": 0.3098223534082736,
+ "learning_rate": 4.7328612879216615e-06,
+ "loss": 0.6082,
+ "step": 593
+ },
+ {
+ "epoch": 0.5208527429166439,
+ "grad_norm": 0.2808341207944162,
+ "learning_rate": 4.731796287279115e-06,
+ "loss": 0.5965,
+ "step": 594
+ },
+ {
+ "epoch": 0.5217295993862004,
+ "grad_norm": 0.3093125993326785,
+ "learning_rate": 4.730729288256468e-06,
+ "loss": 0.6018,
+ "step": 595
+ },
+ {
+ "epoch": 0.5226064558557572,
+ "grad_norm": 0.30147164249765196,
+ "learning_rate": 4.729660291809126e-06,
+ "loss": 0.6072,
+ "step": 596
+ },
+ {
+ "epoch": 0.5234833123253138,
+ "grad_norm": 0.2893545075475105,
+ "learning_rate": 4.728589298894284e-06,
+ "loss": 0.5894,
+ "step": 597
+ },
+ {
+ "epoch": 0.5243601687948704,
+ "grad_norm": 0.29778530349250987,
+ "learning_rate": 4.72751631047092e-06,
+ "loss": 0.5941,
+ "step": 598
+ },
+ {
+ "epoch": 0.5252370252644271,
+ "grad_norm": 0.2822751104373634,
+ "learning_rate": 4.726441327499805e-06,
+ "loss": 0.6056,
+ "step": 599
+ },
+ {
+ "epoch": 0.5261138817339837,
+ "grad_norm": 0.30381920940202223,
+ "learning_rate": 4.725364350943492e-06,
+ "loss": 0.6016,
+ "step": 600
+ },
+ {
+ "epoch": 0.5269907382035403,
+ "grad_norm": 0.2728312952142679,
+ "learning_rate": 4.72428538176632e-06,
+ "loss": 0.6033,
+ "step": 601
+ },
+ {
+ "epoch": 0.527867594673097,
+ "grad_norm": 0.2920360605636878,
+ "learning_rate": 4.723204420934413e-06,
+ "loss": 0.614,
+ "step": 602
+ },
+ {
+ "epoch": 0.5287444511426536,
+ "grad_norm": 0.282387818364113,
+ "learning_rate": 4.722121469415677e-06,
+ "loss": 0.5901,
+ "step": 603
+ },
+ {
+ "epoch": 0.5296213076122103,
+ "grad_norm": 0.2954181717364726,
+ "learning_rate": 4.721036528179802e-06,
+ "loss": 0.6043,
+ "step": 604
+ },
+ {
+ "epoch": 0.5304981640817669,
+ "grad_norm": 0.3084979402180987,
+ "learning_rate": 4.719949598198258e-06,
+ "loss": 0.5931,
+ "step": 605
+ },
+ {
+ "epoch": 0.5313750205513235,
+ "grad_norm": 0.3252699365181927,
+ "learning_rate": 4.718860680444297e-06,
+ "loss": 0.6181,
+ "step": 606
+ },
+ {
+ "epoch": 0.5322518770208802,
+ "grad_norm": 0.28357295095306256,
+ "learning_rate": 4.717769775892951e-06,
+ "loss": 0.5903,
+ "step": 607
+ },
+ {
+ "epoch": 0.5331287334904368,
+ "grad_norm": 0.3569079908279582,
+ "learning_rate": 4.7166768855210294e-06,
+ "loss": 0.5939,
+ "step": 608
+ },
+ {
+ "epoch": 0.5340055899599935,
+ "grad_norm": 0.31741200071485426,
+ "learning_rate": 4.715582010307121e-06,
+ "loss": 0.5897,
+ "step": 609
+ },
+ {
+ "epoch": 0.5348824464295501,
+ "grad_norm": 0.3218789245412814,
+ "learning_rate": 4.714485151231593e-06,
+ "loss": 0.5926,
+ "step": 610
+ },
+ {
+ "epoch": 0.5357593028991067,
+ "grad_norm": 0.2824610260583936,
+ "learning_rate": 4.713386309276585e-06,
+ "loss": 0.6039,
+ "step": 611
+ },
+ {
+ "epoch": 0.5366361593686634,
+ "grad_norm": 0.3111981063952015,
+ "learning_rate": 4.712285485426017e-06,
+ "loss": 0.6012,
+ "step": 612
+ },
+ {
+ "epoch": 0.53751301583822,
+ "grad_norm": 0.2719370118974663,
+ "learning_rate": 4.7111826806655804e-06,
+ "loss": 0.5912,
+ "step": 613
+ },
+ {
+ "epoch": 0.5383898723077766,
+ "grad_norm": 0.3161533458613161,
+ "learning_rate": 4.710077895982741e-06,
+ "loss": 0.5962,
+ "step": 614
+ },
+ {
+ "epoch": 0.5392667287773333,
+ "grad_norm": 0.26701338476822095,
+ "learning_rate": 4.708971132366739e-06,
+ "loss": 0.6025,
+ "step": 615
+ },
+ {
+ "epoch": 0.5401435852468899,
+ "grad_norm": 0.28447205168753736,
+ "learning_rate": 4.707862390808583e-06,
+ "loss": 0.5959,
+ "step": 616
+ },
+ {
+ "epoch": 0.5410204417164466,
+ "grad_norm": 0.26585350433139904,
+ "learning_rate": 4.706751672301058e-06,
+ "loss": 0.5946,
+ "step": 617
+ },
+ {
+ "epoch": 0.5418972981860032,
+ "grad_norm": 0.28276117956241253,
+ "learning_rate": 4.705638977838712e-06,
+ "loss": 0.5986,
+ "step": 618
+ },
+ {
+ "epoch": 0.5427741546555598,
+ "grad_norm": 0.2752743049051474,
+ "learning_rate": 4.704524308417872e-06,
+ "loss": 0.6044,
+ "step": 619
+ },
+ {
+ "epoch": 0.5436510111251165,
+ "grad_norm": 0.2744635750786116,
+ "learning_rate": 4.703407665036622e-06,
+ "loss": 0.6,
+ "step": 620
+ },
+ {
+ "epoch": 0.5445278675946731,
+ "grad_norm": 0.2942835089324837,
+ "learning_rate": 4.702289048694824e-06,
+ "loss": 0.6163,
+ "step": 621
+ },
+ {
+ "epoch": 0.5454047240642297,
+ "grad_norm": 0.29074004193212294,
+ "learning_rate": 4.7011684603940985e-06,
+ "loss": 0.61,
+ "step": 622
+ },
+ {
+ "epoch": 0.5462815805337864,
+ "grad_norm": 0.265548853050648,
+ "learning_rate": 4.700045901137838e-06,
+ "loss": 0.6003,
+ "step": 623
+ },
+ {
+ "epoch": 0.547158437003343,
+ "grad_norm": 0.28147341099339,
+ "learning_rate": 4.6989213719311956e-06,
+ "loss": 0.6057,
+ "step": 624
+ },
+ {
+ "epoch": 0.5480352934728997,
+ "grad_norm": 0.25061686481638634,
+ "learning_rate": 4.697794873781089e-06,
+ "loss": 0.6103,
+ "step": 625
+ },
+ {
+ "epoch": 0.5489121499424563,
+ "grad_norm": 0.28270079603778164,
+ "learning_rate": 4.696666407696201e-06,
+ "loss": 0.5999,
+ "step": 626
+ },
+ {
+ "epoch": 0.5497890064120129,
+ "grad_norm": 0.25832596909684546,
+ "learning_rate": 4.695535974686975e-06,
+ "loss": 0.5989,
+ "step": 627
+ },
+ {
+ "epoch": 0.5506658628815696,
+ "grad_norm": 0.28610489660664173,
+ "learning_rate": 4.694403575765615e-06,
+ "loss": 0.6039,
+ "step": 628
+ },
+ {
+ "epoch": 0.5515427193511262,
+ "grad_norm": 0.26039812165621273,
+ "learning_rate": 4.693269211946086e-06,
+ "loss": 0.5999,
+ "step": 629
+ },
+ {
+ "epoch": 0.5524195758206829,
+ "grad_norm": 0.2802813802636672,
+ "learning_rate": 4.692132884244113e-06,
+ "loss": 0.5957,
+ "step": 630
+ },
+ {
+ "epoch": 0.5532964322902395,
+ "grad_norm": 0.28045233973715045,
+ "learning_rate": 4.69099459367718e-06,
+ "loss": 0.6057,
+ "step": 631
+ },
+ {
+ "epoch": 0.5541732887597961,
+ "grad_norm": 0.2850165288729873,
+ "learning_rate": 4.689854341264525e-06,
+ "loss": 0.6062,
+ "step": 632
+ },
+ {
+ "epoch": 0.5550501452293528,
+ "grad_norm": 0.318532937146288,
+ "learning_rate": 4.688712128027147e-06,
+ "loss": 0.615,
+ "step": 633
+ },
+ {
+ "epoch": 0.5559270016989094,
+ "grad_norm": 0.2700297126701359,
+ "learning_rate": 4.687567954987798e-06,
+ "loss": 0.6027,
+ "step": 634
+ },
+ {
+ "epoch": 0.556803858168466,
+ "grad_norm": 0.2709567537114069,
+ "learning_rate": 4.686421823170987e-06,
+ "loss": 0.606,
+ "step": 635
+ },
+ {
+ "epoch": 0.5576807146380227,
+ "grad_norm": 0.30943308206128534,
+ "learning_rate": 4.685273733602975e-06,
+ "loss": 0.6122,
+ "step": 636
+ },
+ {
+ "epoch": 0.5585575711075793,
+ "grad_norm": 0.2866407684585244,
+ "learning_rate": 4.6841236873117765e-06,
+ "loss": 0.5983,
+ "step": 637
+ },
+ {
+ "epoch": 0.559434427577136,
+ "grad_norm": 0.30074858616349,
+ "learning_rate": 4.6829716853271576e-06,
+ "loss": 0.6112,
+ "step": 638
+ },
+ {
+ "epoch": 0.5603112840466926,
+ "grad_norm": 0.27481764632891953,
+ "learning_rate": 4.681817728680638e-06,
+ "loss": 0.5923,
+ "step": 639
+ },
+ {
+ "epoch": 0.5611881405162492,
+ "grad_norm": 0.30985792219487485,
+ "learning_rate": 4.680661818405485e-06,
+ "loss": 0.6083,
+ "step": 640
+ },
+ {
+ "epoch": 0.5620649969858059,
+ "grad_norm": 0.30548099410676144,
+ "learning_rate": 4.679503955536715e-06,
+ "loss": 0.6105,
+ "step": 641
+ },
+ {
+ "epoch": 0.5629418534553625,
+ "grad_norm": 0.27736446160459594,
+ "learning_rate": 4.678344141111096e-06,
+ "loss": 0.6176,
+ "step": 642
+ },
+ {
+ "epoch": 0.5638187099249191,
+ "grad_norm": 0.313370779146898,
+ "learning_rate": 4.6771823761671386e-06,
+ "loss": 0.6035,
+ "step": 643
+ },
+ {
+ "epoch": 0.5646955663944758,
+ "grad_norm": 0.27389315771120454,
+ "learning_rate": 4.676018661745104e-06,
+ "loss": 0.6118,
+ "step": 644
+ },
+ {
+ "epoch": 0.5655724228640324,
+ "grad_norm": 0.3272671136560007,
+ "learning_rate": 4.674852998886998e-06,
+ "loss": 0.6059,
+ "step": 645
+ },
+ {
+ "epoch": 0.5664492793335891,
+ "grad_norm": 0.29110434636858074,
+ "learning_rate": 4.6736853886365704e-06,
+ "loss": 0.5957,
+ "step": 646
+ },
+ {
+ "epoch": 0.5673261358031457,
+ "grad_norm": 0.27566640053494834,
+ "learning_rate": 4.672515832039315e-06,
+ "loss": 0.5847,
+ "step": 647
+ },
+ {
+ "epoch": 0.5682029922727023,
+ "grad_norm": 0.3439499837560115,
+ "learning_rate": 4.671344330142468e-06,
+ "loss": 0.6066,
+ "step": 648
+ },
+ {
+ "epoch": 0.569079848742259,
+ "grad_norm": 0.2831795036732806,
+ "learning_rate": 4.670170883995007e-06,
+ "loss": 0.5875,
+ "step": 649
+ },
+ {
+ "epoch": 0.5699567052118156,
+ "grad_norm": 0.3084275937304928,
+ "learning_rate": 4.668995494647653e-06,
+ "loss": 0.6046,
+ "step": 650
+ },
+ {
+ "epoch": 0.5708335616813722,
+ "grad_norm": 0.2876312566066635,
+ "learning_rate": 4.667818163152864e-06,
+ "loss": 0.609,
+ "step": 651
+ },
+ {
+ "epoch": 0.5717104181509289,
+ "grad_norm": 0.27641311480374825,
+ "learning_rate": 4.6666388905648394e-06,
+ "loss": 0.6084,
+ "step": 652
+ },
+ {
+ "epoch": 0.5725872746204855,
+ "grad_norm": 0.2760161681243495,
+ "learning_rate": 4.665457677939515e-06,
+ "loss": 0.6036,
+ "step": 653
+ },
+ {
+ "epoch": 0.5734641310900422,
+ "grad_norm": 0.2664014070652965,
+ "learning_rate": 4.664274526334563e-06,
+ "loss": 0.6047,
+ "step": 654
+ },
+ {
+ "epoch": 0.5743409875595988,
+ "grad_norm": 0.27367722811571643,
+ "learning_rate": 4.663089436809395e-06,
+ "loss": 0.607,
+ "step": 655
+ },
+ {
+ "epoch": 0.5752178440291554,
+ "grad_norm": 0.2971494077897638,
+ "learning_rate": 4.661902410425156e-06,
+ "loss": 0.5851,
+ "step": 656
+ },
+ {
+ "epoch": 0.5760947004987121,
+ "grad_norm": 0.28359506675344376,
+ "learning_rate": 4.660713448244723e-06,
+ "loss": 0.5911,
+ "step": 657
+ },
+ {
+ "epoch": 0.5769715569682687,
+ "grad_norm": 0.27646693971859265,
+ "learning_rate": 4.6595225513327105e-06,
+ "loss": 0.601,
+ "step": 658
+ },
+ {
+ "epoch": 0.5778484134378254,
+ "grad_norm": 0.2707379861432875,
+ "learning_rate": 4.658329720755464e-06,
+ "loss": 0.5905,
+ "step": 659
+ },
+ {
+ "epoch": 0.578725269907382,
+ "grad_norm": 0.301271851117793,
+ "learning_rate": 4.657134957581057e-06,
+ "loss": 0.6023,
+ "step": 660
+ },
+ {
+ "epoch": 0.5796021263769386,
+ "grad_norm": 0.30214846729641187,
+ "learning_rate": 4.6559382628793e-06,
+ "loss": 0.6095,
+ "step": 661
+ },
+ {
+ "epoch": 0.5804789828464954,
+ "grad_norm": 0.2880769859831512,
+ "learning_rate": 4.6547396377217265e-06,
+ "loss": 0.6012,
+ "step": 662
+ },
+ {
+ "epoch": 0.581355839316052,
+ "grad_norm": 0.3363251460755209,
+ "learning_rate": 4.653539083181603e-06,
+ "loss": 0.5963,
+ "step": 663
+ },
+ {
+ "epoch": 0.5822326957856085,
+ "grad_norm": 0.3446871487238731,
+ "learning_rate": 4.652336600333921e-06,
+ "loss": 0.5992,
+ "step": 664
+ },
+ {
+ "epoch": 0.5831095522551653,
+ "grad_norm": 0.3016824402176579,
+ "learning_rate": 4.651132190255401e-06,
+ "loss": 0.6016,
+ "step": 665
+ },
+ {
+ "epoch": 0.5839864087247219,
+ "grad_norm": 0.31791554379394255,
+ "learning_rate": 4.649925854024486e-06,
+ "loss": 0.5943,
+ "step": 666
+ },
+ {
+ "epoch": 0.5848632651942786,
+ "grad_norm": 0.3603510668723624,
+ "learning_rate": 4.648717592721347e-06,
+ "loss": 0.6086,
+ "step": 667
+ },
+ {
+ "epoch": 0.5857401216638352,
+ "grad_norm": 0.25073578292290827,
+ "learning_rate": 4.647507407427877e-06,
+ "loss": 0.5965,
+ "step": 668
+ },
+ {
+ "epoch": 0.5866169781333918,
+ "grad_norm": 0.3401292596267892,
+ "learning_rate": 4.646295299227691e-06,
+ "loss": 0.5896,
+ "step": 669
+ },
+ {
+ "epoch": 0.5874938346029485,
+ "grad_norm": 0.26798950974238206,
+ "learning_rate": 4.645081269206128e-06,
+ "loss": 0.5913,
+ "step": 670
+ },
+ {
+ "epoch": 0.5883706910725051,
+ "grad_norm": 0.2712753517614824,
+ "learning_rate": 4.643865318450247e-06,
+ "loss": 0.5948,
+ "step": 671
+ },
+ {
+ "epoch": 0.5892475475420617,
+ "grad_norm": 0.31478669896326056,
+ "learning_rate": 4.642647448048824e-06,
+ "loss": 0.6036,
+ "step": 672
+ },
+ {
+ "epoch": 0.5901244040116184,
+ "grad_norm": 0.2853149586152437,
+ "learning_rate": 4.641427659092359e-06,
+ "loss": 0.5852,
+ "step": 673
+ },
+ {
+ "epoch": 0.591001260481175,
+ "grad_norm": 0.31928733056145026,
+ "learning_rate": 4.6402059526730656e-06,
+ "loss": 0.596,
+ "step": 674
+ },
+ {
+ "epoch": 0.5918781169507317,
+ "grad_norm": 0.28886504451895006,
+ "learning_rate": 4.638982329884878e-06,
+ "loss": 0.5867,
+ "step": 675
+ },
+ {
+ "epoch": 0.5927549734202883,
+ "grad_norm": 0.34332786639440344,
+ "learning_rate": 4.637756791823443e-06,
+ "loss": 0.5951,
+ "step": 676
+ },
+ {
+ "epoch": 0.5936318298898449,
+ "grad_norm": 0.31536294202913445,
+ "learning_rate": 4.6365293395861225e-06,
+ "loss": 0.6005,
+ "step": 677
+ },
+ {
+ "epoch": 0.5945086863594016,
+ "grad_norm": 0.36612645695214535,
+ "learning_rate": 4.6352999742719954e-06,
+ "loss": 0.6125,
+ "step": 678
+ },
+ {
+ "epoch": 0.5953855428289582,
+ "grad_norm": 0.2865910172606529,
+ "learning_rate": 4.634068696981852e-06,
+ "loss": 0.6096,
+ "step": 679
+ },
+ {
+ "epoch": 0.5962623992985149,
+ "grad_norm": 0.3077121674916666,
+ "learning_rate": 4.632835508818192e-06,
+ "loss": 0.5891,
+ "step": 680
+ },
+ {
+ "epoch": 0.5971392557680715,
+ "grad_norm": 0.2930520316480949,
+ "learning_rate": 4.631600410885231e-06,
+ "loss": 0.5918,
+ "step": 681
+ },
+ {
+ "epoch": 0.5980161122376281,
+ "grad_norm": 0.3412197822800723,
+ "learning_rate": 4.630363404288891e-06,
+ "loss": 0.5998,
+ "step": 682
+ },
+ {
+ "epoch": 0.5988929687071848,
+ "grad_norm": 0.2869686807201651,
+ "learning_rate": 4.629124490136804e-06,
+ "loss": 0.5952,
+ "step": 683
+ },
+ {
+ "epoch": 0.5997698251767414,
+ "grad_norm": 0.3044523168792968,
+ "learning_rate": 4.627883669538311e-06,
+ "loss": 0.6058,
+ "step": 684
+ },
+ {
+ "epoch": 0.600646681646298,
+ "grad_norm": 0.298754941767322,
+ "learning_rate": 4.626640943604459e-06,
+ "loss": 0.6099,
+ "step": 685
+ },
+ {
+ "epoch": 0.6015235381158547,
+ "grad_norm": 0.30823608651620477,
+ "learning_rate": 4.625396313448e-06,
+ "loss": 0.5913,
+ "step": 686
+ },
+ {
+ "epoch": 0.6024003945854113,
+ "grad_norm": 0.2745802532714142,
+ "learning_rate": 4.624149780183395e-06,
+ "loss": 0.5904,
+ "step": 687
+ },
+ {
+ "epoch": 0.603277251054968,
+ "grad_norm": 0.2894557068485525,
+ "learning_rate": 4.622901344926805e-06,
+ "loss": 0.6006,
+ "step": 688
+ },
+ {
+ "epoch": 0.6041541075245246,
+ "grad_norm": 0.2844643276622375,
+ "learning_rate": 4.621651008796095e-06,
+ "loss": 0.5972,
+ "step": 689
+ },
+ {
+ "epoch": 0.6050309639940812,
+ "grad_norm": 0.3111750841694179,
+ "learning_rate": 4.620398772910833e-06,
+ "loss": 0.5911,
+ "step": 690
+ },
+ {
+ "epoch": 0.6059078204636379,
+ "grad_norm": 0.30229136138256857,
+ "learning_rate": 4.619144638392289e-06,
+ "loss": 0.6063,
+ "step": 691
+ },
+ {
+ "epoch": 0.6067846769331945,
+ "grad_norm": 0.2903177693650587,
+ "learning_rate": 4.6178886063634295e-06,
+ "loss": 0.6022,
+ "step": 692
+ },
+ {
+ "epoch": 0.6076615334027511,
+ "grad_norm": 0.29466063932438424,
+ "learning_rate": 4.616630677948924e-06,
+ "loss": 0.609,
+ "step": 693
+ },
+ {
+ "epoch": 0.6085383898723078,
+ "grad_norm": 0.29795014881552045,
+ "learning_rate": 4.615370854275138e-06,
+ "loss": 0.5923,
+ "step": 694
+ },
+ {
+ "epoch": 0.6094152463418644,
+ "grad_norm": 0.2835342651327551,
+ "learning_rate": 4.614109136470133e-06,
+ "loss": 0.5941,
+ "step": 695
+ },
+ {
+ "epoch": 0.6102921028114211,
+ "grad_norm": 0.2914927284695803,
+ "learning_rate": 4.612845525663671e-06,
+ "loss": 0.5915,
+ "step": 696
+ },
+ {
+ "epoch": 0.6111689592809777,
+ "grad_norm": 0.27150994490869584,
+ "learning_rate": 4.611580022987202e-06,
+ "loss": 0.5903,
+ "step": 697
+ },
+ {
+ "epoch": 0.6120458157505343,
+ "grad_norm": 0.27427922033901636,
+ "learning_rate": 4.610312629573877e-06,
+ "loss": 0.5826,
+ "step": 698
+ },
+ {
+ "epoch": 0.612922672220091,
+ "grad_norm": 0.3257835351903302,
+ "learning_rate": 4.609043346558536e-06,
+ "loss": 0.608,
+ "step": 699
+ },
+ {
+ "epoch": 0.6137995286896476,
+ "grad_norm": 0.27542786817313375,
+ "learning_rate": 4.607772175077712e-06,
+ "loss": 0.5914,
+ "step": 700
+ },
+ {
+ "epoch": 0.6146763851592043,
+ "grad_norm": 0.32541464673918596,
+ "learning_rate": 4.606499116269628e-06,
+ "loss": 0.6004,
+ "step": 701
+ },
+ {
+ "epoch": 0.6155532416287609,
+ "grad_norm": 0.2775394483279354,
+ "learning_rate": 4.605224171274198e-06,
+ "loss": 0.6042,
+ "step": 702
+ },
+ {
+ "epoch": 0.6164300980983175,
+ "grad_norm": 0.3010566442707075,
+ "learning_rate": 4.603947341233026e-06,
+ "loss": 0.5893,
+ "step": 703
+ },
+ {
+ "epoch": 0.6173069545678742,
+ "grad_norm": 0.28841806172316603,
+ "learning_rate": 4.602668627289401e-06,
+ "loss": 0.5932,
+ "step": 704
+ },
+ {
+ "epoch": 0.6181838110374308,
+ "grad_norm": 0.32720143492110876,
+ "learning_rate": 4.601388030588303e-06,
+ "loss": 0.594,
+ "step": 705
+ },
+ {
+ "epoch": 0.6190606675069874,
+ "grad_norm": 0.2629157828769276,
+ "learning_rate": 4.600105552276393e-06,
+ "loss": 0.5962,
+ "step": 706
+ },
+ {
+ "epoch": 0.6199375239765441,
+ "grad_norm": 0.2976311641314985,
+ "learning_rate": 4.598821193502019e-06,
+ "loss": 0.5993,
+ "step": 707
+ },
+ {
+ "epoch": 0.6208143804461007,
+ "grad_norm": 0.3223849407278096,
+ "learning_rate": 4.597534955415214e-06,
+ "loss": 0.6023,
+ "step": 708
+ },
+ {
+ "epoch": 0.6216912369156574,
+ "grad_norm": 0.3228934470983084,
+ "learning_rate": 4.596246839167692e-06,
+ "loss": 0.6058,
+ "step": 709
+ },
+ {
+ "epoch": 0.622568093385214,
+ "grad_norm": 0.2842350311614894,
+ "learning_rate": 4.59495684591285e-06,
+ "loss": 0.5965,
+ "step": 710
+ },
+ {
+ "epoch": 0.6234449498547706,
+ "grad_norm": 0.30037127301855626,
+ "learning_rate": 4.593664976805765e-06,
+ "loss": 0.5912,
+ "step": 711
+ },
+ {
+ "epoch": 0.6243218063243273,
+ "grad_norm": 0.29537031301186273,
+ "learning_rate": 4.592371233003195e-06,
+ "loss": 0.5847,
+ "step": 712
+ },
+ {
+ "epoch": 0.6251986627938839,
+ "grad_norm": 0.3099776656835445,
+ "learning_rate": 4.5910756156635725e-06,
+ "loss": 0.6061,
+ "step": 713
+ },
+ {
+ "epoch": 0.6260755192634405,
+ "grad_norm": 0.3343474177937486,
+ "learning_rate": 4.589778125947012e-06,
+ "loss": 0.5775,
+ "step": 714
+ },
+ {
+ "epoch": 0.6269523757329972,
+ "grad_norm": 0.26492597760028275,
+ "learning_rate": 4.588478765015304e-06,
+ "loss": 0.6008,
+ "step": 715
+ },
+ {
+ "epoch": 0.6278292322025538,
+ "grad_norm": 0.2996728173414987,
+ "learning_rate": 4.587177534031914e-06,
+ "loss": 0.5868,
+ "step": 716
+ },
+ {
+ "epoch": 0.6287060886721105,
+ "grad_norm": 0.269698012084879,
+ "learning_rate": 4.585874434161979e-06,
+ "loss": 0.5908,
+ "step": 717
+ },
+ {
+ "epoch": 0.6295829451416671,
+ "grad_norm": 0.3120812259438331,
+ "learning_rate": 4.584569466572313e-06,
+ "loss": 0.5964,
+ "step": 718
+ },
+ {
+ "epoch": 0.6304598016112237,
+ "grad_norm": 0.306605213663903,
+ "learning_rate": 4.583262632431402e-06,
+ "loss": 0.587,
+ "step": 719
+ },
+ {
+ "epoch": 0.6313366580807804,
+ "grad_norm": 0.31045769873517814,
+ "learning_rate": 4.581953932909403e-06,
+ "loss": 0.5924,
+ "step": 720
+ },
+ {
+ "epoch": 0.632213514550337,
+ "grad_norm": 0.30956000847409926,
+ "learning_rate": 4.580643369178142e-06,
+ "loss": 0.5905,
+ "step": 721
+ },
+ {
+ "epoch": 0.6330903710198936,
+ "grad_norm": 0.2980650280091205,
+ "learning_rate": 4.579330942411115e-06,
+ "loss": 0.5961,
+ "step": 722
+ },
+ {
+ "epoch": 0.6339672274894503,
+ "grad_norm": 0.2784986194522932,
+ "learning_rate": 4.578016653783488e-06,
+ "loss": 0.5962,
+ "step": 723
+ },
+ {
+ "epoch": 0.6348440839590069,
+ "grad_norm": 0.32816601752120567,
+ "learning_rate": 4.57670050447209e-06,
+ "loss": 0.6149,
+ "step": 724
+ },
+ {
+ "epoch": 0.6357209404285636,
+ "grad_norm": 0.2822290286934802,
+ "learning_rate": 4.575382495655421e-06,
+ "loss": 0.5915,
+ "step": 725
+ },
+ {
+ "epoch": 0.6365977968981202,
+ "grad_norm": 0.2993973936416954,
+ "learning_rate": 4.574062628513643e-06,
+ "loss": 0.59,
+ "step": 726
+ },
+ {
+ "epoch": 0.6374746533676768,
+ "grad_norm": 0.27875804168057794,
+ "learning_rate": 4.572740904228582e-06,
+ "loss": 0.6018,
+ "step": 727
+ },
+ {
+ "epoch": 0.6383515098372335,
+ "grad_norm": 0.3144256132274513,
+ "learning_rate": 4.571417323983727e-06,
+ "loss": 0.6056,
+ "step": 728
+ },
+ {
+ "epoch": 0.6392283663067901,
+ "grad_norm": 0.2763723528672814,
+ "learning_rate": 4.570091888964231e-06,
+ "loss": 0.5943,
+ "step": 729
+ },
+ {
+ "epoch": 0.6401052227763468,
+ "grad_norm": 0.3001278571328794,
+ "learning_rate": 4.5687646003569055e-06,
+ "loss": 0.588,
+ "step": 730
+ },
+ {
+ "epoch": 0.6409820792459034,
+ "grad_norm": 0.2847820308061442,
+ "learning_rate": 4.567435459350222e-06,
+ "loss": 0.5971,
+ "step": 731
+ },
+ {
+ "epoch": 0.64185893571546,
+ "grad_norm": 0.292512543142512,
+ "learning_rate": 4.566104467134311e-06,
+ "loss": 0.5864,
+ "step": 732
+ },
+ {
+ "epoch": 0.6427357921850168,
+ "grad_norm": 0.28968651062565176,
+ "learning_rate": 4.564771624900961e-06,
+ "loss": 0.62,
+ "step": 733
+ },
+ {
+ "epoch": 0.6436126486545733,
+ "grad_norm": 0.3004795852693458,
+ "learning_rate": 4.563436933843617e-06,
+ "loss": 0.5964,
+ "step": 734
+ },
+ {
+ "epoch": 0.64448950512413,
+ "grad_norm": 0.2865806085716862,
+ "learning_rate": 4.562100395157379e-06,
+ "loss": 0.6026,
+ "step": 735
+ },
+ {
+ "epoch": 0.6453663615936867,
+ "grad_norm": 0.2842649974188147,
+ "learning_rate": 4.560762010039001e-06,
+ "loss": 0.5913,
+ "step": 736
+ },
+ {
+ "epoch": 0.6462432180632433,
+ "grad_norm": 0.28683866497814775,
+ "learning_rate": 4.5594217796868915e-06,
+ "loss": 0.5951,
+ "step": 737
+ },
+ {
+ "epoch": 0.6471200745328,
+ "grad_norm": 0.2764873070461295,
+ "learning_rate": 4.558079705301109e-06,
+ "loss": 0.6053,
+ "step": 738
+ },
+ {
+ "epoch": 0.6479969310023566,
+ "grad_norm": 0.27004479414645,
+ "learning_rate": 4.556735788083366e-06,
+ "loss": 0.6039,
+ "step": 739
+ },
+ {
+ "epoch": 0.6488737874719132,
+ "grad_norm": 0.29052397029213667,
+ "learning_rate": 4.555390029237026e-06,
+ "loss": 0.601,
+ "step": 740
+ },
+ {
+ "epoch": 0.6497506439414699,
+ "grad_norm": 0.2947691340138793,
+ "learning_rate": 4.554042429967095e-06,
+ "loss": 0.6025,
+ "step": 741
+ },
+ {
+ "epoch": 0.6506275004110265,
+ "grad_norm": 0.2792458027197797,
+ "learning_rate": 4.552692991480234e-06,
+ "loss": 0.6014,
+ "step": 742
+ },
+ {
+ "epoch": 0.6515043568805831,
+ "grad_norm": 0.3382217380230472,
+ "learning_rate": 4.551341714984748e-06,
+ "loss": 0.5955,
+ "step": 743
+ },
+ {
+ "epoch": 0.6523812133501398,
+ "grad_norm": 0.2966197192699023,
+ "learning_rate": 4.549988601690588e-06,
+ "loss": 0.5935,
+ "step": 744
+ },
+ {
+ "epoch": 0.6532580698196964,
+ "grad_norm": 0.31516646846151397,
+ "learning_rate": 4.54863365280935e-06,
+ "loss": 0.597,
+ "step": 745
+ },
+ {
+ "epoch": 0.6541349262892531,
+ "grad_norm": 0.28496714910224397,
+ "learning_rate": 4.547276869554272e-06,
+ "loss": 0.5814,
+ "step": 746
+ },
+ {
+ "epoch": 0.6550117827588097,
+ "grad_norm": 0.30669749001026353,
+ "learning_rate": 4.545918253140236e-06,
+ "loss": 0.5952,
+ "step": 747
+ },
+ {
+ "epoch": 0.6558886392283663,
+ "grad_norm": 0.2812261666412913,
+ "learning_rate": 4.544557804783765e-06,
+ "loss": 0.6162,
+ "step": 748
+ },
+ {
+ "epoch": 0.656765495697923,
+ "grad_norm": 0.27761745178740765,
+ "learning_rate": 4.543195525703024e-06,
+ "loss": 0.5807,
+ "step": 749
+ },
+ {
+ "epoch": 0.6576423521674796,
+ "grad_norm": 0.31002121863979637,
+ "learning_rate": 4.541831417117815e-06,
+ "loss": 0.5851,
+ "step": 750
+ },
+ {
+ "epoch": 0.6585192086370363,
+ "grad_norm": 0.29034303454873894,
+ "learning_rate": 4.540465480249579e-06,
+ "loss": 0.6019,
+ "step": 751
+ },
+ {
+ "epoch": 0.6593960651065929,
+ "grad_norm": 0.30559901683462565,
+ "learning_rate": 4.539097716321394e-06,
+ "loss": 0.5866,
+ "step": 752
+ },
+ {
+ "epoch": 0.6602729215761495,
+ "grad_norm": 0.2641221990159659,
+ "learning_rate": 4.537728126557974e-06,
+ "loss": 0.5972,
+ "step": 753
+ },
+ {
+ "epoch": 0.6611497780457062,
+ "grad_norm": 0.3227708789669896,
+ "learning_rate": 4.536356712185668e-06,
+ "loss": 0.5796,
+ "step": 754
+ },
+ {
+ "epoch": 0.6620266345152628,
+ "grad_norm": 0.294701481555053,
+ "learning_rate": 4.534983474432458e-06,
+ "loss": 0.6149,
+ "step": 755
+ },
+ {
+ "epoch": 0.6629034909848194,
+ "grad_norm": 0.32377533070879033,
+ "learning_rate": 4.533608414527961e-06,
+ "loss": 0.5891,
+ "step": 756
+ },
+ {
+ "epoch": 0.6637803474543761,
+ "grad_norm": 0.3042889879699245,
+ "learning_rate": 4.532231533703423e-06,
+ "loss": 0.5913,
+ "step": 757
+ },
+ {
+ "epoch": 0.6646572039239327,
+ "grad_norm": 0.31760559251266973,
+ "learning_rate": 4.53085283319172e-06,
+ "loss": 0.6096,
+ "step": 758
+ },
+ {
+ "epoch": 0.6655340603934894,
+ "grad_norm": 0.3078941609749165,
+ "learning_rate": 4.529472314227362e-06,
+ "loss": 0.5905,
+ "step": 759
+ },
+ {
+ "epoch": 0.666410916863046,
+ "grad_norm": 0.30990175786815527,
+ "learning_rate": 4.528089978046481e-06,
+ "loss": 0.5991,
+ "step": 760
+ },
+ {
+ "epoch": 0.6672877733326026,
+ "grad_norm": 0.32903820758007046,
+ "learning_rate": 4.5267058258868414e-06,
+ "loss": 0.5882,
+ "step": 761
+ },
+ {
+ "epoch": 0.6681646298021593,
+ "grad_norm": 0.29452587669480845,
+ "learning_rate": 4.52531985898783e-06,
+ "loss": 0.5803,
+ "step": 762
+ },
+ {
+ "epoch": 0.6690414862717159,
+ "grad_norm": 0.30776706716693625,
+ "learning_rate": 4.52393207859046e-06,
+ "loss": 0.577,
+ "step": 763
+ },
+ {
+ "epoch": 0.6699183427412725,
+ "grad_norm": 0.31422641761257675,
+ "learning_rate": 4.522542485937369e-06,
+ "loss": 0.6018,
+ "step": 764
+ },
+ {
+ "epoch": 0.6707951992108292,
+ "grad_norm": 0.3173718550935184,
+ "learning_rate": 4.521151082272817e-06,
+ "loss": 0.5882,
+ "step": 765
+ },
+ {
+ "epoch": 0.6716720556803858,
+ "grad_norm": 0.2986562015643124,
+ "learning_rate": 4.519757868842685e-06,
+ "loss": 0.579,
+ "step": 766
+ },
+ {
+ "epoch": 0.6725489121499425,
+ "grad_norm": 0.3090764441547647,
+ "learning_rate": 4.518362846894475e-06,
+ "loss": 0.5985,
+ "step": 767
+ },
+ {
+ "epoch": 0.6734257686194991,
+ "grad_norm": 0.30790241933986734,
+ "learning_rate": 4.516966017677308e-06,
+ "loss": 0.5863,
+ "step": 768
+ },
+ {
+ "epoch": 0.6743026250890557,
+ "grad_norm": 0.2994056106304016,
+ "learning_rate": 4.515567382441923e-06,
+ "loss": 0.5991,
+ "step": 769
+ },
+ {
+ "epoch": 0.6751794815586124,
+ "grad_norm": 0.2958764046270931,
+ "learning_rate": 4.514166942440679e-06,
+ "loss": 0.5963,
+ "step": 770
+ },
+ {
+ "epoch": 0.676056338028169,
+ "grad_norm": 0.28788185549499157,
+ "learning_rate": 4.512764698927545e-06,
+ "loss": 0.6064,
+ "step": 771
+ },
+ {
+ "epoch": 0.6769331944977256,
+ "grad_norm": 0.29708423016925406,
+ "learning_rate": 4.511360653158111e-06,
+ "loss": 0.5947,
+ "step": 772
+ },
+ {
+ "epoch": 0.6778100509672823,
+ "grad_norm": 0.30991902940049315,
+ "learning_rate": 4.509954806389577e-06,
+ "loss": 0.5987,
+ "step": 773
+ },
+ {
+ "epoch": 0.6786869074368389,
+ "grad_norm": 0.2873916475278516,
+ "learning_rate": 4.508547159880758e-06,
+ "loss": 0.5924,
+ "step": 774
+ },
+ {
+ "epoch": 0.6795637639063956,
+ "grad_norm": 0.3007245570293541,
+ "learning_rate": 4.50713771489208e-06,
+ "loss": 0.6015,
+ "step": 775
+ },
+ {
+ "epoch": 0.6804406203759522,
+ "grad_norm": 0.30867041078073276,
+ "learning_rate": 4.505726472685577e-06,
+ "loss": 0.5957,
+ "step": 776
+ },
+ {
+ "epoch": 0.6813174768455088,
+ "grad_norm": 0.31345922212682475,
+ "learning_rate": 4.504313434524894e-06,
+ "loss": 0.6006,
+ "step": 777
+ },
+ {
+ "epoch": 0.6821943333150655,
+ "grad_norm": 0.29707717549610757,
+ "learning_rate": 4.502898601675285e-06,
+ "loss": 0.5778,
+ "step": 778
+ },
+ {
+ "epoch": 0.6830711897846221,
+ "grad_norm": 0.3796068136152165,
+ "learning_rate": 4.501481975403611e-06,
+ "loss": 0.5991,
+ "step": 779
+ },
+ {
+ "epoch": 0.6839480462541788,
+ "grad_norm": 0.28337342976468866,
+ "learning_rate": 4.5000635569783365e-06,
+ "loss": 0.5948,
+ "step": 780
+ },
+ {
+ "epoch": 0.6848249027237354,
+ "grad_norm": 0.31230108669893153,
+ "learning_rate": 4.498643347669533e-06,
+ "loss": 0.5925,
+ "step": 781
+ },
+ {
+ "epoch": 0.685701759193292,
+ "grad_norm": 0.27904331433791485,
+ "learning_rate": 4.497221348748874e-06,
+ "loss": 0.5916,
+ "step": 782
+ },
+ {
+ "epoch": 0.6865786156628487,
+ "grad_norm": 0.2942542969448629,
+ "learning_rate": 4.4957975614896386e-06,
+ "loss": 0.5992,
+ "step": 783
+ },
+ {
+ "epoch": 0.6874554721324053,
+ "grad_norm": 0.2908765617548673,
+ "learning_rate": 4.494371987166703e-06,
+ "loss": 0.6065,
+ "step": 784
+ },
+ {
+ "epoch": 0.6883323286019619,
+ "grad_norm": 0.2840490179126863,
+ "learning_rate": 4.492944627056544e-06,
+ "loss": 0.5902,
+ "step": 785
+ },
+ {
+ "epoch": 0.6892091850715186,
+ "grad_norm": 0.2727369127304506,
+ "learning_rate": 4.491515482437242e-06,
+ "loss": 0.5867,
+ "step": 786
+ },
+ {
+ "epoch": 0.6900860415410752,
+ "grad_norm": 0.28769481832954025,
+ "learning_rate": 4.4900845545884695e-06,
+ "loss": 0.5922,
+ "step": 787
+ },
+ {
+ "epoch": 0.6909628980106319,
+ "grad_norm": 0.2906309237155975,
+ "learning_rate": 4.4886518447915e-06,
+ "loss": 0.5887,
+ "step": 788
+ },
+ {
+ "epoch": 0.6918397544801885,
+ "grad_norm": 0.2948842293422461,
+ "learning_rate": 4.487217354329201e-06,
+ "loss": 0.6006,
+ "step": 789
+ },
+ {
+ "epoch": 0.6927166109497451,
+ "grad_norm": 0.302074977476922,
+ "learning_rate": 4.4857810844860325e-06,
+ "loss": 0.5866,
+ "step": 790
+ },
+ {
+ "epoch": 0.6935934674193018,
+ "grad_norm": 0.32893770275300094,
+ "learning_rate": 4.484343036548051e-06,
+ "loss": 0.5976,
+ "step": 791
+ },
+ {
+ "epoch": 0.6944703238888584,
+ "grad_norm": 0.2778002794834819,
+ "learning_rate": 4.482903211802904e-06,
+ "loss": 0.584,
+ "step": 792
+ },
+ {
+ "epoch": 0.695347180358415,
+ "grad_norm": 0.294631010190205,
+ "learning_rate": 4.481461611539829e-06,
+ "loss": 0.5796,
+ "step": 793
+ },
+ {
+ "epoch": 0.6962240368279717,
+ "grad_norm": 0.26497721691156156,
+ "learning_rate": 4.480018237049655e-06,
+ "loss": 0.5921,
+ "step": 794
+ },
+ {
+ "epoch": 0.6971008932975283,
+ "grad_norm": 0.2571147884128945,
+ "learning_rate": 4.4785730896247985e-06,
+ "loss": 0.5967,
+ "step": 795
+ },
+ {
+ "epoch": 0.697977749767085,
+ "grad_norm": 0.27928133327664356,
+ "learning_rate": 4.477126170559262e-06,
+ "loss": 0.5933,
+ "step": 796
+ },
+ {
+ "epoch": 0.6988546062366416,
+ "grad_norm": 0.2678842819485542,
+ "learning_rate": 4.475677481148638e-06,
+ "loss": 0.6041,
+ "step": 797
+ },
+ {
+ "epoch": 0.6997314627061982,
+ "grad_norm": 0.2891606093702898,
+ "learning_rate": 4.474227022690102e-06,
+ "loss": 0.5957,
+ "step": 798
+ },
+ {
+ "epoch": 0.700608319175755,
+ "grad_norm": 0.288045727848727,
+ "learning_rate": 4.4727747964824135e-06,
+ "loss": 0.5904,
+ "step": 799
+ },
+ {
+ "epoch": 0.7014851756453115,
+ "grad_norm": 0.31585634496103415,
+ "learning_rate": 4.471320803825915e-06,
+ "loss": 0.5976,
+ "step": 800
+ },
+ {
+ "epoch": 0.7023620321148683,
+ "grad_norm": 0.2748185200755283,
+ "learning_rate": 4.469865046022531e-06,
+ "loss": 0.5752,
+ "step": 801
+ },
+ {
+ "epoch": 0.7032388885844248,
+ "grad_norm": 0.3355774877957403,
+ "learning_rate": 4.468407524375767e-06,
+ "loss": 0.5983,
+ "step": 802
+ },
+ {
+ "epoch": 0.7041157450539814,
+ "grad_norm": 0.29100988533473726,
+ "learning_rate": 4.466948240190707e-06,
+ "loss": 0.5942,
+ "step": 803
+ },
+ {
+ "epoch": 0.7049926015235382,
+ "grad_norm": 0.32395113661904446,
+ "learning_rate": 4.465487194774012e-06,
+ "loss": 0.5934,
+ "step": 804
+ },
+ {
+ "epoch": 0.7058694579930948,
+ "grad_norm": 0.27010926989878575,
+ "learning_rate": 4.464024389433924e-06,
+ "loss": 0.5965,
+ "step": 805
+ },
+ {
+ "epoch": 0.7067463144626513,
+ "grad_norm": 0.31589368881558894,
+ "learning_rate": 4.462559825480257e-06,
+ "loss": 0.5892,
+ "step": 806
+ },
+ {
+ "epoch": 0.7076231709322081,
+ "grad_norm": 0.2696414843727876,
+ "learning_rate": 4.461093504224401e-06,
+ "loss": 0.5995,
+ "step": 807
+ },
+ {
+ "epoch": 0.7085000274017647,
+ "grad_norm": 0.2953330107498836,
+ "learning_rate": 4.459625426979319e-06,
+ "loss": 0.5918,
+ "step": 808
+ },
+ {
+ "epoch": 0.7093768838713214,
+ "grad_norm": 0.281894292123873,
+ "learning_rate": 4.458155595059549e-06,
+ "loss": 0.5955,
+ "step": 809
+ },
+ {
+ "epoch": 0.710253740340878,
+ "grad_norm": 0.27376761478776995,
+ "learning_rate": 4.4566840097811956e-06,
+ "loss": 0.5871,
+ "step": 810
+ },
+ {
+ "epoch": 0.7111305968104346,
+ "grad_norm": 0.27713167306531405,
+ "learning_rate": 4.455210672461938e-06,
+ "loss": 0.595,
+ "step": 811
+ },
+ {
+ "epoch": 0.7120074532799913,
+ "grad_norm": 0.27385713088626723,
+ "learning_rate": 4.453735584421021e-06,
+ "loss": 0.5899,
+ "step": 812
+ },
+ {
+ "epoch": 0.7128843097495479,
+ "grad_norm": 0.29840396727897567,
+ "learning_rate": 4.452258746979258e-06,
+ "loss": 0.5844,
+ "step": 813
+ },
+ {
+ "epoch": 0.7137611662191045,
+ "grad_norm": 0.28333795883109736,
+ "learning_rate": 4.4507801614590285e-06,
+ "loss": 0.5939,
+ "step": 814
+ },
+ {
+ "epoch": 0.7146380226886612,
+ "grad_norm": 0.3089268512848077,
+ "learning_rate": 4.449299829184278e-06,
+ "loss": 0.5859,
+ "step": 815
+ },
+ {
+ "epoch": 0.7155148791582178,
+ "grad_norm": 0.2808961599877815,
+ "learning_rate": 4.447817751480516e-06,
+ "loss": 0.5871,
+ "step": 816
+ },
+ {
+ "epoch": 0.7163917356277745,
+ "grad_norm": 0.30287533725577037,
+ "learning_rate": 4.446333929674816e-06,
+ "loss": 0.593,
+ "step": 817
+ },
+ {
+ "epoch": 0.7172685920973311,
+ "grad_norm": 0.30584446638710266,
+ "learning_rate": 4.444848365095809e-06,
+ "loss": 0.5917,
+ "step": 818
+ },
+ {
+ "epoch": 0.7181454485668877,
+ "grad_norm": 0.27241453105670504,
+ "learning_rate": 4.44336105907369e-06,
+ "loss": 0.5896,
+ "step": 819
+ },
+ {
+ "epoch": 0.7190223050364444,
+ "grad_norm": 0.36474064413319707,
+ "learning_rate": 4.4418720129402145e-06,
+ "loss": 0.5861,
+ "step": 820
+ },
+ {
+ "epoch": 0.719899161506001,
+ "grad_norm": 0.2832577542195539,
+ "learning_rate": 4.4403812280286915e-06,
+ "loss": 0.5905,
+ "step": 821
+ },
+ {
+ "epoch": 0.7207760179755577,
+ "grad_norm": 0.32117553322486775,
+ "learning_rate": 4.4388887056739926e-06,
+ "loss": 0.5801,
+ "step": 822
+ },
+ {
+ "epoch": 0.7216528744451143,
+ "grad_norm": 0.27537463782509236,
+ "learning_rate": 4.43739444721254e-06,
+ "loss": 0.587,
+ "step": 823
+ },
+ {
+ "epoch": 0.7225297309146709,
+ "grad_norm": 0.3274304411602489,
+ "learning_rate": 4.435898453982313e-06,
+ "loss": 0.6024,
+ "step": 824
+ },
+ {
+ "epoch": 0.7234065873842276,
+ "grad_norm": 0.3232032167824163,
+ "learning_rate": 4.434400727322844e-06,
+ "loss": 0.6145,
+ "step": 825
+ },
+ {
+ "epoch": 0.7242834438537842,
+ "grad_norm": 0.3431783037261662,
+ "learning_rate": 4.432901268575218e-06,
+ "loss": 0.5937,
+ "step": 826
+ },
+ {
+ "epoch": 0.7251603003233408,
+ "grad_norm": 0.30897032551229503,
+ "learning_rate": 4.43140007908207e-06,
+ "loss": 0.598,
+ "step": 827
+ },
+ {
+ "epoch": 0.7260371567928975,
+ "grad_norm": 0.2934772547759602,
+ "learning_rate": 4.429897160187584e-06,
+ "loss": 0.5918,
+ "step": 828
+ },
+ {
+ "epoch": 0.7269140132624541,
+ "grad_norm": 0.31389790755569874,
+ "learning_rate": 4.4283925132374946e-06,
+ "loss": 0.5832,
+ "step": 829
+ },
+ {
+ "epoch": 0.7277908697320108,
+ "grad_norm": 0.29548260652561004,
+ "learning_rate": 4.426886139579083e-06,
+ "loss": 0.5937,
+ "step": 830
+ },
+ {
+ "epoch": 0.7286677262015674,
+ "grad_norm": 0.3162599265610075,
+ "learning_rate": 4.425378040561175e-06,
+ "loss": 0.5889,
+ "step": 831
+ },
+ {
+ "epoch": 0.729544582671124,
+ "grad_norm": 0.3057143041654656,
+ "learning_rate": 4.423868217534144e-06,
+ "loss": 0.5848,
+ "step": 832
+ },
+ {
+ "epoch": 0.7304214391406807,
+ "grad_norm": 0.29540394945672244,
+ "learning_rate": 4.4223566718499055e-06,
+ "loss": 0.5926,
+ "step": 833
+ },
+ {
+ "epoch": 0.7312982956102373,
+ "grad_norm": 0.30681513325771914,
+ "learning_rate": 4.420843404861917e-06,
+ "loss": 0.5838,
+ "step": 834
+ },
+ {
+ "epoch": 0.7321751520797939,
+ "grad_norm": 0.29780757398255076,
+ "learning_rate": 4.419328417925177e-06,
+ "loss": 0.5922,
+ "step": 835
+ },
+ {
+ "epoch": 0.7330520085493506,
+ "grad_norm": 0.28283439818927025,
+ "learning_rate": 4.417811712396226e-06,
+ "loss": 0.5875,
+ "step": 836
+ },
+ {
+ "epoch": 0.7339288650189072,
+ "grad_norm": 0.30029201304931724,
+ "learning_rate": 4.416293289633144e-06,
+ "loss": 0.5989,
+ "step": 837
+ },
+ {
+ "epoch": 0.7348057214884639,
+ "grad_norm": 0.29188774973524867,
+ "learning_rate": 4.414773150995543e-06,
+ "loss": 0.5878,
+ "step": 838
+ },
+ {
+ "epoch": 0.7356825779580205,
+ "grad_norm": 0.3037257039566602,
+ "learning_rate": 4.413251297844579e-06,
+ "loss": 0.5849,
+ "step": 839
+ },
+ {
+ "epoch": 0.7365594344275771,
+ "grad_norm": 0.31802355671271254,
+ "learning_rate": 4.411727731542937e-06,
+ "loss": 0.5873,
+ "step": 840
+ },
+ {
+ "epoch": 0.7374362908971338,
+ "grad_norm": 0.31892860544931334,
+ "learning_rate": 4.410202453454841e-06,
+ "loss": 0.5784,
+ "step": 841
+ },
+ {
+ "epoch": 0.7383131473666904,
+ "grad_norm": 0.31731371407494563,
+ "learning_rate": 4.408675464946043e-06,
+ "loss": 0.5973,
+ "step": 842
+ },
+ {
+ "epoch": 0.739190003836247,
+ "grad_norm": 0.2807004884396655,
+ "learning_rate": 4.40714676738383e-06,
+ "loss": 0.5842,
+ "step": 843
+ },
+ {
+ "epoch": 0.7400668603058037,
+ "grad_norm": 0.3102700515568577,
+ "learning_rate": 4.405616362137017e-06,
+ "loss": 0.584,
+ "step": 844
+ },
+ {
+ "epoch": 0.7409437167753603,
+ "grad_norm": 0.28221217756766914,
+ "learning_rate": 4.404084250575952e-06,
+ "loss": 0.599,
+ "step": 845
+ },
+ {
+ "epoch": 0.741820573244917,
+ "grad_norm": 0.284085524365953,
+ "learning_rate": 4.4025504340725056e-06,
+ "loss": 0.5799,
+ "step": 846
+ },
+ {
+ "epoch": 0.7426974297144736,
+ "grad_norm": 0.35367792241463614,
+ "learning_rate": 4.401014914000078e-06,
+ "loss": 0.5724,
+ "step": 847
+ },
+ {
+ "epoch": 0.7435742861840302,
+ "grad_norm": 0.26695572041406385,
+ "learning_rate": 4.3994776917335945e-06,
+ "loss": 0.5864,
+ "step": 848
+ },
+ {
+ "epoch": 0.7444511426535869,
+ "grad_norm": 0.3230503614090004,
+ "learning_rate": 4.397938768649505e-06,
+ "loss": 0.5781,
+ "step": 849
+ },
+ {
+ "epoch": 0.7453279991231435,
+ "grad_norm": 0.32670313161244324,
+ "learning_rate": 4.39639814612578e-06,
+ "loss": 0.5921,
+ "step": 850
+ },
+ {
+ "epoch": 0.7462048555927002,
+ "grad_norm": 0.2965265275169285,
+ "learning_rate": 4.394855825541915e-06,
+ "loss": 0.5847,
+ "step": 851
+ },
+ {
+ "epoch": 0.7470817120622568,
+ "grad_norm": 0.3364787473225747,
+ "learning_rate": 4.393311808278924e-06,
+ "loss": 0.6032,
+ "step": 852
+ },
+ {
+ "epoch": 0.7479585685318134,
+ "grad_norm": 0.2925797984612242,
+ "learning_rate": 4.391766095719341e-06,
+ "loss": 0.5966,
+ "step": 853
+ },
+ {
+ "epoch": 0.7488354250013701,
+ "grad_norm": 0.36558987387215064,
+ "learning_rate": 4.390218689247216e-06,
+ "loss": 0.5965,
+ "step": 854
+ },
+ {
+ "epoch": 0.7497122814709267,
+ "grad_norm": 0.31214927998435166,
+ "learning_rate": 4.388669590248119e-06,
+ "loss": 0.5799,
+ "step": 855
+ },
+ {
+ "epoch": 0.7505891379404833,
+ "grad_norm": 0.36912682982458045,
+ "learning_rate": 4.387118800109133e-06,
+ "loss": 0.5994,
+ "step": 856
+ },
+ {
+ "epoch": 0.75146599441004,
+ "grad_norm": 0.33858825867324854,
+ "learning_rate": 4.385566320218857e-06,
+ "loss": 0.5894,
+ "step": 857
+ },
+ {
+ "epoch": 0.7523428508795966,
+ "grad_norm": 0.3095865037795698,
+ "learning_rate": 4.384012151967401e-06,
+ "loss": 0.5808,
+ "step": 858
+ },
+ {
+ "epoch": 0.7532197073491533,
+ "grad_norm": 0.3163720033341599,
+ "learning_rate": 4.382456296746389e-06,
+ "loss": 0.61,
+ "step": 859
+ },
+ {
+ "epoch": 0.7540965638187099,
+ "grad_norm": 0.30746322298068,
+ "learning_rate": 4.3808987559489536e-06,
+ "loss": 0.5901,
+ "step": 860
+ },
+ {
+ "epoch": 0.7549734202882665,
+ "grad_norm": 0.3216332568956709,
+ "learning_rate": 4.379339530969738e-06,
+ "loss": 0.5824,
+ "step": 861
+ },
+ {
+ "epoch": 0.7558502767578232,
+ "grad_norm": 0.2924396456503393,
+ "learning_rate": 4.377778623204894e-06,
+ "loss": 0.587,
+ "step": 862
+ },
+ {
+ "epoch": 0.7567271332273798,
+ "grad_norm": 0.3102518126275497,
+ "learning_rate": 4.3762160340520765e-06,
+ "loss": 0.5722,
+ "step": 863
+ },
+ {
+ "epoch": 0.7576039896969364,
+ "grad_norm": 0.29990520801248277,
+ "learning_rate": 4.374651764910452e-06,
+ "loss": 0.5867,
+ "step": 864
+ },
+ {
+ "epoch": 0.7584808461664931,
+ "grad_norm": 0.2742400854190758,
+ "learning_rate": 4.373085817180684e-06,
+ "loss": 0.5897,
+ "step": 865
+ },
+ {
+ "epoch": 0.7593577026360497,
+ "grad_norm": 0.2966143324054175,
+ "learning_rate": 4.371518192264946e-06,
+ "loss": 0.593,
+ "step": 866
+ },
+ {
+ "epoch": 0.7602345591056064,
+ "grad_norm": 0.2659050257990803,
+ "learning_rate": 4.3699488915669106e-06,
+ "loss": 0.5933,
+ "step": 867
+ },
+ {
+ "epoch": 0.761111415575163,
+ "grad_norm": 0.28333909213084835,
+ "learning_rate": 4.368377916491749e-06,
+ "loss": 0.5937,
+ "step": 868
+ },
+ {
+ "epoch": 0.7619882720447196,
+ "grad_norm": 0.294367790561846,
+ "learning_rate": 4.366805268446132e-06,
+ "loss": 0.5908,
+ "step": 869
+ },
+ {
+ "epoch": 0.7628651285142763,
+ "grad_norm": 0.2892104769841804,
+ "learning_rate": 4.365230948838232e-06,
+ "loss": 0.5749,
+ "step": 870
+ },
+ {
+ "epoch": 0.7637419849838329,
+ "grad_norm": 0.2992157610185369,
+ "learning_rate": 4.3636549590777144e-06,
+ "loss": 0.6038,
+ "step": 871
+ },
+ {
+ "epoch": 0.7646188414533897,
+ "grad_norm": 0.2849149162166013,
+ "learning_rate": 4.362077300575742e-06,
+ "loss": 0.5838,
+ "step": 872
+ },
+ {
+ "epoch": 0.7654956979229462,
+ "grad_norm": 0.27419838720395556,
+ "learning_rate": 4.360497974744971e-06,
+ "loss": 0.5792,
+ "step": 873
+ },
+ {
+ "epoch": 0.7663725543925028,
+ "grad_norm": 0.2719357502719954,
+ "learning_rate": 4.35891698299955e-06,
+ "loss": 0.5879,
+ "step": 874
+ },
+ {
+ "epoch": 0.7672494108620596,
+ "grad_norm": 0.29276621658420166,
+ "learning_rate": 4.357334326755123e-06,
+ "loss": 0.5903,
+ "step": 875
+ },
+ {
+ "epoch": 0.7681262673316162,
+ "grad_norm": 0.29234711934765684,
+ "learning_rate": 4.3557500074288175e-06,
+ "loss": 0.58,
+ "step": 876
+ },
+ {
+ "epoch": 0.7690031238011727,
+ "grad_norm": 0.2900743371372321,
+ "learning_rate": 4.354164026439256e-06,
+ "loss": 0.5798,
+ "step": 877
+ },
+ {
+ "epoch": 0.7698799802707295,
+ "grad_norm": 0.26606697197934875,
+ "learning_rate": 4.352576385206547e-06,
+ "loss": 0.6049,
+ "step": 878
+ },
+ {
+ "epoch": 0.770756836740286,
+ "grad_norm": 0.30681607920100556,
+ "learning_rate": 4.350987085152286e-06,
+ "loss": 0.5963,
+ "step": 879
+ },
+ {
+ "epoch": 0.7716336932098428,
+ "grad_norm": 0.28024451945836265,
+ "learning_rate": 4.349396127699552e-06,
+ "loss": 0.6063,
+ "step": 880
+ },
+ {
+ "epoch": 0.7725105496793994,
+ "grad_norm": 0.284435176139814,
+ "learning_rate": 4.347803514272911e-06,
+ "loss": 0.5847,
+ "step": 881
+ },
+ {
+ "epoch": 0.773387406148956,
+ "grad_norm": 0.2787875052171573,
+ "learning_rate": 4.34620924629841e-06,
+ "loss": 0.5909,
+ "step": 882
+ },
+ {
+ "epoch": 0.7742642626185127,
+ "grad_norm": 0.28222554386796406,
+ "learning_rate": 4.344613325203577e-06,
+ "loss": 0.5815,
+ "step": 883
+ },
+ {
+ "epoch": 0.7751411190880693,
+ "grad_norm": 0.30850175508825417,
+ "learning_rate": 4.343015752417421e-06,
+ "loss": 0.5761,
+ "step": 884
+ },
+ {
+ "epoch": 0.7760179755576259,
+ "grad_norm": 0.27711497578948074,
+ "learning_rate": 4.341416529370431e-06,
+ "loss": 0.5851,
+ "step": 885
+ },
+ {
+ "epoch": 0.7768948320271826,
+ "grad_norm": 0.2945928621135004,
+ "learning_rate": 4.339815657494571e-06,
+ "loss": 0.5922,
+ "step": 886
+ },
+ {
+ "epoch": 0.7777716884967392,
+ "grad_norm": 0.2843169638684151,
+ "learning_rate": 4.338213138223285e-06,
+ "loss": 0.5835,
+ "step": 887
+ },
+ {
+ "epoch": 0.7786485449662959,
+ "grad_norm": 0.2840612846899258,
+ "learning_rate": 4.336608972991489e-06,
+ "loss": 0.596,
+ "step": 888
+ },
+ {
+ "epoch": 0.7795254014358525,
+ "grad_norm": 0.2677194609487142,
+ "learning_rate": 4.335003163235574e-06,
+ "loss": 0.5794,
+ "step": 889
+ },
+ {
+ "epoch": 0.7804022579054091,
+ "grad_norm": 0.31211329913480695,
+ "learning_rate": 4.3333957103934025e-06,
+ "loss": 0.5765,
+ "step": 890
+ },
+ {
+ "epoch": 0.7812791143749658,
+ "grad_norm": 0.28583623636409483,
+ "learning_rate": 4.33178661590431e-06,
+ "loss": 0.6016,
+ "step": 891
+ },
+ {
+ "epoch": 0.7821559708445224,
+ "grad_norm": 0.31500304190137224,
+ "learning_rate": 4.330175881209102e-06,
+ "loss": 0.5877,
+ "step": 892
+ },
+ {
+ "epoch": 0.783032827314079,
+ "grad_norm": 0.2811796495740926,
+ "learning_rate": 4.32856350775005e-06,
+ "loss": 0.5881,
+ "step": 893
+ },
+ {
+ "epoch": 0.7839096837836357,
+ "grad_norm": 0.29273259848443445,
+ "learning_rate": 4.3269494969708954e-06,
+ "loss": 0.5921,
+ "step": 894
+ },
+ {
+ "epoch": 0.7847865402531923,
+ "grad_norm": 0.27373150864211443,
+ "learning_rate": 4.325333850316846e-06,
+ "loss": 0.6,
+ "step": 895
+ },
+ {
+ "epoch": 0.785663396722749,
+ "grad_norm": 0.3128309122282222,
+ "learning_rate": 4.323716569234572e-06,
+ "loss": 0.5904,
+ "step": 896
+ },
+ {
+ "epoch": 0.7865402531923056,
+ "grad_norm": 0.2825745062634813,
+ "learning_rate": 4.32209765517221e-06,
+ "loss": 0.5816,
+ "step": 897
+ },
+ {
+ "epoch": 0.7874171096618622,
+ "grad_norm": 0.3282727674741808,
+ "learning_rate": 4.320477109579354e-06,
+ "loss": 0.5882,
+ "step": 898
+ },
+ {
+ "epoch": 0.7882939661314189,
+ "grad_norm": 0.2940095641373108,
+ "learning_rate": 4.318854933907065e-06,
+ "loss": 0.5985,
+ "step": 899
+ },
+ {
+ "epoch": 0.7891708226009755,
+ "grad_norm": 0.31182474508449737,
+ "learning_rate": 4.317231129607859e-06,
+ "loss": 0.5843,
+ "step": 900
+ },
+ {
+ "epoch": 0.7900476790705322,
+ "grad_norm": 0.26489892008261595,
+ "learning_rate": 4.315605698135714e-06,
+ "loss": 0.591,
+ "step": 901
+ },
+ {
+ "epoch": 0.7909245355400888,
+ "grad_norm": 0.32933790566988397,
+ "learning_rate": 4.313978640946061e-06,
+ "loss": 0.5826,
+ "step": 902
+ },
+ {
+ "epoch": 0.7918013920096454,
+ "grad_norm": 0.2790564068544957,
+ "learning_rate": 4.312349959495791e-06,
+ "loss": 0.5897,
+ "step": 903
+ },
+ {
+ "epoch": 0.7926782484792021,
+ "grad_norm": 0.29278849432785253,
+ "learning_rate": 4.310719655243243e-06,
+ "loss": 0.5929,
+ "step": 904
+ },
+ {
+ "epoch": 0.7935551049487587,
+ "grad_norm": 0.2898094197798441,
+ "learning_rate": 4.309087729648217e-06,
+ "loss": 0.575,
+ "step": 905
+ },
+ {
+ "epoch": 0.7944319614183153,
+ "grad_norm": 0.2962974584908221,
+ "learning_rate": 4.30745418417196e-06,
+ "loss": 0.5874,
+ "step": 906
+ },
+ {
+ "epoch": 0.795308817887872,
+ "grad_norm": 0.2894965323690623,
+ "learning_rate": 4.305819020277169e-06,
+ "loss": 0.5769,
+ "step": 907
+ },
+ {
+ "epoch": 0.7961856743574286,
+ "grad_norm": 0.2744231484838131,
+ "learning_rate": 4.304182239427992e-06,
+ "loss": 0.5943,
+ "step": 908
+ },
+ {
+ "epoch": 0.7970625308269853,
+ "grad_norm": 0.2766245048172803,
+ "learning_rate": 4.302543843090026e-06,
+ "loss": 0.5814,
+ "step": 909
+ },
+ {
+ "epoch": 0.7979393872965419,
+ "grad_norm": 0.2842673020480384,
+ "learning_rate": 4.30090383273031e-06,
+ "loss": 0.5912,
+ "step": 910
+ },
+ {
+ "epoch": 0.7988162437660985,
+ "grad_norm": 0.28199584242917014,
+ "learning_rate": 4.2992622098173335e-06,
+ "loss": 0.5809,
+ "step": 911
+ },
+ {
+ "epoch": 0.7996931002356552,
+ "grad_norm": 0.2820675876804688,
+ "learning_rate": 4.297618975821027e-06,
+ "loss": 0.5917,
+ "step": 912
+ },
+ {
+ "epoch": 0.8005699567052118,
+ "grad_norm": 0.2728605500328137,
+ "learning_rate": 4.2959741322127635e-06,
+ "loss": 0.5764,
+ "step": 913
+ },
+ {
+ "epoch": 0.8014468131747684,
+ "grad_norm": 0.27169399222059704,
+ "learning_rate": 4.294327680465358e-06,
+ "loss": 0.5849,
+ "step": 914
+ },
+ {
+ "epoch": 0.8023236696443251,
+ "grad_norm": 0.28063665744680427,
+ "learning_rate": 4.292679622053066e-06,
+ "loss": 0.58,
+ "step": 915
+ },
+ {
+ "epoch": 0.8032005261138817,
+ "grad_norm": 0.25926421536726935,
+ "learning_rate": 4.29102995845158e-06,
+ "loss": 0.5787,
+ "step": 916
+ },
+ {
+ "epoch": 0.8040773825834384,
+ "grad_norm": 0.29001417666592577,
+ "learning_rate": 4.289378691138032e-06,
+ "loss": 0.5868,
+ "step": 917
+ },
+ {
+ "epoch": 0.804954239052995,
+ "grad_norm": 0.27215185007216747,
+ "learning_rate": 4.287725821590987e-06,
+ "loss": 0.5894,
+ "step": 918
+ },
+ {
+ "epoch": 0.8058310955225516,
+ "grad_norm": 0.3050881231274449,
+ "learning_rate": 4.286071351290447e-06,
+ "loss": 0.5911,
+ "step": 919
+ },
+ {
+ "epoch": 0.8067079519921083,
+ "grad_norm": 0.2873456207891206,
+ "learning_rate": 4.2844152817178476e-06,
+ "loss": 0.5835,
+ "step": 920
+ },
+ {
+ "epoch": 0.8075848084616649,
+ "grad_norm": 0.2626365139918821,
+ "learning_rate": 4.282757614356055e-06,
+ "loss": 0.5794,
+ "step": 921
+ },
+ {
+ "epoch": 0.8084616649312216,
+ "grad_norm": 0.28122583577721894,
+ "learning_rate": 4.281098350689367e-06,
+ "loss": 0.581,
+ "step": 922
+ },
+ {
+ "epoch": 0.8093385214007782,
+ "grad_norm": 0.2955727164056087,
+ "learning_rate": 4.279437492203509e-06,
+ "loss": 0.6024,
+ "step": 923
+ },
+ {
+ "epoch": 0.8102153778703348,
+ "grad_norm": 0.2928465088558078,
+ "learning_rate": 4.277775040385636e-06,
+ "loss": 0.5777,
+ "step": 924
+ },
+ {
+ "epoch": 0.8110922343398915,
+ "grad_norm": 0.279748286657514,
+ "learning_rate": 4.276110996724332e-06,
+ "loss": 0.5983,
+ "step": 925
+ },
+ {
+ "epoch": 0.8119690908094481,
+ "grad_norm": 0.3064104243975942,
+ "learning_rate": 4.274445362709602e-06,
+ "loss": 0.5959,
+ "step": 926
+ },
+ {
+ "epoch": 0.8128459472790047,
+ "grad_norm": 0.2705400124701495,
+ "learning_rate": 4.272778139832876e-06,
+ "loss": 0.5964,
+ "step": 927
+ },
+ {
+ "epoch": 0.8137228037485614,
+ "grad_norm": 0.3030828027995252,
+ "learning_rate": 4.271109329587009e-06,
+ "loss": 0.5784,
+ "step": 928
+ },
+ {
+ "epoch": 0.814599660218118,
+ "grad_norm": 0.2629159770264448,
+ "learning_rate": 4.2694389334662745e-06,
+ "loss": 0.5845,
+ "step": 929
+ },
+ {
+ "epoch": 0.8154765166876747,
+ "grad_norm": 0.3351422353981342,
+ "learning_rate": 4.267766952966369e-06,
+ "loss": 0.5949,
+ "step": 930
+ },
+ {
+ "epoch": 0.8163533731572313,
+ "grad_norm": 0.2760441532769009,
+ "learning_rate": 4.2660933895844055e-06,
+ "loss": 0.5904,
+ "step": 931
+ },
+ {
+ "epoch": 0.8172302296267879,
+ "grad_norm": 0.30558832310943446,
+ "learning_rate": 4.264418244818914e-06,
+ "loss": 0.5839,
+ "step": 932
+ },
+ {
+ "epoch": 0.8181070860963446,
+ "grad_norm": 0.28070458613560756,
+ "learning_rate": 4.262741520169844e-06,
+ "loss": 0.5791,
+ "step": 933
+ },
+ {
+ "epoch": 0.8189839425659012,
+ "grad_norm": 0.2735766456330096,
+ "learning_rate": 4.261063217138554e-06,
+ "loss": 0.5836,
+ "step": 934
+ },
+ {
+ "epoch": 0.8198607990354578,
+ "grad_norm": 0.3038178849716158,
+ "learning_rate": 4.259383337227821e-06,
+ "loss": 0.5885,
+ "step": 935
+ },
+ {
+ "epoch": 0.8207376555050145,
+ "grad_norm": 0.26590487432268695,
+ "learning_rate": 4.25770188194183e-06,
+ "loss": 0.6035,
+ "step": 936
+ },
+ {
+ "epoch": 0.8216145119745711,
+ "grad_norm": 0.31271672720672494,
+ "learning_rate": 4.25601885278618e-06,
+ "loss": 0.5926,
+ "step": 937
+ },
+ {
+ "epoch": 0.8224913684441278,
+ "grad_norm": 0.26261561071530615,
+ "learning_rate": 4.254334251267877e-06,
+ "loss": 0.5996,
+ "step": 938
+ },
+ {
+ "epoch": 0.8233682249136844,
+ "grad_norm": 0.2891665251939073,
+ "learning_rate": 4.252648078895336e-06,
+ "loss": 0.5876,
+ "step": 939
+ },
+ {
+ "epoch": 0.824245081383241,
+ "grad_norm": 0.2897735311167941,
+ "learning_rate": 4.2509603371783776e-06,
+ "loss": 0.5892,
+ "step": 940
+ },
+ {
+ "epoch": 0.8251219378527977,
+ "grad_norm": 0.28026024666883764,
+ "learning_rate": 4.249271027628228e-06,
+ "loss": 0.587,
+ "step": 941
+ },
+ {
+ "epoch": 0.8259987943223543,
+ "grad_norm": 0.2765283292737123,
+ "learning_rate": 4.24758015175752e-06,
+ "loss": 0.5769,
+ "step": 942
+ },
+ {
+ "epoch": 0.826875650791911,
+ "grad_norm": 0.2921232680301083,
+ "learning_rate": 4.245887711080283e-06,
+ "loss": 0.5854,
+ "step": 943
+ },
+ {
+ "epoch": 0.8277525072614677,
+ "grad_norm": 0.3005072830624817,
+ "learning_rate": 4.2441937071119524e-06,
+ "loss": 0.5802,
+ "step": 944
+ },
+ {
+ "epoch": 0.8286293637310242,
+ "grad_norm": 0.27059131939602343,
+ "learning_rate": 4.242498141369361e-06,
+ "loss": 0.5837,
+ "step": 945
+ },
+ {
+ "epoch": 0.829506220200581,
+ "grad_norm": 0.3038588097565146,
+ "learning_rate": 4.240801015370743e-06,
+ "loss": 0.5869,
+ "step": 946
+ },
+ {
+ "epoch": 0.8303830766701376,
+ "grad_norm": 0.31875741653821127,
+ "learning_rate": 4.239102330635726e-06,
+ "loss": 0.5836,
+ "step": 947
+ },
+ {
+ "epoch": 0.8312599331396942,
+ "grad_norm": 0.26475770270890336,
+ "learning_rate": 4.2374020886853354e-06,
+ "loss": 0.5796,
+ "step": 948
+ },
+ {
+ "epoch": 0.8321367896092509,
+ "grad_norm": 0.31635648581412845,
+ "learning_rate": 4.235700291041989e-06,
+ "loss": 0.5732,
+ "step": 949
+ },
+ {
+ "epoch": 0.8330136460788075,
+ "grad_norm": 0.27123635854757305,
+ "learning_rate": 4.233996939229502e-06,
+ "loss": 0.5977,
+ "step": 950
+ },
+ {
+ "epoch": 0.8338905025483642,
+ "grad_norm": 0.3356358824197267,
+ "learning_rate": 4.232292034773076e-06,
+ "loss": 0.5871,
+ "step": 951
+ },
+ {
+ "epoch": 0.8347673590179208,
+ "grad_norm": 0.2723531290949244,
+ "learning_rate": 4.230585579199306e-06,
+ "loss": 0.5916,
+ "step": 952
+ },
+ {
+ "epoch": 0.8356442154874774,
+ "grad_norm": 0.2975424730057694,
+ "learning_rate": 4.228877574036175e-06,
+ "loss": 0.592,
+ "step": 953
+ },
+ {
+ "epoch": 0.8365210719570341,
+ "grad_norm": 0.28108527975014536,
+ "learning_rate": 4.227168020813053e-06,
+ "loss": 0.5788,
+ "step": 954
+ },
+ {
+ "epoch": 0.8373979284265907,
+ "grad_norm": 0.26358656072328285,
+ "learning_rate": 4.225456921060698e-06,
+ "loss": 0.5728,
+ "step": 955
+ },
+ {
+ "epoch": 0.8382747848961473,
+ "grad_norm": 0.2793044648839571,
+ "learning_rate": 4.223744276311249e-06,
+ "loss": 0.5714,
+ "step": 956
+ },
+ {
+ "epoch": 0.839151641365704,
+ "grad_norm": 0.30214577120239683,
+ "learning_rate": 4.222030088098233e-06,
+ "loss": 0.5993,
+ "step": 957
+ },
+ {
+ "epoch": 0.8400284978352606,
+ "grad_norm": 0.2639515397393347,
+ "learning_rate": 4.220314357956557e-06,
+ "loss": 0.5994,
+ "step": 958
+ },
+ {
+ "epoch": 0.8409053543048173,
+ "grad_norm": 0.3298154347341819,
+ "learning_rate": 4.218597087422508e-06,
+ "loss": 0.5877,
+ "step": 959
+ },
+ {
+ "epoch": 0.8417822107743739,
+ "grad_norm": 0.28203599665081885,
+ "learning_rate": 4.216878278033753e-06,
+ "loss": 0.5865,
+ "step": 960
+ },
+ {
+ "epoch": 0.8426590672439305,
+ "grad_norm": 0.2746406409148874,
+ "learning_rate": 4.2151579313293364e-06,
+ "loss": 0.5881,
+ "step": 961
+ },
+ {
+ "epoch": 0.8435359237134872,
+ "grad_norm": 0.33875497622714734,
+ "learning_rate": 4.2134360488496804e-06,
+ "loss": 0.6029,
+ "step": 962
+ },
+ {
+ "epoch": 0.8444127801830438,
+ "grad_norm": 0.2875141188036911,
+ "learning_rate": 4.211712632136581e-06,
+ "loss": 0.5845,
+ "step": 963
+ },
+ {
+ "epoch": 0.8452896366526004,
+ "grad_norm": 0.32374197566257723,
+ "learning_rate": 4.209987682733207e-06,
+ "loss": 0.589,
+ "step": 964
+ },
+ {
+ "epoch": 0.8461664931221571,
+ "grad_norm": 0.26718900480287466,
+ "learning_rate": 4.208261202184104e-06,
+ "loss": 0.5844,
+ "step": 965
+ },
+ {
+ "epoch": 0.8470433495917137,
+ "grad_norm": 0.29759515513279916,
+ "learning_rate": 4.206533192035184e-06,
+ "loss": 0.5817,
+ "step": 966
+ },
+ {
+ "epoch": 0.8479202060612704,
+ "grad_norm": 0.28330165664862006,
+ "learning_rate": 4.20480365383373e-06,
+ "loss": 0.5853,
+ "step": 967
+ },
+ {
+ "epoch": 0.848797062530827,
+ "grad_norm": 0.26991723910735316,
+ "learning_rate": 4.203072589128394e-06,
+ "loss": 0.5847,
+ "step": 968
+ },
+ {
+ "epoch": 0.8496739190003836,
+ "grad_norm": 0.28120405866784015,
+ "learning_rate": 4.201339999469194e-06,
+ "loss": 0.5771,
+ "step": 969
+ },
+ {
+ "epoch": 0.8505507754699403,
+ "grad_norm": 0.29731566030764794,
+ "learning_rate": 4.199605886407515e-06,
+ "loss": 0.5872,
+ "step": 970
+ },
+ {
+ "epoch": 0.8514276319394969,
+ "grad_norm": 0.29823098898704575,
+ "learning_rate": 4.197870251496104e-06,
+ "loss": 0.585,
+ "step": 971
+ },
+ {
+ "epoch": 0.8523044884090536,
+ "grad_norm": 0.29246400163730035,
+ "learning_rate": 4.196133096289071e-06,
+ "loss": 0.5728,
+ "step": 972
+ },
+ {
+ "epoch": 0.8531813448786102,
+ "grad_norm": 0.31038345035918974,
+ "learning_rate": 4.194394422341888e-06,
+ "loss": 0.588,
+ "step": 973
+ },
+ {
+ "epoch": 0.8540582013481668,
+ "grad_norm": 0.29419655403066824,
+ "learning_rate": 4.192654231211389e-06,
+ "loss": 0.5802,
+ "step": 974
+ },
+ {
+ "epoch": 0.8549350578177235,
+ "grad_norm": 0.28924212129082133,
+ "learning_rate": 4.190912524455762e-06,
+ "loss": 0.5957,
+ "step": 975
+ },
+ {
+ "epoch": 0.8558119142872801,
+ "grad_norm": 0.3433724407789192,
+ "learning_rate": 4.189169303634555e-06,
+ "loss": 0.5943,
+ "step": 976
+ },
+ {
+ "epoch": 0.8566887707568367,
+ "grad_norm": 0.3447246872111939,
+ "learning_rate": 4.187424570308671e-06,
+ "loss": 0.5679,
+ "step": 977
+ },
+ {
+ "epoch": 0.8575656272263934,
+ "grad_norm": 0.2717297839127488,
+ "learning_rate": 4.185678326040369e-06,
+ "loss": 0.5839,
+ "step": 978
+ },
+ {
+ "epoch": 0.85844248369595,
+ "grad_norm": 0.3149777108439808,
+ "learning_rate": 4.1839305723932565e-06,
+ "loss": 0.5684,
+ "step": 979
+ },
+ {
+ "epoch": 0.8593193401655067,
+ "grad_norm": 0.3196280126814673,
+ "learning_rate": 4.1821813109322975e-06,
+ "loss": 0.5845,
+ "step": 980
+ },
+ {
+ "epoch": 0.8601961966350633,
+ "grad_norm": 0.3166850113740036,
+ "learning_rate": 4.180430543223803e-06,
+ "loss": 0.5722,
+ "step": 981
+ },
+ {
+ "epoch": 0.8610730531046199,
+ "grad_norm": 0.30727325041845543,
+ "learning_rate": 4.178678270835435e-06,
+ "loss": 0.582,
+ "step": 982
+ },
+ {
+ "epoch": 0.8619499095741766,
+ "grad_norm": 0.34738075452538025,
+ "learning_rate": 4.1769244953361995e-06,
+ "loss": 0.5789,
+ "step": 983
+ },
+ {
+ "epoch": 0.8628267660437332,
+ "grad_norm": 0.3029018585056203,
+ "learning_rate": 4.1751692182964524e-06,
+ "loss": 0.5906,
+ "step": 984
+ },
+ {
+ "epoch": 0.8637036225132898,
+ "grad_norm": 0.27172806950560857,
+ "learning_rate": 4.1734124412878915e-06,
+ "loss": 0.5864,
+ "step": 985
+ },
+ {
+ "epoch": 0.8645804789828465,
+ "grad_norm": 0.3078626255245488,
+ "learning_rate": 4.171654165883558e-06,
+ "loss": 0.5961,
+ "step": 986
+ },
+ {
+ "epoch": 0.8654573354524031,
+ "grad_norm": 0.28755523271585887,
+ "learning_rate": 4.169894393657834e-06,
+ "loss": 0.5881,
+ "step": 987
+ },
+ {
+ "epoch": 0.8663341919219598,
+ "grad_norm": 0.3081436303822685,
+ "learning_rate": 4.168133126186445e-06,
+ "loss": 0.5818,
+ "step": 988
+ },
+ {
+ "epoch": 0.8672110483915164,
+ "grad_norm": 0.2785218381541765,
+ "learning_rate": 4.166370365046452e-06,
+ "loss": 0.5828,
+ "step": 989
+ },
+ {
+ "epoch": 0.868087904861073,
+ "grad_norm": 0.3391784184001714,
+ "learning_rate": 4.164606111816256e-06,
+ "loss": 0.5867,
+ "step": 990
+ },
+ {
+ "epoch": 0.8689647613306297,
+ "grad_norm": 0.27636992919331915,
+ "learning_rate": 4.162840368075591e-06,
+ "loss": 0.599,
+ "step": 991
+ },
+ {
+ "epoch": 0.8698416178001863,
+ "grad_norm": 0.28517927301055196,
+ "learning_rate": 4.161073135405529e-06,
+ "loss": 0.5831,
+ "step": 992
+ },
+ {
+ "epoch": 0.870718474269743,
+ "grad_norm": 0.29490820494014364,
+ "learning_rate": 4.1593044153884745e-06,
+ "loss": 0.5757,
+ "step": 993
+ },
+ {
+ "epoch": 0.8715953307392996,
+ "grad_norm": 0.2780476402469785,
+ "learning_rate": 4.157534209608161e-06,
+ "loss": 0.5964,
+ "step": 994
+ },
+ {
+ "epoch": 0.8724721872088562,
+ "grad_norm": 0.29068689725516644,
+ "learning_rate": 4.155762519649654e-06,
+ "loss": 0.5805,
+ "step": 995
+ },
+ {
+ "epoch": 0.8733490436784129,
+ "grad_norm": 0.26095614944942314,
+ "learning_rate": 4.15398934709935e-06,
+ "loss": 0.5841,
+ "step": 996
+ },
+ {
+ "epoch": 0.8742259001479695,
+ "grad_norm": 0.31389428529448765,
+ "learning_rate": 4.1522146935449705e-06,
+ "loss": 0.5846,
+ "step": 997
+ },
+ {
+ "epoch": 0.8751027566175261,
+ "grad_norm": 0.26816106638671405,
+ "learning_rate": 4.150438560575563e-06,
+ "loss": 0.5833,
+ "step": 998
+ },
+ {
+ "epoch": 0.8759796130870828,
+ "grad_norm": 0.31604277041792156,
+ "learning_rate": 4.1486609497815025e-06,
+ "loss": 0.5888,
+ "step": 999
+ },
+ {
+ "epoch": 0.8768564695566394,
+ "grad_norm": 0.3606037237047822,
+ "learning_rate": 4.146881862754485e-06,
+ "loss": 0.5942,
+ "step": 1000
+ },
+ {
+ "epoch": 0.8777333260261961,
+ "grad_norm": 0.28543513756367406,
+ "learning_rate": 4.145101301087527e-06,
+ "loss": 0.5915,
+ "step": 1001
+ },
+ {
+ "epoch": 0.8786101824957527,
+ "grad_norm": 0.3462271962536017,
+ "learning_rate": 4.143319266374969e-06,
+ "loss": 0.5942,
+ "step": 1002
+ },
+ {
+ "epoch": 0.8794870389653093,
+ "grad_norm": 0.2833352289445499,
+ "learning_rate": 4.141535760212467e-06,
+ "loss": 0.5863,
+ "step": 1003
+ },
+ {
+ "epoch": 0.880363895434866,
+ "grad_norm": 0.35489814354695126,
+ "learning_rate": 4.139750784196998e-06,
+ "loss": 0.5924,
+ "step": 1004
+ },
+ {
+ "epoch": 0.8812407519044226,
+ "grad_norm": 0.2942335535458572,
+ "learning_rate": 4.137964339926852e-06,
+ "loss": 0.5892,
+ "step": 1005
+ },
+ {
+ "epoch": 0.8821176083739792,
+ "grad_norm": 0.32828822885224784,
+ "learning_rate": 4.136176429001634e-06,
+ "loss": 0.5909,
+ "step": 1006
+ },
+ {
+ "epoch": 0.8829944648435359,
+ "grad_norm": 0.3123727759868493,
+ "learning_rate": 4.134387053022266e-06,
+ "loss": 0.5845,
+ "step": 1007
+ },
+ {
+ "epoch": 0.8838713213130925,
+ "grad_norm": 0.2862421766790686,
+ "learning_rate": 4.132596213590977e-06,
+ "loss": 0.5848,
+ "step": 1008
+ },
+ {
+ "epoch": 0.8847481777826492,
+ "grad_norm": 0.32232750817039807,
+ "learning_rate": 4.1308039123113084e-06,
+ "loss": 0.5869,
+ "step": 1009
+ },
+ {
+ "epoch": 0.8856250342522058,
+ "grad_norm": 0.28776404090006724,
+ "learning_rate": 4.129010150788112e-06,
+ "loss": 0.5992,
+ "step": 1010
+ },
+ {
+ "epoch": 0.8865018907217624,
+ "grad_norm": 0.3257967217812331,
+ "learning_rate": 4.127214930627545e-06,
+ "loss": 0.5828,
+ "step": 1011
+ },
+ {
+ "epoch": 0.8873787471913192,
+ "grad_norm": 0.3065300730664574,
+ "learning_rate": 4.125418253437071e-06,
+ "loss": 0.578,
+ "step": 1012
+ },
+ {
+ "epoch": 0.8882556036608757,
+ "grad_norm": 0.29218143100925903,
+ "learning_rate": 4.123620120825459e-06,
+ "loss": 0.5939,
+ "step": 1013
+ },
+ {
+ "epoch": 0.8891324601304323,
+ "grad_norm": 0.28565794045128473,
+ "learning_rate": 4.121820534402781e-06,
+ "loss": 0.5868,
+ "step": 1014
+ },
+ {
+ "epoch": 0.890009316599989,
+ "grad_norm": 0.30898296228273797,
+ "learning_rate": 4.120019495780412e-06,
+ "loss": 0.582,
+ "step": 1015
+ },
+ {
+ "epoch": 0.8908861730695457,
+ "grad_norm": 0.2911662733325922,
+ "learning_rate": 4.118217006571023e-06,
+ "loss": 0.5923,
+ "step": 1016
+ },
+ {
+ "epoch": 0.8917630295391024,
+ "grad_norm": 0.2843342810887561,
+ "learning_rate": 4.116413068388589e-06,
+ "loss": 0.5754,
+ "step": 1017
+ },
+ {
+ "epoch": 0.892639886008659,
+ "grad_norm": 0.334401955522752,
+ "learning_rate": 4.11460768284838e-06,
+ "loss": 0.5895,
+ "step": 1018
+ },
+ {
+ "epoch": 0.8935167424782156,
+ "grad_norm": 0.2600873368987441,
+ "learning_rate": 4.11280085156696e-06,
+ "loss": 0.5858,
+ "step": 1019
+ },
+ {
+ "epoch": 0.8943935989477723,
+ "grad_norm": 0.3051388251322737,
+ "learning_rate": 4.110992576162193e-06,
+ "loss": 0.5861,
+ "step": 1020
+ },
+ {
+ "epoch": 0.8952704554173289,
+ "grad_norm": 0.30230682759222505,
+ "learning_rate": 4.109182858253231e-06,
+ "loss": 0.5857,
+ "step": 1021
+ },
+ {
+ "epoch": 0.8961473118868856,
+ "grad_norm": 0.27145584987414345,
+ "learning_rate": 4.107371699460521e-06,
+ "loss": 0.5827,
+ "step": 1022
+ },
+ {
+ "epoch": 0.8970241683564422,
+ "grad_norm": 0.2886096599363367,
+ "learning_rate": 4.1055591014057964e-06,
+ "loss": 0.5732,
+ "step": 1023
+ },
+ {
+ "epoch": 0.8979010248259988,
+ "grad_norm": 0.2643618798342576,
+ "learning_rate": 4.103745065712083e-06,
+ "loss": 0.581,
+ "step": 1024
+ },
+ {
+ "epoch": 0.8987778812955555,
+ "grad_norm": 0.27612674007258925,
+ "learning_rate": 4.101929594003694e-06,
+ "loss": 0.5774,
+ "step": 1025
+ },
+ {
+ "epoch": 0.8996547377651121,
+ "grad_norm": 0.2694404941538916,
+ "learning_rate": 4.100112687906224e-06,
+ "loss": 0.5792,
+ "step": 1026
+ },
+ {
+ "epoch": 0.9005315942346687,
+ "grad_norm": 0.26812897420311116,
+ "learning_rate": 4.098294349046556e-06,
+ "loss": 0.5945,
+ "step": 1027
+ },
+ {
+ "epoch": 0.9014084507042254,
+ "grad_norm": 0.2744007605554886,
+ "learning_rate": 4.0964745790528564e-06,
+ "loss": 0.5712,
+ "step": 1028
+ },
+ {
+ "epoch": 0.902285307173782,
+ "grad_norm": 0.2614641549143825,
+ "learning_rate": 4.09465337955457e-06,
+ "loss": 0.5756,
+ "step": 1029
+ },
+ {
+ "epoch": 0.9031621636433387,
+ "grad_norm": 0.25643605179903173,
+ "learning_rate": 4.092830752182423e-06,
+ "loss": 0.593,
+ "step": 1030
+ },
+ {
+ "epoch": 0.9040390201128953,
+ "grad_norm": 0.26698048225450505,
+ "learning_rate": 4.091006698568419e-06,
+ "loss": 0.5877,
+ "step": 1031
+ },
+ {
+ "epoch": 0.9049158765824519,
+ "grad_norm": 0.2655671129093472,
+ "learning_rate": 4.0891812203458425e-06,
+ "loss": 0.5701,
+ "step": 1032
+ },
+ {
+ "epoch": 0.9057927330520086,
+ "grad_norm": 0.2706223562384906,
+ "learning_rate": 4.08735431914925e-06,
+ "loss": 0.5818,
+ "step": 1033
+ },
+ {
+ "epoch": 0.9066695895215652,
+ "grad_norm": 0.26684323937974636,
+ "learning_rate": 4.085525996614472e-06,
+ "loss": 0.5878,
+ "step": 1034
+ },
+ {
+ "epoch": 0.9075464459911218,
+ "grad_norm": 0.24564951471442678,
+ "learning_rate": 4.083696254378615e-06,
+ "loss": 0.5967,
+ "step": 1035
+ },
+ {
+ "epoch": 0.9084233024606785,
+ "grad_norm": 0.2761933648093443,
+ "learning_rate": 4.081865094080053e-06,
+ "loss": 0.576,
+ "step": 1036
+ },
+ {
+ "epoch": 0.9093001589302351,
+ "grad_norm": 0.2722027493749199,
+ "learning_rate": 4.080032517358431e-06,
+ "loss": 0.579,
+ "step": 1037
+ },
+ {
+ "epoch": 0.9101770153997918,
+ "grad_norm": 0.5039307385586534,
+ "learning_rate": 4.078198525854664e-06,
+ "loss": 0.5943,
+ "step": 1038
+ },
+ {
+ "epoch": 0.9110538718693484,
+ "grad_norm": 0.26519176650439175,
+ "learning_rate": 4.0763631212109315e-06,
+ "loss": 0.5893,
+ "step": 1039
+ },
+ {
+ "epoch": 0.911930728338905,
+ "grad_norm": 0.2644411261920598,
+ "learning_rate": 4.074526305070679e-06,
+ "loss": 0.5791,
+ "step": 1040
+ },
+ {
+ "epoch": 0.9128075848084617,
+ "grad_norm": 0.27917354228958563,
+ "learning_rate": 4.072688079078616e-06,
+ "loss": 0.5847,
+ "step": 1041
+ },
+ {
+ "epoch": 0.9136844412780183,
+ "grad_norm": 0.27274252297201695,
+ "learning_rate": 4.070848444880716e-06,
+ "loss": 0.5695,
+ "step": 1042
+ },
+ {
+ "epoch": 0.914561297747575,
+ "grad_norm": 0.26541238057197397,
+ "learning_rate": 4.06900740412421e-06,
+ "loss": 0.5858,
+ "step": 1043
+ },
+ {
+ "epoch": 0.9154381542171316,
+ "grad_norm": 0.2687466193673103,
+ "learning_rate": 4.0671649584575925e-06,
+ "loss": 0.5832,
+ "step": 1044
+ },
+ {
+ "epoch": 0.9163150106866882,
+ "grad_norm": 0.27584447196087264,
+ "learning_rate": 4.065321109530612e-06,
+ "loss": 0.5828,
+ "step": 1045
+ },
+ {
+ "epoch": 0.9171918671562449,
+ "grad_norm": 0.27618254494046185,
+ "learning_rate": 4.063475858994276e-06,
+ "loss": 0.5829,
+ "step": 1046
+ },
+ {
+ "epoch": 0.9180687236258015,
+ "grad_norm": 0.2800627797716068,
+ "learning_rate": 4.061629208500847e-06,
+ "loss": 0.5813,
+ "step": 1047
+ },
+ {
+ "epoch": 0.9189455800953581,
+ "grad_norm": 0.2731973027581407,
+ "learning_rate": 4.059781159703839e-06,
+ "loss": 0.5907,
+ "step": 1048
+ },
+ {
+ "epoch": 0.9198224365649148,
+ "grad_norm": 0.2817329916742434,
+ "learning_rate": 4.057931714258022e-06,
+ "loss": 0.5845,
+ "step": 1049
+ },
+ {
+ "epoch": 0.9206992930344714,
+ "grad_norm": 0.2624010665247189,
+ "learning_rate": 4.056080873819412e-06,
+ "loss": 0.579,
+ "step": 1050
+ },
+ {
+ "epoch": 0.9215761495040281,
+ "grad_norm": 0.26121937584936983,
+ "learning_rate": 4.054228640045275e-06,
+ "loss": 0.5857,
+ "step": 1051
+ },
+ {
+ "epoch": 0.9224530059735847,
+ "grad_norm": 0.2832895486337394,
+ "learning_rate": 4.052375014594129e-06,
+ "loss": 0.5957,
+ "step": 1052
+ },
+ {
+ "epoch": 0.9233298624431413,
+ "grad_norm": 0.27671228904328893,
+ "learning_rate": 4.0505199991257325e-06,
+ "loss": 0.5791,
+ "step": 1053
+ },
+ {
+ "epoch": 0.924206718912698,
+ "grad_norm": 0.266998502123574,
+ "learning_rate": 4.048663595301093e-06,
+ "loss": 0.5896,
+ "step": 1054
+ },
+ {
+ "epoch": 0.9250835753822546,
+ "grad_norm": 0.3094016546060802,
+ "learning_rate": 4.046805804782456e-06,
+ "loss": 0.5788,
+ "step": 1055
+ },
+ {
+ "epoch": 0.9259604318518112,
+ "grad_norm": 0.2782662002801493,
+ "learning_rate": 4.0449466292333166e-06,
+ "loss": 0.5888,
+ "step": 1056
+ },
+ {
+ "epoch": 0.9268372883213679,
+ "grad_norm": 0.27821869081922773,
+ "learning_rate": 4.043086070318401e-06,
+ "loss": 0.5879,
+ "step": 1057
+ },
+ {
+ "epoch": 0.9277141447909245,
+ "grad_norm": 0.32143887759720546,
+ "learning_rate": 4.04122412970368e-06,
+ "loss": 0.5884,
+ "step": 1058
+ },
+ {
+ "epoch": 0.9285910012604812,
+ "grad_norm": 0.2598221780539352,
+ "learning_rate": 4.039360809056361e-06,
+ "loss": 0.58,
+ "step": 1059
+ },
+ {
+ "epoch": 0.9294678577300378,
+ "grad_norm": 0.3300275262996093,
+ "learning_rate": 4.037496110044885e-06,
+ "loss": 0.5963,
+ "step": 1060
+ },
+ {
+ "epoch": 0.9303447141995944,
+ "grad_norm": 0.2723517740568475,
+ "learning_rate": 4.035630034338928e-06,
+ "loss": 0.5684,
+ "step": 1061
+ },
+ {
+ "epoch": 0.9312215706691511,
+ "grad_norm": 0.26174388908838997,
+ "learning_rate": 4.033762583609398e-06,
+ "loss": 0.5741,
+ "step": 1062
+ },
+ {
+ "epoch": 0.9320984271387077,
+ "grad_norm": 0.2879705808043353,
+ "learning_rate": 4.031893759528439e-06,
+ "loss": 0.5651,
+ "step": 1063
+ },
+ {
+ "epoch": 0.9329752836082644,
+ "grad_norm": 0.27573911638107307,
+ "learning_rate": 4.030023563769418e-06,
+ "loss": 0.5738,
+ "step": 1064
+ },
+ {
+ "epoch": 0.933852140077821,
+ "grad_norm": 0.270890009890323,
+ "learning_rate": 4.028151998006934e-06,
+ "loss": 0.5748,
+ "step": 1065
+ },
+ {
+ "epoch": 0.9347289965473776,
+ "grad_norm": 0.2651359065699047,
+ "learning_rate": 4.026279063916811e-06,
+ "loss": 0.5815,
+ "step": 1066
+ },
+ {
+ "epoch": 0.9356058530169343,
+ "grad_norm": 0.285792627094006,
+ "learning_rate": 4.024404763176101e-06,
+ "loss": 0.5714,
+ "step": 1067
+ },
+ {
+ "epoch": 0.9364827094864909,
+ "grad_norm": 0.25220096965602506,
+ "learning_rate": 4.022529097463076e-06,
+ "loss": 0.5761,
+ "step": 1068
+ },
+ {
+ "epoch": 0.9373595659560475,
+ "grad_norm": 0.2572736434059626,
+ "learning_rate": 4.020652068457234e-06,
+ "loss": 0.5813,
+ "step": 1069
+ },
+ {
+ "epoch": 0.9382364224256042,
+ "grad_norm": 0.2769717174034421,
+ "learning_rate": 4.018773677839289e-06,
+ "loss": 0.5902,
+ "step": 1070
+ },
+ {
+ "epoch": 0.9391132788951608,
+ "grad_norm": 0.2638965107730823,
+ "learning_rate": 4.016893927291179e-06,
+ "loss": 0.5774,
+ "step": 1071
+ },
+ {
+ "epoch": 0.9399901353647175,
+ "grad_norm": 0.26364544697361064,
+ "learning_rate": 4.015012818496057e-06,
+ "loss": 0.5885,
+ "step": 1072
+ },
+ {
+ "epoch": 0.9408669918342741,
+ "grad_norm": 0.2782490552191973,
+ "learning_rate": 4.013130353138293e-06,
+ "loss": 0.5734,
+ "step": 1073
+ },
+ {
+ "epoch": 0.9417438483038307,
+ "grad_norm": 0.2939309170345373,
+ "learning_rate": 4.011246532903472e-06,
+ "loss": 0.5863,
+ "step": 1074
+ },
+ {
+ "epoch": 0.9426207047733874,
+ "grad_norm": 0.27682818038097917,
+ "learning_rate": 4.00936135947839e-06,
+ "loss": 0.5878,
+ "step": 1075
+ },
+ {
+ "epoch": 0.943497561242944,
+ "grad_norm": 0.27100650217384786,
+ "learning_rate": 4.007474834551059e-06,
+ "loss": 0.5788,
+ "step": 1076
+ },
+ {
+ "epoch": 0.9443744177125006,
+ "grad_norm": 0.3179264915740243,
+ "learning_rate": 4.005586959810697e-06,
+ "loss": 0.5697,
+ "step": 1077
+ },
+ {
+ "epoch": 0.9452512741820573,
+ "grad_norm": 0.26927348365153236,
+ "learning_rate": 4.003697736947731e-06,
+ "loss": 0.5683,
+ "step": 1078
+ },
+ {
+ "epoch": 0.9461281306516139,
+ "grad_norm": 0.2755764124341007,
+ "learning_rate": 4.001807167653798e-06,
+ "loss": 0.5794,
+ "step": 1079
+ },
+ {
+ "epoch": 0.9470049871211706,
+ "grad_norm": 0.2908090312996085,
+ "learning_rate": 3.999915253621739e-06,
+ "loss": 0.586,
+ "step": 1080
+ },
+ {
+ "epoch": 0.9478818435907272,
+ "grad_norm": 0.2545666408606057,
+ "learning_rate": 3.998021996545599e-06,
+ "loss": 0.5831,
+ "step": 1081
+ },
+ {
+ "epoch": 0.9487587000602838,
+ "grad_norm": 0.29377943743323887,
+ "learning_rate": 3.9961273981206245e-06,
+ "loss": 0.585,
+ "step": 1082
+ },
+ {
+ "epoch": 0.9496355565298406,
+ "grad_norm": 0.26968750170325856,
+ "learning_rate": 3.994231460043265e-06,
+ "loss": 0.5782,
+ "step": 1083
+ },
+ {
+ "epoch": 0.9505124129993971,
+ "grad_norm": 0.2911018694543167,
+ "learning_rate": 3.9923341840111675e-06,
+ "loss": 0.5813,
+ "step": 1084
+ },
+ {
+ "epoch": 0.9513892694689537,
+ "grad_norm": 0.32080813736390973,
+ "learning_rate": 3.99043557172318e-06,
+ "loss": 0.5836,
+ "step": 1085
+ },
+ {
+ "epoch": 0.9522661259385105,
+ "grad_norm": 0.2894185491332872,
+ "learning_rate": 3.988535624879344e-06,
+ "loss": 0.583,
+ "step": 1086
+ },
+ {
+ "epoch": 0.953142982408067,
+ "grad_norm": 0.3036439907360394,
+ "learning_rate": 3.986634345180899e-06,
+ "loss": 0.5753,
+ "step": 1087
+ },
+ {
+ "epoch": 0.9540198388776238,
+ "grad_norm": 0.30256015219807453,
+ "learning_rate": 3.984731734330273e-06,
+ "loss": 0.5787,
+ "step": 1088
+ },
+ {
+ "epoch": 0.9548966953471804,
+ "grad_norm": 0.2684694121785645,
+ "learning_rate": 3.982827794031091e-06,
+ "loss": 0.5811,
+ "step": 1089
+ },
+ {
+ "epoch": 0.955773551816737,
+ "grad_norm": 0.3047268297869491,
+ "learning_rate": 3.980922525988167e-06,
+ "loss": 0.5757,
+ "step": 1090
+ },
+ {
+ "epoch": 0.9566504082862937,
+ "grad_norm": 0.2680829692432763,
+ "learning_rate": 3.979015931907502e-06,
+ "loss": 0.5938,
+ "step": 1091
+ },
+ {
+ "epoch": 0.9575272647558503,
+ "grad_norm": 0.28352806229638294,
+ "learning_rate": 3.977108013496286e-06,
+ "loss": 0.5648,
+ "step": 1092
+ },
+ {
+ "epoch": 0.958404121225407,
+ "grad_norm": 0.27134893274934896,
+ "learning_rate": 3.975198772462896e-06,
+ "loss": 0.5959,
+ "step": 1093
+ },
+ {
+ "epoch": 0.9592809776949636,
+ "grad_norm": 0.27670636726963027,
+ "learning_rate": 3.973288210516889e-06,
+ "loss": 0.5825,
+ "step": 1094
+ },
+ {
+ "epoch": 0.9601578341645202,
+ "grad_norm": 0.27577855913411087,
+ "learning_rate": 3.971376329369011e-06,
+ "loss": 0.5763,
+ "step": 1095
+ },
+ {
+ "epoch": 0.9610346906340769,
+ "grad_norm": 0.2613562238768912,
+ "learning_rate": 3.969463130731183e-06,
+ "loss": 0.587,
+ "step": 1096
+ },
+ {
+ "epoch": 0.9619115471036335,
+ "grad_norm": 0.30682832359084977,
+ "learning_rate": 3.96754861631651e-06,
+ "loss": 0.6012,
+ "step": 1097
+ },
+ {
+ "epoch": 0.9627884035731901,
+ "grad_norm": 0.2753727317824162,
+ "learning_rate": 3.965632787839274e-06,
+ "loss": 0.593,
+ "step": 1098
+ },
+ {
+ "epoch": 0.9636652600427468,
+ "grad_norm": 0.2896526629743159,
+ "learning_rate": 3.963715647014932e-06,
+ "loss": 0.5823,
+ "step": 1099
+ },
+ {
+ "epoch": 0.9645421165123034,
+ "grad_norm": 0.28810606366408137,
+ "learning_rate": 3.961797195560118e-06,
+ "loss": 0.5844,
+ "step": 1100
+ },
+ {
+ "epoch": 0.9654189729818601,
+ "grad_norm": 0.2603559754869869,
+ "learning_rate": 3.959877435192639e-06,
+ "loss": 0.5803,
+ "step": 1101
+ },
+ {
+ "epoch": 0.9662958294514167,
+ "grad_norm": 0.28655269690518276,
+ "learning_rate": 3.957956367631475e-06,
+ "loss": 0.5707,
+ "step": 1102
+ },
+ {
+ "epoch": 0.9671726859209733,
+ "grad_norm": 0.3009451530592475,
+ "learning_rate": 3.956033994596773e-06,
+ "loss": 0.5771,
+ "step": 1103
+ },
+ {
+ "epoch": 0.96804954239053,
+ "grad_norm": 0.2577540703327921,
+ "learning_rate": 3.954110317809854e-06,
+ "loss": 0.576,
+ "step": 1104
+ },
+ {
+ "epoch": 0.9689263988600866,
+ "grad_norm": 0.29870257898995317,
+ "learning_rate": 3.952185338993202e-06,
+ "loss": 0.5872,
+ "step": 1105
+ },
+ {
+ "epoch": 0.9698032553296432,
+ "grad_norm": 0.2768702174324288,
+ "learning_rate": 3.95025905987047e-06,
+ "loss": 0.5831,
+ "step": 1106
+ },
+ {
+ "epoch": 0.9706801117991999,
+ "grad_norm": 0.288774627238478,
+ "learning_rate": 3.948331482166473e-06,
+ "loss": 0.5951,
+ "step": 1107
+ },
+ {
+ "epoch": 0.9715569682687565,
+ "grad_norm": 0.324678524263679,
+ "learning_rate": 3.94640260760719e-06,
+ "loss": 0.5734,
+ "step": 1108
+ },
+ {
+ "epoch": 0.9724338247383132,
+ "grad_norm": 0.2777093036856744,
+ "learning_rate": 3.944472437919761e-06,
+ "loss": 0.5846,
+ "step": 1109
+ },
+ {
+ "epoch": 0.9733106812078698,
+ "grad_norm": 0.337073965677139,
+ "learning_rate": 3.942540974832486e-06,
+ "loss": 0.5904,
+ "step": 1110
+ },
+ {
+ "epoch": 0.9741875376774264,
+ "grad_norm": 0.2919504390486104,
+ "learning_rate": 3.9406082200748216e-06,
+ "loss": 0.5901,
+ "step": 1111
+ },
+ {
+ "epoch": 0.9750643941469831,
+ "grad_norm": 0.26917415244282195,
+ "learning_rate": 3.938674175377383e-06,
+ "loss": 0.5727,
+ "step": 1112
+ },
+ {
+ "epoch": 0.9759412506165397,
+ "grad_norm": 0.2968354712585106,
+ "learning_rate": 3.93673884247194e-06,
+ "loss": 0.5684,
+ "step": 1113
+ },
+ {
+ "epoch": 0.9768181070860964,
+ "grad_norm": 0.26666333819741744,
+ "learning_rate": 3.934802223091415e-06,
+ "loss": 0.582,
+ "step": 1114
+ },
+ {
+ "epoch": 0.977694963555653,
+ "grad_norm": 0.2648009228041306,
+ "learning_rate": 3.932864318969882e-06,
+ "loss": 0.5732,
+ "step": 1115
+ },
+ {
+ "epoch": 0.9785718200252096,
+ "grad_norm": 0.26447715765911384,
+ "learning_rate": 3.930925131842567e-06,
+ "loss": 0.581,
+ "step": 1116
+ },
+ {
+ "epoch": 0.9794486764947663,
+ "grad_norm": 0.26650421292261106,
+ "learning_rate": 3.928984663445844e-06,
+ "loss": 0.578,
+ "step": 1117
+ },
+ {
+ "epoch": 0.9803255329643229,
+ "grad_norm": 0.27399427740484344,
+ "learning_rate": 3.927042915517234e-06,
+ "loss": 0.5841,
+ "step": 1118
+ },
+ {
+ "epoch": 0.9812023894338795,
+ "grad_norm": 0.29486187077568676,
+ "learning_rate": 3.925099889795404e-06,
+ "loss": 0.5791,
+ "step": 1119
+ },
+ {
+ "epoch": 0.9820792459034362,
+ "grad_norm": 0.27626862187200796,
+ "learning_rate": 3.9231555880201655e-06,
+ "loss": 0.5758,
+ "step": 1120
+ },
+ {
+ "epoch": 0.9829561023729928,
+ "grad_norm": 0.2709394700881976,
+ "learning_rate": 3.9212100119324704e-06,
+ "loss": 0.5725,
+ "step": 1121
+ },
+ {
+ "epoch": 0.9838329588425495,
+ "grad_norm": 0.257787971984586,
+ "learning_rate": 3.919263163274416e-06,
+ "loss": 0.5733,
+ "step": 1122
+ },
+ {
+ "epoch": 0.9847098153121061,
+ "grad_norm": 0.2854496376494655,
+ "learning_rate": 3.917315043789235e-06,
+ "loss": 0.5696,
+ "step": 1123
+ },
+ {
+ "epoch": 0.9855866717816627,
+ "grad_norm": 0.2566199610678738,
+ "learning_rate": 3.9153656552212995e-06,
+ "loss": 0.5813,
+ "step": 1124
+ },
+ {
+ "epoch": 0.9864635282512194,
+ "grad_norm": 0.2555880030988225,
+ "learning_rate": 3.913414999316118e-06,
+ "loss": 0.5945,
+ "step": 1125
+ },
+ {
+ "epoch": 0.987340384720776,
+ "grad_norm": 0.2577195559469773,
+ "learning_rate": 3.911463077820336e-06,
+ "loss": 0.5675,
+ "step": 1126
+ },
+ {
+ "epoch": 0.9882172411903326,
+ "grad_norm": 0.26851748898394834,
+ "learning_rate": 3.909509892481726e-06,
+ "loss": 0.5807,
+ "step": 1127
+ },
+ {
+ "epoch": 0.9890940976598893,
+ "grad_norm": 0.2617539578196299,
+ "learning_rate": 3.907555445049198e-06,
+ "loss": 0.5684,
+ "step": 1128
+ },
+ {
+ "epoch": 0.9899709541294459,
+ "grad_norm": 0.2586839170532308,
+ "learning_rate": 3.905599737272791e-06,
+ "loss": 0.5801,
+ "step": 1129
+ },
+ {
+ "epoch": 0.9908478105990026,
+ "grad_norm": 0.25049955800874396,
+ "learning_rate": 3.903642770903671e-06,
+ "loss": 0.5762,
+ "step": 1130
+ },
+ {
+ "epoch": 0.9917246670685592,
+ "grad_norm": 0.27270516361418773,
+ "learning_rate": 3.901684547694133e-06,
+ "loss": 0.5878,
+ "step": 1131
+ },
+ {
+ "epoch": 0.9926015235381158,
+ "grad_norm": 0.2816673997379789,
+ "learning_rate": 3.899725069397593e-06,
+ "loss": 0.5927,
+ "step": 1132
+ },
+ {
+ "epoch": 0.9934783800076725,
+ "grad_norm": 0.2679288547921494,
+ "learning_rate": 3.897764337768597e-06,
+ "loss": 0.5772,
+ "step": 1133
+ },
+ {
+ "epoch": 0.9943552364772291,
+ "grad_norm": 0.27040765991438753,
+ "learning_rate": 3.895802354562808e-06,
+ "loss": 0.5623,
+ "step": 1134
+ },
+ {
+ "epoch": 0.9952320929467857,
+ "grad_norm": 0.29605913619532825,
+ "learning_rate": 3.893839121537015e-06,
+ "loss": 0.5868,
+ "step": 1135
+ },
+ {
+ "epoch": 0.9961089494163424,
+ "grad_norm": 0.27461413478738583,
+ "learning_rate": 3.89187464044912e-06,
+ "loss": 0.5871,
+ "step": 1136
+ },
+ {
+ "epoch": 0.996985805885899,
+ "grad_norm": 0.28648748056684925,
+ "learning_rate": 3.8899089130581465e-06,
+ "loss": 0.5753,
+ "step": 1137
+ },
+ {
+ "epoch": 0.9978626623554557,
+ "grad_norm": 0.2925165297373746,
+ "learning_rate": 3.8879419411242335e-06,
+ "loss": 0.5828,
+ "step": 1138
+ },
+ {
+ "epoch": 0.9987395188250123,
+ "grad_norm": 0.29352029461564516,
+ "learning_rate": 3.885973726408634e-06,
+ "loss": 0.5842,
+ "step": 1139
+ },
+ {
+ "epoch": 0.9996163752945689,
+ "grad_norm": 0.28650442615475913,
+ "learning_rate": 3.884004270673711e-06,
+ "loss": 0.5803,
+ "step": 1140
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 3420,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 3,
+ "save_steps": 1140,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 3818875539947520.0,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}