commit 068c39df12bcf780d5323544504cb69603cc6afe Author: ModelHub XC Date: Fri Jun 5 03:07:12 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: MaziyarPanahi/calme-2.1-qwen2-7b Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..80bfd2d --- /dev/null +++ b/.gitattributes @@ -0,0 +1,55 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs 
merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +model-00001-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +vocab.json filter=lfs diff=lfs merge=lfs -text +merges.txt filter=lfs diff=lfs merge=lfs -text +model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +model-00004-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..efdd607 --- /dev/null +++ b/README.md @@ -0,0 +1,186 @@ +--- +language: +- en +license: apache-2.0 +library_name: transformers +tags: +- chat +- qwen +- qwen2 +- finetune +- chatml +- OpenHermes-2.5 +- HelpSteer2 +- Orca +- SlimOrca +base_model: Qwen/Qwen2-7B +datasets: +- nvidia/HelpSteer2 +- teknium/OpenHermes-2.5 +- microsoft/orca-math-word-problems-200k +- Open-Orca/SlimOrca +model_name: calme-2.1-qwen2-7b +pipeline_tag: text-generation +inference: false +model_creator: MaziyarPanahi +quantized_by: MaziyarPanahi +model-index: +- name: calme-2.1-qwen2-7b + results: + - task: + type: text-generation + name: Text Generation + dataset: + name: IFEval (0-Shot) + type: HuggingFaceH4/ifeval + args: + num_few_shot: 0 + metrics: + - type: inst_level_strict_acc and prompt_level_strict_acc + value: 38.16 + name: strict accuracy + source: + url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=MaziyarPanahi/calme-2.1-qwen2-7b + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + 
dataset: + name: BBH (3-Shot) + type: BBH + args: + num_few_shot: 3 + metrics: + - type: acc_norm + value: 31.01 + name: normalized accuracy + source: + url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=MaziyarPanahi/calme-2.1-qwen2-7b + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: MATH Lvl 5 (4-Shot) + type: hendrycks/competition_math + args: + num_few_shot: 4 + metrics: + - type: exact_match + value: 21.07 + name: exact match + source: + url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=MaziyarPanahi/calme-2.1-qwen2-7b + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: GPQA (0-shot) + type: Idavidrein/gpqa + args: + num_few_shot: 0 + metrics: + - type: acc_norm + value: 5.26 + name: acc_norm + source: + url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=MaziyarPanahi/calme-2.1-qwen2-7b + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: MuSR (0-shot) + type: TAUR-Lab/MuSR + args: + num_few_shot: 0 + metrics: + - type: acc_norm + value: 13.8 + name: acc_norm + source: + url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=MaziyarPanahi/calme-2.1-qwen2-7b + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: MMLU-PRO (5-shot) + type: TIGER-Lab/MMLU-Pro + config: main + split: test + args: + num_few_shot: 5 + metrics: + - type: acc + value: 29.92 + name: accuracy + source: + url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=MaziyarPanahi/calme-2.1-qwen2-7b + name: Open LLM Leaderboard +--- + +Qwen2 fine-tune + +# MaziyarPanahi/calme-2.1-qwen2-7b + +This is a fine-tuned version of the `Qwen/Qwen2-7B` model. It aims to improve the base model across all benchmarks. 
+ +# ⚡ Quantized GGUF + +All GGUF models are available here: [MaziyarPanahi/calme-2.1-qwen2-7b](https://huggingface.co/MaziyarPanahi/calme-2.1-qwen2-7b) + +# 🏆 [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard) +Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_MaziyarPanahi__calme-2.1-qwen2-7b) + +| Metric |Value| +|-------------------|----:| +|Avg. |23.20| +|IFEval (0-Shot) |38.16| +|BBH (3-Shot) |31.01| +|MATH Lvl 5 (4-Shot)|21.07| +|GPQA (0-shot) | 5.26| +|MuSR (0-shot) |13.80| +|MMLU-PRO (5-shot) |29.92| + + + +# Prompt Template + +This model uses the `ChatML` prompt template: + +``` +<|im_start|>system +{System} +<|im_end|> +<|im_start|>user +{User} +<|im_end|> +<|im_start|>assistant +{Assistant} +``` + +# How to use + + +```python + +# Use a pipeline as a high-level helper + +from transformers import pipeline + +messages = [ + {"role": "user", "content": "Who are you?"}, +] +pipe = pipeline("text-generation", model="MaziyarPanahi/calme-2.1-qwen2-7b") +pipe(messages) + + +# Load model directly + +from transformers import AutoTokenizer, AutoModelForCausalLM + +tokenizer = AutoTokenizer.from_pretrained("MaziyarPanahi/calme-2.1-qwen2-7b") +model = AutoModelForCausalLM.from_pretrained("MaziyarPanahi/calme-2.1-qwen2-7b") +``` diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..409be9f --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,5 @@ +{ + "<|endoftext|>": 151643, + "<|im_end|>": 151645, + "<|im_start|>": 151644 +} diff --git a/config.json b/config.json new file mode 100644 index 0000000..91781b9 --- /dev/null +++ b/config.json @@ -0,0 +1,27 @@ +{ + "_name_or_path": "Qwen/Qwen2-7B", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 
131072, + "max_window_layers": 28, + "model_type": "qwen2", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.41.1", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 152064 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..f93b0a3 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,7 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": 151643, + "max_new_tokens": 2048, + "transformers_version": "4.41.1" +} diff --git a/merges.txt b/merges.txt new file mode 100644 index 0000000..80c1a19 --- /dev/null +++ b/merges.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5 +size 1671853 diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..643b3f6 --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fe7157df403e3af3e0328886829a32f76957b8b56c99caedb1921f2356c63c5 +size 4877660776 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..aba3c82 --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:146796ccef716e4bde6ae0a28aa6018afee4c506905e99a2131b356a13741114 +size 4932751008 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 
0000000..b27f4b0 --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ee4e44c5766fb4e3e19837f05c4ecb8dd65b0e917d14ca246436388cc5c01a2 +size 4330865200 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..d3298fe --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4c6d17fbc1de294bf0b2891b568870bdf8d6293db48d691b64be3e63427402c +size 1089994880 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..6ca5084 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 15231233024 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + 
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", 
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", 
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": 
"model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + 
"model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + 
"model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.bias": 
"model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.bias": 
"model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": 
"model-00002-of-00004.safetensors", + "model.norm.weight": "model-00003-of-00004.safetensors" + } +} diff --git a/qwen2-fine-tunes-maziyar-panahi.webp b/qwen2-fine-tunes-maziyar-panahi.webp new file mode 100644 index 0000000..edfe08f Binary files /dev/null and b/qwen2-fine-tunes-maziyar-panahi.webp differ diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..2c53454 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,20 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>" + ], + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..f9d83c4 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a12e3ba5d5e0ad173cf7b408ab8534c6be8cbc6a146714e9c7dc8cf2346603b1 +size 7028043 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..53e7151 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,43 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>" + ], + "bos_token": null, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in 
messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "model_max_length": 32768, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..002cb1d --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,2133 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.026314635323012148, + "eval_steps": 500, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8.771545107670716e-05, + "grad_norm": 28.291993022356824, + "learning_rate": 4.385964912280702e-08, + "loss": 0.9764, + "step": 1 + }, + { + "epoch": 0.00017543090215341433, + "grad_norm": 11.00431285069151, + "learning_rate": 8.771929824561404e-08, + "loss": 0.7373, + "step": 2 + }, + { + "epoch": 0.0002631463532301215, + "grad_norm": 19.575902791602918, + "learning_rate": 1.3157894736842107e-07, + "loss": 0.92, + "step": 3 + }, + { + "epoch": 0.00035086180430682866, + "grad_norm": 28.862884630243123, + "learning_rate": 1.7543859649122808e-07, + "loss": 0.9196, + "step": 4 + }, + { + "epoch": 0.0004385772553835358, + "grad_norm": 15.982248327528751, + "learning_rate": 2.192982456140351e-07, + "loss": 0.8366, + "step": 5 + }, + { + "epoch": 0.000526292706460243, + "grad_norm": 31.85723876161732, + "learning_rate": 2.6315789473684213e-07, + "loss": 0.9335, + "step": 6 + }, + { + "epoch": 0.0006140081575369502, + "grad_norm": 21.310207454796295, + "learning_rate": 3.070175438596491e-07, + "loss": 0.8362, + "step": 7 + }, + { + "epoch": 0.0007017236086136573, + "grad_norm": 20.052830776823505, + "learning_rate": 
3.5087719298245616e-07, + "loss": 0.8015, + "step": 8 + }, + { + "epoch": 0.0007894390596903645, + "grad_norm": 16.06788143210757, + "learning_rate": 3.9473684210526315e-07, + "loss": 0.8729, + "step": 9 + }, + { + "epoch": 0.0008771545107670716, + "grad_norm": 29.100726513914584, + "learning_rate": 4.385964912280702e-07, + "loss": 0.9058, + "step": 10 + }, + { + "epoch": 0.0009648699618437788, + "grad_norm": 13.993390572028792, + "learning_rate": 4.824561403508772e-07, + "loss": 0.7093, + "step": 11 + }, + { + "epoch": 0.001052585412920486, + "grad_norm": 21.107935511000072, + "learning_rate": 5.263157894736843e-07, + "loss": 0.8955, + "step": 12 + }, + { + "epoch": 0.0011403008639971931, + "grad_norm": 13.66193898339087, + "learning_rate": 5.701754385964912e-07, + "loss": 0.7219, + "step": 13 + }, + { + "epoch": 0.0012280163150739003, + "grad_norm": 10.537203866107753, + "learning_rate": 6.140350877192982e-07, + "loss": 0.8429, + "step": 14 + }, + { + "epoch": 0.0013157317661506075, + "grad_norm": 12.393106853157317, + "learning_rate": 6.578947368421053e-07, + "loss": 0.6708, + "step": 15 + }, + { + "epoch": 0.0014034472172273146, + "grad_norm": 8.734604355126535, + "learning_rate": 7.017543859649123e-07, + "loss": 0.6507, + "step": 16 + }, + { + "epoch": 0.0014911626683040218, + "grad_norm": 9.124362491394539, + "learning_rate": 7.456140350877194e-07, + "loss": 0.838, + "step": 17 + }, + { + "epoch": 0.001578878119380729, + "grad_norm": 8.958389642999963, + "learning_rate": 7.894736842105263e-07, + "loss": 0.6849, + "step": 18 + }, + { + "epoch": 0.0016665935704574361, + "grad_norm": 11.542677492312867, + "learning_rate": 8.333333333333333e-07, + "loss": 0.6926, + "step": 19 + }, + { + "epoch": 0.0017543090215341433, + "grad_norm": 8.045066225626593, + "learning_rate": 8.771929824561404e-07, + "loss": 0.7006, + "step": 20 + }, + { + "epoch": 0.0018420244726108505, + "grad_norm": 8.146906074379428, + "learning_rate": 9.210526315789474e-07, + "loss": 0.6737, + 
"step": 21 + }, + { + "epoch": 0.0019297399236875576, + "grad_norm": 6.502955757535831, + "learning_rate": 9.649122807017545e-07, + "loss": 0.7495, + "step": 22 + }, + { + "epoch": 0.002017455374764265, + "grad_norm": 8.736982858234592, + "learning_rate": 1.0087719298245615e-06, + "loss": 0.7324, + "step": 23 + }, + { + "epoch": 0.002105170825840972, + "grad_norm": 7.851959741269017, + "learning_rate": 1.0526315789473685e-06, + "loss": 0.6686, + "step": 24 + }, + { + "epoch": 0.002192886276917679, + "grad_norm": 8.594840793358543, + "learning_rate": 1.0964912280701756e-06, + "loss": 0.8064, + "step": 25 + }, + { + "epoch": 0.0022806017279943863, + "grad_norm": 8.935665287337994, + "learning_rate": 1.1403508771929824e-06, + "loss": 0.6751, + "step": 26 + }, + { + "epoch": 0.0023683171790710934, + "grad_norm": 11.146850280588064, + "learning_rate": 1.1842105263157894e-06, + "loss": 0.7884, + "step": 27 + }, + { + "epoch": 0.0024560326301478006, + "grad_norm": 6.917869007862471, + "learning_rate": 1.2280701754385965e-06, + "loss": 0.8772, + "step": 28 + }, + { + "epoch": 0.0025437480812245078, + "grad_norm": 9.32145567192897, + "learning_rate": 1.2719298245614037e-06, + "loss": 0.6486, + "step": 29 + }, + { + "epoch": 0.002631463532301215, + "grad_norm": 7.83399807213587, + "learning_rate": 1.3157894736842106e-06, + "loss": 0.7793, + "step": 30 + }, + { + "epoch": 0.002719178983377922, + "grad_norm": 5.701851482721999, + "learning_rate": 1.3596491228070178e-06, + "loss": 0.6418, + "step": 31 + }, + { + "epoch": 0.0028068944344546293, + "grad_norm": 6.357569510522249, + "learning_rate": 1.4035087719298246e-06, + "loss": 0.7803, + "step": 32 + }, + { + "epoch": 0.0028946098855313364, + "grad_norm": 6.1458878660724, + "learning_rate": 1.4473684210526317e-06, + "loss": 0.6075, + "step": 33 + }, + { + "epoch": 0.0029823253366080436, + "grad_norm": 5.258525934759675, + "learning_rate": 1.4912280701754387e-06, + "loss": 0.7558, + "step": 34 + }, + { + "epoch": 
0.0030700407876847508, + "grad_norm": 5.96497463401995, + "learning_rate": 1.5350877192982458e-06, + "loss": 0.5807, + "step": 35 + }, + { + "epoch": 0.003157756238761458, + "grad_norm": 9.97378904781871, + "learning_rate": 1.5789473684210526e-06, + "loss": 0.6766, + "step": 36 + }, + { + "epoch": 0.003245471689838165, + "grad_norm": 10.558130153122322, + "learning_rate": 1.6228070175438598e-06, + "loss": 0.6318, + "step": 37 + }, + { + "epoch": 0.0033331871409148723, + "grad_norm": 7.730592682668347, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.5723, + "step": 38 + }, + { + "epoch": 0.0034209025919915794, + "grad_norm": 6.513997535111305, + "learning_rate": 1.710526315789474e-06, + "loss": 0.7381, + "step": 39 + }, + { + "epoch": 0.0035086180430682866, + "grad_norm": 6.4186997859745185, + "learning_rate": 1.7543859649122807e-06, + "loss": 0.676, + "step": 40 + }, + { + "epoch": 0.0035963334941449938, + "grad_norm": 4.789756704738587, + "learning_rate": 1.798245614035088e-06, + "loss": 0.8106, + "step": 41 + }, + { + "epoch": 0.003684048945221701, + "grad_norm": 8.552415866186008, + "learning_rate": 1.8421052631578948e-06, + "loss": 0.7834, + "step": 42 + }, + { + "epoch": 0.003771764396298408, + "grad_norm": 5.104236885105078, + "learning_rate": 1.8859649122807019e-06, + "loss": 0.6694, + "step": 43 + }, + { + "epoch": 0.0038594798473751152, + "grad_norm": 6.998642641947579, + "learning_rate": 1.929824561403509e-06, + "loss": 0.7184, + "step": 44 + }, + { + "epoch": 0.003947195298451822, + "grad_norm": 6.754484565741454, + "learning_rate": 1.973684210526316e-06, + "loss": 0.7682, + "step": 45 + }, + { + "epoch": 0.00403491074952853, + "grad_norm": 5.702466747706841, + "learning_rate": 2.017543859649123e-06, + "loss": 0.7167, + "step": 46 + }, + { + "epoch": 0.004122626200605236, + "grad_norm": 7.038100758557257, + "learning_rate": 2.06140350877193e-06, + "loss": 0.6709, + "step": 47 + }, + { + "epoch": 0.004210341651681944, + "grad_norm": 
8.659378609826204, + "learning_rate": 2.105263157894737e-06, + "loss": 0.6508, + "step": 48 + }, + { + "epoch": 0.004298057102758651, + "grad_norm": 9.315174303463822, + "learning_rate": 2.149122807017544e-06, + "loss": 0.6168, + "step": 49 + }, + { + "epoch": 0.004385772553835358, + "grad_norm": 7.447716885721135, + "learning_rate": 2.192982456140351e-06, + "loss": 0.6738, + "step": 50 + }, + { + "epoch": 0.004473488004912065, + "grad_norm": 5.600770404460154, + "learning_rate": 2.236842105263158e-06, + "loss": 0.6311, + "step": 51 + }, + { + "epoch": 0.004561203455988773, + "grad_norm": 7.059691201242354, + "learning_rate": 2.280701754385965e-06, + "loss": 0.7204, + "step": 52 + }, + { + "epoch": 0.004648918907065479, + "grad_norm": 5.589092290239263, + "learning_rate": 2.324561403508772e-06, + "loss": 0.7266, + "step": 53 + }, + { + "epoch": 0.004736634358142187, + "grad_norm": 5.801762781587569, + "learning_rate": 2.368421052631579e-06, + "loss": 0.5336, + "step": 54 + }, + { + "epoch": 0.004824349809218894, + "grad_norm": 5.599754768073974, + "learning_rate": 2.412280701754386e-06, + "loss": 0.6338, + "step": 55 + }, + { + "epoch": 0.004912065260295601, + "grad_norm": 5.66437398031977, + "learning_rate": 2.456140350877193e-06, + "loss": 0.7813, + "step": 56 + }, + { + "epoch": 0.004999780711372308, + "grad_norm": 6.32022790188225, + "learning_rate": 2.5e-06, + "loss": 0.6613, + "step": 57 + }, + { + "epoch": 0.0050874961624490156, + "grad_norm": 8.01474270706056, + "learning_rate": 2.5438596491228075e-06, + "loss": 0.6451, + "step": 58 + }, + { + "epoch": 0.005175211613525722, + "grad_norm": 6.586182462850705, + "learning_rate": 2.5877192982456147e-06, + "loss": 0.6984, + "step": 59 + }, + { + "epoch": 0.00526292706460243, + "grad_norm": 5.61553252576188, + "learning_rate": 2.631578947368421e-06, + "loss": 0.5773, + "step": 60 + }, + { + "epoch": 0.005350642515679137, + "grad_norm": 5.5274818204706895, + "learning_rate": 2.6754385964912284e-06, + "loss": 
0.6083, + "step": 61 + }, + { + "epoch": 0.005438357966755844, + "grad_norm": 3.8762804528384254, + "learning_rate": 2.7192982456140356e-06, + "loss": 0.7174, + "step": 62 + }, + { + "epoch": 0.005526073417832551, + "grad_norm": 5.248404081335598, + "learning_rate": 2.7631578947368424e-06, + "loss": 0.7066, + "step": 63 + }, + { + "epoch": 0.0056137888689092585, + "grad_norm": 7.214109517049078, + "learning_rate": 2.8070175438596493e-06, + "loss": 0.692, + "step": 64 + }, + { + "epoch": 0.005701504319985965, + "grad_norm": 5.429278596290352, + "learning_rate": 2.8508771929824565e-06, + "loss": 0.6145, + "step": 65 + }, + { + "epoch": 0.005789219771062673, + "grad_norm": 17.638205100824422, + "learning_rate": 2.8947368421052634e-06, + "loss": 0.7677, + "step": 66 + }, + { + "epoch": 0.00587693522213938, + "grad_norm": 5.677374136021176, + "learning_rate": 2.9385964912280706e-06, + "loss": 0.6779, + "step": 67 + }, + { + "epoch": 0.005964650673216087, + "grad_norm": 5.453107411280262, + "learning_rate": 2.9824561403508774e-06, + "loss": 0.6428, + "step": 68 + }, + { + "epoch": 0.006052366124292794, + "grad_norm": 5.888626008478417, + "learning_rate": 3.0263157894736843e-06, + "loss": 0.6342, + "step": 69 + }, + { + "epoch": 0.0061400815753695015, + "grad_norm": 5.3185045733144225, + "learning_rate": 3.0701754385964915e-06, + "loss": 0.5644, + "step": 70 + }, + { + "epoch": 0.006227797026446208, + "grad_norm": 4.902919731780363, + "learning_rate": 3.1140350877192988e-06, + "loss": 0.709, + "step": 71 + }, + { + "epoch": 0.006315512477522916, + "grad_norm": 8.773622618503456, + "learning_rate": 3.157894736842105e-06, + "loss": 0.6674, + "step": 72 + }, + { + "epoch": 0.006403227928599623, + "grad_norm": 6.7570883776978174, + "learning_rate": 3.2017543859649124e-06, + "loss": 0.6918, + "step": 73 + }, + { + "epoch": 0.00649094337967633, + "grad_norm": 5.597179964370573, + "learning_rate": 3.2456140350877197e-06, + "loss": 0.7119, + "step": 74 + }, + { + "epoch": 
0.006578658830753037, + "grad_norm": 5.4824260737552795, + "learning_rate": 3.289473684210527e-06, + "loss": 0.5667, + "step": 75 + }, + { + "epoch": 0.0066663742818297445, + "grad_norm": 6.083422094529157, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.5972, + "step": 76 + }, + { + "epoch": 0.006754089732906451, + "grad_norm": 6.688559230122185, + "learning_rate": 3.3771929824561406e-06, + "loss": 0.6079, + "step": 77 + }, + { + "epoch": 0.006841805183983159, + "grad_norm": 4.675152512564395, + "learning_rate": 3.421052631578948e-06, + "loss": 0.6431, + "step": 78 + }, + { + "epoch": 0.006929520635059866, + "grad_norm": 6.61824094926871, + "learning_rate": 3.464912280701755e-06, + "loss": 0.7219, + "step": 79 + }, + { + "epoch": 0.007017236086136573, + "grad_norm": 4.3090639659166685, + "learning_rate": 3.5087719298245615e-06, + "loss": 0.6267, + "step": 80 + }, + { + "epoch": 0.00710495153721328, + "grad_norm": 5.908526205124108, + "learning_rate": 3.5526315789473687e-06, + "loss": 0.5598, + "step": 81 + }, + { + "epoch": 0.0071926669882899875, + "grad_norm": 4.954945711406169, + "learning_rate": 3.596491228070176e-06, + "loss": 0.6251, + "step": 82 + }, + { + "epoch": 0.007280382439366694, + "grad_norm": 6.403352381905709, + "learning_rate": 3.640350877192983e-06, + "loss": 0.6921, + "step": 83 + }, + { + "epoch": 0.007368097890443402, + "grad_norm": 5.8960340556018505, + "learning_rate": 3.6842105263157896e-06, + "loss": 0.5803, + "step": 84 + }, + { + "epoch": 0.007455813341520109, + "grad_norm": 5.5832723717085795, + "learning_rate": 3.728070175438597e-06, + "loss": 0.7109, + "step": 85 + }, + { + "epoch": 0.007543528792596816, + "grad_norm": 6.9538610646678425, + "learning_rate": 3.7719298245614037e-06, + "loss": 0.57, + "step": 86 + }, + { + "epoch": 0.007631244243673523, + "grad_norm": 4.9040721673618615, + "learning_rate": 3.815789473684211e-06, + "loss": 0.6681, + "step": 87 + }, + { + "epoch": 0.0077189596947502305, + "grad_norm": 
4.367227562952691, + "learning_rate": 3.859649122807018e-06, + "loss": 0.5881, + "step": 88 + }, + { + "epoch": 0.007806675145826937, + "grad_norm": 6.135869823936115, + "learning_rate": 3.903508771929825e-06, + "loss": 0.6333, + "step": 89 + }, + { + "epoch": 0.007894390596903644, + "grad_norm": 5.26232269598073, + "learning_rate": 3.947368421052632e-06, + "loss": 0.6228, + "step": 90 + }, + { + "epoch": 0.007982106047980352, + "grad_norm": 5.478510766614749, + "learning_rate": 3.991228070175439e-06, + "loss": 0.6889, + "step": 91 + }, + { + "epoch": 0.00806982149905706, + "grad_norm": 7.252221492478827, + "learning_rate": 4.035087719298246e-06, + "loss": 0.6726, + "step": 92 + }, + { + "epoch": 0.008157536950133767, + "grad_norm": 6.810323867433885, + "learning_rate": 4.078947368421053e-06, + "loss": 0.6186, + "step": 93 + }, + { + "epoch": 0.008245252401210473, + "grad_norm": 5.1477310672971965, + "learning_rate": 4.12280701754386e-06, + "loss": 0.6739, + "step": 94 + }, + { + "epoch": 0.00833296785228718, + "grad_norm": 4.455009313283226, + "learning_rate": 4.166666666666667e-06, + "loss": 0.6676, + "step": 95 + }, + { + "epoch": 0.008420683303363888, + "grad_norm": 4.854476484535793, + "learning_rate": 4.210526315789474e-06, + "loss": 0.624, + "step": 96 + }, + { + "epoch": 0.008508398754440595, + "grad_norm": 8.775528791539337, + "learning_rate": 4.254385964912281e-06, + "loss": 0.7236, + "step": 97 + }, + { + "epoch": 0.008596114205517301, + "grad_norm": 4.656928105654083, + "learning_rate": 4.298245614035088e-06, + "loss": 0.4853, + "step": 98 + }, + { + "epoch": 0.008683829656594009, + "grad_norm": 6.1151229878888795, + "learning_rate": 4.342105263157895e-06, + "loss": 0.6611, + "step": 99 + }, + { + "epoch": 0.008771545107670716, + "grad_norm": 4.846266795088099, + "learning_rate": 4.385964912280702e-06, + "loss": 0.6899, + "step": 100 + }, + { + "epoch": 0.008859260558747424, + "grad_norm": 5.63076019856985, + "learning_rate": 4.429824561403509e-06, + 
"loss": 0.7394, + "step": 101 + }, + { + "epoch": 0.00894697600982413, + "grad_norm": 6.152211661702361, + "learning_rate": 4.473684210526316e-06, + "loss": 0.6366, + "step": 102 + }, + { + "epoch": 0.009034691460900838, + "grad_norm": 5.271237730819475, + "learning_rate": 4.517543859649123e-06, + "loss": 0.6776, + "step": 103 + }, + { + "epoch": 0.009122406911977545, + "grad_norm": 6.150704296921181, + "learning_rate": 4.56140350877193e-06, + "loss": 0.7287, + "step": 104 + }, + { + "epoch": 0.009210122363054253, + "grad_norm": 5.511353295743786, + "learning_rate": 4.605263157894737e-06, + "loss": 0.7156, + "step": 105 + }, + { + "epoch": 0.009297837814130959, + "grad_norm": 5.651321362023493, + "learning_rate": 4.649122807017544e-06, + "loss": 0.5971, + "step": 106 + }, + { + "epoch": 0.009385553265207666, + "grad_norm": 4.521052312786367, + "learning_rate": 4.692982456140351e-06, + "loss": 0.662, + "step": 107 + }, + { + "epoch": 0.009473268716284374, + "grad_norm": 6.5893774516601775, + "learning_rate": 4.736842105263158e-06, + "loss": 0.6838, + "step": 108 + }, + { + "epoch": 0.009560984167361081, + "grad_norm": 7.413604525506308, + "learning_rate": 4.780701754385965e-06, + "loss": 0.6798, + "step": 109 + }, + { + "epoch": 0.009648699618437787, + "grad_norm": 5.258683042524991, + "learning_rate": 4.824561403508772e-06, + "loss": 0.7137, + "step": 110 + }, + { + "epoch": 0.009736415069514495, + "grad_norm": 3.56629655229689, + "learning_rate": 4.8684210526315795e-06, + "loss": 0.5524, + "step": 111 + }, + { + "epoch": 0.009824130520591202, + "grad_norm": 7.972594797604, + "learning_rate": 4.912280701754386e-06, + "loss": 0.7946, + "step": 112 + }, + { + "epoch": 0.00991184597166791, + "grad_norm": 5.9169587346561965, + "learning_rate": 4.956140350877193e-06, + "loss": 0.6985, + "step": 113 + }, + { + "epoch": 0.009999561422744616, + "grad_norm": 4.9028768240583895, + "learning_rate": 5e-06, + "loss": 0.7471, + "step": 114 + }, + { + "epoch": 
0.010087276873821324, + "grad_norm": 4.952040118758915, + "learning_rate": 4.999999903143301e-06, + "loss": 0.6645, + "step": 115 + }, + { + "epoch": 0.010174992324898031, + "grad_norm": 5.307375041926707, + "learning_rate": 4.999999612573212e-06, + "loss": 0.6568, + "step": 116 + }, + { + "epoch": 0.010262707775974739, + "grad_norm": 4.417210142946582, + "learning_rate": 4.9999991282897545e-06, + "loss": 0.6633, + "step": 117 + }, + { + "epoch": 0.010350423227051445, + "grad_norm": 6.813103500844099, + "learning_rate": 4.999998450292966e-06, + "loss": 0.7479, + "step": 118 + }, + { + "epoch": 0.010438138678128152, + "grad_norm": 5.220452049535287, + "learning_rate": 4.9999975785829e-06, + "loss": 0.5982, + "step": 119 + }, + { + "epoch": 0.01052585412920486, + "grad_norm": 6.470241976711781, + "learning_rate": 4.999996513159624e-06, + "loss": 0.5915, + "step": 120 + }, + { + "epoch": 0.010613569580281567, + "grad_norm": 5.236784827517624, + "learning_rate": 4.99999525402322e-06, + "loss": 0.665, + "step": 121 + }, + { + "epoch": 0.010701285031358273, + "grad_norm": 5.5322906674158565, + "learning_rate": 4.999993801173785e-06, + "loss": 0.473, + "step": 122 + }, + { + "epoch": 0.01078900048243498, + "grad_norm": 5.643434680672429, + "learning_rate": 4.999992154611433e-06, + "loss": 0.5802, + "step": 123 + }, + { + "epoch": 0.010876715933511688, + "grad_norm": 4.909123022379139, + "learning_rate": 4.9999903143362905e-06, + "loss": 0.6103, + "step": 124 + }, + { + "epoch": 0.010964431384588396, + "grad_norm": 7.046173121098522, + "learning_rate": 4.999988280348501e-06, + "loss": 0.6601, + "step": 125 + }, + { + "epoch": 0.011052146835665102, + "grad_norm": 5.567754476589664, + "learning_rate": 4.99998605264822e-06, + "loss": 0.7144, + "step": 126 + }, + { + "epoch": 0.01113986228674181, + "grad_norm": 6.670512866037107, + "learning_rate": 4.999983631235623e-06, + "loss": 0.5034, + "step": 127 + }, + { + "epoch": 0.011227577737818517, + "grad_norm": 5.068760146843144, 
+ "learning_rate": 4.999981016110896e-06, + "loss": 0.5965, + "step": 128 + }, + { + "epoch": 0.011315293188895225, + "grad_norm": 5.493410339028754, + "learning_rate": 4.999978207274243e-06, + "loss": 0.6697, + "step": 129 + }, + { + "epoch": 0.01140300863997193, + "grad_norm": 5.662089015796081, + "learning_rate": 4.999975204725879e-06, + "loss": 0.7182, + "step": 130 + }, + { + "epoch": 0.011490724091048638, + "grad_norm": 3.734356064938746, + "learning_rate": 4.999972008466039e-06, + "loss": 0.632, + "step": 131 + }, + { + "epoch": 0.011578439542125346, + "grad_norm": 4.29907687663725, + "learning_rate": 4.99996861849497e-06, + "loss": 0.6321, + "step": 132 + }, + { + "epoch": 0.011666154993202053, + "grad_norm": 5.292963722155827, + "learning_rate": 4.999965034812934e-06, + "loss": 0.5768, + "step": 133 + }, + { + "epoch": 0.01175387044427876, + "grad_norm": 4.564589196086129, + "learning_rate": 4.99996125742021e-06, + "loss": 0.5991, + "step": 134 + }, + { + "epoch": 0.011841585895355467, + "grad_norm": 5.889974426321806, + "learning_rate": 4.99995728631709e-06, + "loss": 0.568, + "step": 135 + }, + { + "epoch": 0.011929301346432174, + "grad_norm": 4.903556688362067, + "learning_rate": 4.999953121503881e-06, + "loss": 0.6221, + "step": 136 + }, + { + "epoch": 0.012017016797508882, + "grad_norm": 4.652137494582458, + "learning_rate": 4.999948762980906e-06, + "loss": 0.6499, + "step": 137 + }, + { + "epoch": 0.012104732248585588, + "grad_norm": 7.2681565015460965, + "learning_rate": 4.999944210748504e-06, + "loss": 0.7997, + "step": 138 + }, + { + "epoch": 0.012192447699662295, + "grad_norm": 4.498966830496647, + "learning_rate": 4.999939464807027e-06, + "loss": 0.7033, + "step": 139 + }, + { + "epoch": 0.012280163150739003, + "grad_norm": 5.658829625864849, + "learning_rate": 4.999934525156842e-06, + "loss": 0.6234, + "step": 140 + }, + { + "epoch": 0.01236787860181571, + "grad_norm": 6.170987539440289, + "learning_rate": 4.9999293917983325e-06, + "loss": 
0.7359, + "step": 141 + }, + { + "epoch": 0.012455594052892417, + "grad_norm": 4.889450035742974, + "learning_rate": 4.999924064731896e-06, + "loss": 0.6418, + "step": 142 + }, + { + "epoch": 0.012543309503969124, + "grad_norm": 5.565665252735285, + "learning_rate": 4.9999185439579445e-06, + "loss": 0.8114, + "step": 143 + }, + { + "epoch": 0.012631024955045832, + "grad_norm": 5.009655972578068, + "learning_rate": 4.9999128294769075e-06, + "loss": 0.7307, + "step": 144 + }, + { + "epoch": 0.01271874040612254, + "grad_norm": 5.011444448419762, + "learning_rate": 4.999906921289227e-06, + "loss": 0.6434, + "step": 145 + }, + { + "epoch": 0.012806455857199245, + "grad_norm": 5.91290249112379, + "learning_rate": 4.999900819395361e-06, + "loss": 0.7576, + "step": 146 + }, + { + "epoch": 0.012894171308275953, + "grad_norm": 5.291827066915767, + "learning_rate": 4.9998945237957814e-06, + "loss": 0.717, + "step": 147 + }, + { + "epoch": 0.01298188675935266, + "grad_norm": 6.889695918810895, + "learning_rate": 4.9998880344909765e-06, + "loss": 0.6566, + "step": 148 + }, + { + "epoch": 0.013069602210429368, + "grad_norm": 4.139725258131711, + "learning_rate": 4.999881351481449e-06, + "loss": 0.6139, + "step": 149 + }, + { + "epoch": 0.013157317661506074, + "grad_norm": 5.041147601092224, + "learning_rate": 4.999874474767718e-06, + "loss": 0.7046, + "step": 150 + }, + { + "epoch": 0.013245033112582781, + "grad_norm": 4.850191233243735, + "learning_rate": 4.999867404350315e-06, + "loss": 0.6494, + "step": 151 + }, + { + "epoch": 0.013332748563659489, + "grad_norm": 5.608814210289025, + "learning_rate": 4.999860140229788e-06, + "loss": 0.8654, + "step": 152 + }, + { + "epoch": 0.013420464014736197, + "grad_norm": 4.097824317856954, + "learning_rate": 4.9998526824067e-06, + "loss": 0.6889, + "step": 153 + }, + { + "epoch": 0.013508179465812903, + "grad_norm": 6.425927321695068, + "learning_rate": 4.999845030881629e-06, + "loss": 0.5837, + "step": 154 + }, + { + "epoch": 
0.01359589491688961, + "grad_norm": 7.686652681051417, + "learning_rate": 4.999837185655168e-06, + "loss": 0.6869, + "step": 155 + }, + { + "epoch": 0.013683610367966318, + "grad_norm": 6.199666417167642, + "learning_rate": 4.9998291467279245e-06, + "loss": 0.7371, + "step": 156 + }, + { + "epoch": 0.013771325819043024, + "grad_norm": 6.797879751043678, + "learning_rate": 4.999820914100522e-06, + "loss": 0.6912, + "step": 157 + }, + { + "epoch": 0.013859041270119731, + "grad_norm": 9.837640179642968, + "learning_rate": 4.999812487773597e-06, + "loss": 0.8045, + "step": 158 + }, + { + "epoch": 0.013946756721196439, + "grad_norm": 6.620454193744729, + "learning_rate": 4.9998038677478044e-06, + "loss": 0.6018, + "step": 159 + }, + { + "epoch": 0.014034472172273146, + "grad_norm": 4.952380418390811, + "learning_rate": 4.99979505402381e-06, + "loss": 0.5851, + "step": 160 + }, + { + "epoch": 0.014122187623349852, + "grad_norm": 4.571346505498035, + "learning_rate": 4.999786046602299e-06, + "loss": 0.6633, + "step": 161 + }, + { + "epoch": 0.01420990307442656, + "grad_norm": 6.745466717777739, + "learning_rate": 4.999776845483968e-06, + "loss": 0.714, + "step": 162 + }, + { + "epoch": 0.014297618525503267, + "grad_norm": 4.888639355192875, + "learning_rate": 4.999767450669531e-06, + "loss": 0.5328, + "step": 163 + }, + { + "epoch": 0.014385333976579975, + "grad_norm": 5.263414218540685, + "learning_rate": 4.999757862159713e-06, + "loss": 0.6746, + "step": 164 + }, + { + "epoch": 0.014473049427656681, + "grad_norm": 5.8723140369149895, + "learning_rate": 4.99974807995526e-06, + "loss": 0.7101, + "step": 165 + }, + { + "epoch": 0.014560764878733388, + "grad_norm": 4.125348885535371, + "learning_rate": 4.999738104056931e-06, + "loss": 0.6418, + "step": 166 + }, + { + "epoch": 0.014648480329810096, + "grad_norm": 5.079939786355144, + "learning_rate": 4.999727934465495e-06, + "loss": 0.6757, + "step": 167 + }, + { + "epoch": 0.014736195780886804, + "grad_norm": 
4.436648943550616, + "learning_rate": 4.999717571181742e-06, + "loss": 0.6878, + "step": 168 + }, + { + "epoch": 0.01482391123196351, + "grad_norm": 4.6070293178483706, + "learning_rate": 4.999707014206475e-06, + "loss": 0.6882, + "step": 169 + }, + { + "epoch": 0.014911626683040217, + "grad_norm": 4.337658765605819, + "learning_rate": 4.999696263540513e-06, + "loss": 0.6418, + "step": 170 + }, + { + "epoch": 0.014999342134116925, + "grad_norm": 5.834498841218243, + "learning_rate": 4.999685319184688e-06, + "loss": 0.6367, + "step": 171 + }, + { + "epoch": 0.015087057585193632, + "grad_norm": 6.027148776110112, + "learning_rate": 4.999674181139848e-06, + "loss": 0.7505, + "step": 172 + }, + { + "epoch": 0.015174773036270338, + "grad_norm": 4.712652033599274, + "learning_rate": 4.999662849406855e-06, + "loss": 0.7515, + "step": 173 + }, + { + "epoch": 0.015262488487347046, + "grad_norm": 5.325275991673836, + "learning_rate": 4.99965132398659e-06, + "loss": 0.7871, + "step": 174 + }, + { + "epoch": 0.015350203938423753, + "grad_norm": 5.006048437293231, + "learning_rate": 4.999639604879943e-06, + "loss": 0.6038, + "step": 175 + }, + { + "epoch": 0.015437919389500461, + "grad_norm": 4.692976251794895, + "learning_rate": 4.999627692087824e-06, + "loss": 0.7106, + "step": 176 + }, + { + "epoch": 0.015525634840577167, + "grad_norm": 6.484912012474024, + "learning_rate": 4.999615585611156e-06, + "loss": 0.6456, + "step": 177 + }, + { + "epoch": 0.015613350291653874, + "grad_norm": 7.072312221146792, + "learning_rate": 4.999603285450875e-06, + "loss": 0.6986, + "step": 178 + }, + { + "epoch": 0.015701065742730582, + "grad_norm": 5.072158684292459, + "learning_rate": 4.999590791607936e-06, + "loss": 0.6386, + "step": 179 + }, + { + "epoch": 0.015788781193807288, + "grad_norm": 5.674801641765509, + "learning_rate": 4.999578104083307e-06, + "loss": 0.6512, + "step": 180 + }, + { + "epoch": 0.015876496644883997, + "grad_norm": 6.011232915930249, + "learning_rate": 
4.9995652228779715e-06, + "loss": 0.6166, + "step": 181 + }, + { + "epoch": 0.015964212095960703, + "grad_norm": 7.067996556252431, + "learning_rate": 4.999552147992926e-06, + "loss": 0.8316, + "step": 182 + }, + { + "epoch": 0.01605192754703741, + "grad_norm": 6.191586224655665, + "learning_rate": 4.999538879429183e-06, + "loss": 0.7167, + "step": 183 + }, + { + "epoch": 0.01613964299811412, + "grad_norm": 5.40861794404673, + "learning_rate": 4.999525417187774e-06, + "loss": 0.6604, + "step": 184 + }, + { + "epoch": 0.016227358449190824, + "grad_norm": 5.619694849325643, + "learning_rate": 4.999511761269739e-06, + "loss": 0.7141, + "step": 185 + }, + { + "epoch": 0.016315073900267534, + "grad_norm": 7.467663008400906, + "learning_rate": 4.999497911676138e-06, + "loss": 0.6086, + "step": 186 + }, + { + "epoch": 0.01640278935134424, + "grad_norm": 4.645589903763359, + "learning_rate": 4.999483868408043e-06, + "loss": 0.6932, + "step": 187 + }, + { + "epoch": 0.016490504802420945, + "grad_norm": 4.819294533224638, + "learning_rate": 4.999469631466544e-06, + "loss": 0.6256, + "step": 188 + }, + { + "epoch": 0.016578220253497655, + "grad_norm": 4.711171445741636, + "learning_rate": 4.999455200852741e-06, + "loss": 0.7445, + "step": 189 + }, + { + "epoch": 0.01666593570457436, + "grad_norm": 4.371758877075776, + "learning_rate": 4.999440576567755e-06, + "loss": 0.6801, + "step": 190 + }, + { + "epoch": 0.016753651155651066, + "grad_norm": 5.761171404408883, + "learning_rate": 4.999425758612718e-06, + "loss": 0.6701, + "step": 191 + }, + { + "epoch": 0.016841366606727776, + "grad_norm": 4.340375314807721, + "learning_rate": 4.999410746988778e-06, + "loss": 0.5556, + "step": 192 + }, + { + "epoch": 0.01692908205780448, + "grad_norm": 4.775058922031801, + "learning_rate": 4.9993955416970986e-06, + "loss": 0.6915, + "step": 193 + }, + { + "epoch": 0.01701679750888119, + "grad_norm": 4.301940379009061, + "learning_rate": 4.999380142738857e-06, + "loss": 0.6982, + "step": 194 
+ }, + { + "epoch": 0.017104512959957897, + "grad_norm": 4.746670538298819, + "learning_rate": 4.9993645501152485e-06, + "loss": 0.5392, + "step": 195 + }, + { + "epoch": 0.017192228411034603, + "grad_norm": 5.312812102176541, + "learning_rate": 4.999348763827479e-06, + "loss": 0.6254, + "step": 196 + }, + { + "epoch": 0.017279943862111312, + "grad_norm": 6.073252701324542, + "learning_rate": 4.999332783876774e-06, + "loss": 0.7221, + "step": 197 + }, + { + "epoch": 0.017367659313188018, + "grad_norm": 6.783014797465277, + "learning_rate": 4.999316610264369e-06, + "loss": 0.5914, + "step": 198 + }, + { + "epoch": 0.017455374764264724, + "grad_norm": 5.105373260000072, + "learning_rate": 4.999300242991519e-06, + "loss": 0.4895, + "step": 199 + }, + { + "epoch": 0.017543090215341433, + "grad_norm": 5.3256898167081825, + "learning_rate": 4.999283682059493e-06, + "loss": 0.714, + "step": 200 + }, + { + "epoch": 0.01763080566641814, + "grad_norm": 7.815945435660424, + "learning_rate": 4.999266927469572e-06, + "loss": 0.7691, + "step": 201 + }, + { + "epoch": 0.017718521117494848, + "grad_norm": 4.350216346007481, + "learning_rate": 4.999249979223056e-06, + "loss": 0.7205, + "step": 202 + }, + { + "epoch": 0.017806236568571554, + "grad_norm": 4.167534183562087, + "learning_rate": 4.999232837321257e-06, + "loss": 0.6716, + "step": 203 + }, + { + "epoch": 0.01789395201964826, + "grad_norm": 6.564156035042191, + "learning_rate": 4.999215501765504e-06, + "loss": 0.6139, + "step": 204 + }, + { + "epoch": 0.01798166747072497, + "grad_norm": 4.58988335300785, + "learning_rate": 4.9991979725571395e-06, + "loss": 0.6241, + "step": 205 + }, + { + "epoch": 0.018069382921801675, + "grad_norm": 7.14774553510386, + "learning_rate": 4.999180249697524e-06, + "loss": 0.7338, + "step": 206 + }, + { + "epoch": 0.01815709837287838, + "grad_norm": 4.3154768710391656, + "learning_rate": 4.999162333188028e-06, + "loss": 0.646, + "step": 207 + }, + { + "epoch": 0.01824481382395509, + 
"grad_norm": 3.930924147546703, + "learning_rate": 4.999144223030041e-06, + "loss": 0.7162, + "step": 208 + }, + { + "epoch": 0.018332529275031796, + "grad_norm": 3.75066761929553, + "learning_rate": 4.999125919224966e-06, + "loss": 0.6283, + "step": 209 + }, + { + "epoch": 0.018420244726108505, + "grad_norm": 4.916459254987505, + "learning_rate": 4.999107421774222e-06, + "loss": 0.6716, + "step": 210 + }, + { + "epoch": 0.01850796017718521, + "grad_norm": 4.570226928027306, + "learning_rate": 4.999088730679241e-06, + "loss": 0.6527, + "step": 211 + }, + { + "epoch": 0.018595675628261917, + "grad_norm": 3.6658012035372605, + "learning_rate": 4.999069845941472e-06, + "loss": 0.5452, + "step": 212 + }, + { + "epoch": 0.018683391079338627, + "grad_norm": 4.697816375671605, + "learning_rate": 4.999050767562379e-06, + "loss": 0.7316, + "step": 213 + }, + { + "epoch": 0.018771106530415332, + "grad_norm": 5.639876519194002, + "learning_rate": 4.99903149554344e-06, + "loss": 0.5152, + "step": 214 + }, + { + "epoch": 0.018858821981492038, + "grad_norm": 5.527702869650481, + "learning_rate": 4.999012029886147e-06, + "loss": 0.6119, + "step": 215 + }, + { + "epoch": 0.018946537432568748, + "grad_norm": 6.019639388484205, + "learning_rate": 4.998992370592008e-06, + "loss": 0.7366, + "step": 216 + }, + { + "epoch": 0.019034252883645453, + "grad_norm": 4.014799337285965, + "learning_rate": 4.998972517662549e-06, + "loss": 0.7088, + "step": 217 + }, + { + "epoch": 0.019121968334722163, + "grad_norm": 7.876499612097003, + "learning_rate": 4.998952471099307e-06, + "loss": 0.5565, + "step": 218 + }, + { + "epoch": 0.01920968378579887, + "grad_norm": 7.386792956892447, + "learning_rate": 4.998932230903835e-06, + "loss": 0.6387, + "step": 219 + }, + { + "epoch": 0.019297399236875575, + "grad_norm": 5.346097163630257, + "learning_rate": 4.998911797077701e-06, + "loss": 0.6237, + "step": 220 + }, + { + "epoch": 0.019385114687952284, + "grad_norm": 6.133310652425816, + "learning_rate": 
4.998891169622488e-06, + "loss": 0.7428, + "step": 221 + }, + { + "epoch": 0.01947283013902899, + "grad_norm": 4.224801633855712, + "learning_rate": 4.998870348539797e-06, + "loss": 0.7206, + "step": 222 + }, + { + "epoch": 0.019560545590105696, + "grad_norm": 5.648869005800134, + "learning_rate": 4.998849333831238e-06, + "loss": 0.6249, + "step": 223 + }, + { + "epoch": 0.019648261041182405, + "grad_norm": 4.634920959306503, + "learning_rate": 4.998828125498441e-06, + "loss": 0.6764, + "step": 224 + }, + { + "epoch": 0.01973597649225911, + "grad_norm": 4.882651557085375, + "learning_rate": 4.998806723543049e-06, + "loss": 0.6682, + "step": 225 + }, + { + "epoch": 0.01982369194333582, + "grad_norm": 4.5073631852916645, + "learning_rate": 4.998785127966721e-06, + "loss": 0.7658, + "step": 226 + }, + { + "epoch": 0.019911407394412526, + "grad_norm": 6.444404326993186, + "learning_rate": 4.99876333877113e-06, + "loss": 0.7161, + "step": 227 + }, + { + "epoch": 0.019999122845489232, + "grad_norm": 5.926254683053582, + "learning_rate": 4.998741355957963e-06, + "loss": 0.6083, + "step": 228 + }, + { + "epoch": 0.02008683829656594, + "grad_norm": 4.715935033600424, + "learning_rate": 4.998719179528925e-06, + "loss": 0.5764, + "step": 229 + }, + { + "epoch": 0.020174553747642647, + "grad_norm": 4.06642116262848, + "learning_rate": 4.998696809485734e-06, + "loss": 0.6436, + "step": 230 + }, + { + "epoch": 0.020262269198719353, + "grad_norm": 4.060536926809771, + "learning_rate": 4.998674245830123e-06, + "loss": 0.6455, + "step": 231 + }, + { + "epoch": 0.020349984649796062, + "grad_norm": 5.769596888340199, + "learning_rate": 4.9986514885638405e-06, + "loss": 0.6422, + "step": 232 + }, + { + "epoch": 0.020437700100872768, + "grad_norm": 5.619149975421577, + "learning_rate": 4.99862853768865e-06, + "loss": 0.5151, + "step": 233 + }, + { + "epoch": 0.020525415551949477, + "grad_norm": 5.738973149236573, + "learning_rate": 4.998605393206329e-06, + "loss": 0.5698, + "step": 234 
+ }, + { + "epoch": 0.020613131003026183, + "grad_norm": 3.9117936997485443, + "learning_rate": 4.998582055118672e-06, + "loss": 0.6139, + "step": 235 + }, + { + "epoch": 0.02070084645410289, + "grad_norm": 5.594946157519774, + "learning_rate": 4.998558523427488e-06, + "loss": 0.6305, + "step": 236 + }, + { + "epoch": 0.0207885619051796, + "grad_norm": 3.7796595114227816, + "learning_rate": 4.998534798134598e-06, + "loss": 0.6064, + "step": 237 + }, + { + "epoch": 0.020876277356256304, + "grad_norm": 5.530110712124758, + "learning_rate": 4.998510879241842e-06, + "loss": 0.7404, + "step": 238 + }, + { + "epoch": 0.02096399280733301, + "grad_norm": 5.795681054870311, + "learning_rate": 4.998486766751073e-06, + "loss": 0.6637, + "step": 239 + }, + { + "epoch": 0.02105170825840972, + "grad_norm": 5.250443330736557, + "learning_rate": 4.99846246066416e-06, + "loss": 0.7229, + "step": 240 + }, + { + "epoch": 0.021139423709486425, + "grad_norm": 5.307033877732376, + "learning_rate": 4.998437960982985e-06, + "loss": 0.729, + "step": 241 + }, + { + "epoch": 0.021227139160563135, + "grad_norm": 4.264326950314863, + "learning_rate": 4.998413267709446e-06, + "loss": 0.6363, + "step": 242 + }, + { + "epoch": 0.02131485461163984, + "grad_norm": 4.56674428695937, + "learning_rate": 4.99838838084546e-06, + "loss": 0.573, + "step": 243 + }, + { + "epoch": 0.021402570062716546, + "grad_norm": 5.367393577306364, + "learning_rate": 4.998363300392951e-06, + "loss": 0.6187, + "step": 244 + }, + { + "epoch": 0.021490285513793256, + "grad_norm": 5.58627031411974, + "learning_rate": 4.998338026353865e-06, + "loss": 0.635, + "step": 245 + }, + { + "epoch": 0.02157800096486996, + "grad_norm": 4.1536241104050005, + "learning_rate": 4.9983125587301594e-06, + "loss": 0.7296, + "step": 246 + }, + { + "epoch": 0.021665716415946668, + "grad_norm": 5.369955138376355, + "learning_rate": 4.998286897523808e-06, + "loss": 0.5939, + "step": 247 + }, + { + "epoch": 0.021753431867023377, + "grad_norm": 
4.749169550030242, + "learning_rate": 4.998261042736799e-06, + "loss": 0.7125, + "step": 248 + }, + { + "epoch": 0.021841147318100083, + "grad_norm": 3.847851803716185, + "learning_rate": 4.998234994371135e-06, + "loss": 0.6874, + "step": 249 + }, + { + "epoch": 0.021928862769176792, + "grad_norm": 6.3610718821634755, + "learning_rate": 4.998208752428836e-06, + "loss": 0.6839, + "step": 250 + }, + { + "epoch": 0.022016578220253498, + "grad_norm": 6.90892255007994, + "learning_rate": 4.998182316911934e-06, + "loss": 0.6706, + "step": 251 + }, + { + "epoch": 0.022104293671330204, + "grad_norm": 4.842858396629252, + "learning_rate": 4.998155687822478e-06, + "loss": 0.7887, + "step": 252 + }, + { + "epoch": 0.022192009122406913, + "grad_norm": 6.80960196083629, + "learning_rate": 4.99812886516253e-06, + "loss": 0.6891, + "step": 253 + }, + { + "epoch": 0.02227972457348362, + "grad_norm": 6.897100992823047, + "learning_rate": 4.998101848934171e-06, + "loss": 0.7213, + "step": 254 + }, + { + "epoch": 0.022367440024560325, + "grad_norm": 4.383904436150581, + "learning_rate": 4.9980746391394916e-06, + "loss": 0.5472, + "step": 255 + }, + { + "epoch": 0.022455155475637034, + "grad_norm": 6.136102422729719, + "learning_rate": 4.998047235780603e-06, + "loss": 0.7462, + "step": 256 + }, + { + "epoch": 0.02254287092671374, + "grad_norm": 5.873462354540876, + "learning_rate": 4.9980196388596255e-06, + "loss": 0.6893, + "step": 257 + }, + { + "epoch": 0.02263058637779045, + "grad_norm": 5.36389164609212, + "learning_rate": 4.9979918483787e-06, + "loss": 0.725, + "step": 258 + }, + { + "epoch": 0.022718301828867155, + "grad_norm": 6.634852411669424, + "learning_rate": 4.997963864339978e-06, + "loss": 0.7619, + "step": 259 + }, + { + "epoch": 0.02280601727994386, + "grad_norm": 4.201015694891079, + "learning_rate": 4.99793568674563e-06, + "loss": 0.653, + "step": 260 + }, + { + "epoch": 0.02289373273102057, + "grad_norm": 4.951129353141893, + "learning_rate": 4.997907315597836e-06, 
+ "loss": 0.7543, + "step": 261 + }, + { + "epoch": 0.022981448182097276, + "grad_norm": 4.331792323630216, + "learning_rate": 4.997878750898798e-06, + "loss": 0.6553, + "step": 262 + }, + { + "epoch": 0.023069163633173982, + "grad_norm": 4.764837636647203, + "learning_rate": 4.997849992650727e-06, + "loss": 0.719, + "step": 263 + }, + { + "epoch": 0.02315687908425069, + "grad_norm": 7.315146297212186, + "learning_rate": 4.997821040855852e-06, + "loss": 0.8217, + "step": 264 + }, + { + "epoch": 0.023244594535327397, + "grad_norm": 4.5164891139288015, + "learning_rate": 4.997791895516417e-06, + "loss": 0.5553, + "step": 265 + }, + { + "epoch": 0.023332309986404107, + "grad_norm": 4.651549875308793, + "learning_rate": 4.99776255663468e-06, + "loss": 0.6981, + "step": 266 + }, + { + "epoch": 0.023420025437480813, + "grad_norm": 4.941120481014187, + "learning_rate": 4.997733024212913e-06, + "loss": 0.604, + "step": 267 + }, + { + "epoch": 0.02350774088855752, + "grad_norm": 6.3616778757465315, + "learning_rate": 4.997703298253406e-06, + "loss": 0.7253, + "step": 268 + }, + { + "epoch": 0.023595456339634228, + "grad_norm": 4.723855693485358, + "learning_rate": 4.997673378758462e-06, + "loss": 0.7335, + "step": 269 + }, + { + "epoch": 0.023683171790710934, + "grad_norm": 4.336523073382538, + "learning_rate": 4.997643265730399e-06, + "loss": 0.5665, + "step": 270 + }, + { + "epoch": 0.02377088724178764, + "grad_norm": 6.547875149524498, + "learning_rate": 4.997612959171549e-06, + "loss": 0.6542, + "step": 271 + }, + { + "epoch": 0.02385860269286435, + "grad_norm": 5.285021138793967, + "learning_rate": 4.997582459084264e-06, + "loss": 0.7824, + "step": 272 + }, + { + "epoch": 0.023946318143941055, + "grad_norm": 4.447718203152539, + "learning_rate": 4.9975517654709025e-06, + "loss": 0.6728, + "step": 273 + }, + { + "epoch": 0.024034033595017764, + "grad_norm": 4.323105158596241, + "learning_rate": 4.997520878333847e-06, + "loss": 0.6516, + "step": 274 + }, + { + "epoch": 
0.02412174904609447, + "grad_norm": 4.091596093860627, + "learning_rate": 4.997489797675489e-06, + "loss": 0.5786, + "step": 275 + }, + { + "epoch": 0.024209464497171176, + "grad_norm": 4.50262054947591, + "learning_rate": 4.997458523498236e-06, + "loss": 0.6632, + "step": 276 + }, + { + "epoch": 0.024297179948247885, + "grad_norm": 5.394966563241667, + "learning_rate": 4.997427055804513e-06, + "loss": 0.7415, + "step": 277 + }, + { + "epoch": 0.02438489539932459, + "grad_norm": 5.134838704391961, + "learning_rate": 4.9973953945967565e-06, + "loss": 0.6225, + "step": 278 + }, + { + "epoch": 0.024472610850401297, + "grad_norm": 4.555937935551801, + "learning_rate": 4.9973635398774226e-06, + "loss": 0.7451, + "step": 279 + }, + { + "epoch": 0.024560326301478006, + "grad_norm": 4.014041307501394, + "learning_rate": 4.997331491648976e-06, + "loss": 0.607, + "step": 280 + }, + { + "epoch": 0.024648041752554712, + "grad_norm": 5.398424400960683, + "learning_rate": 4.9972992499139025e-06, + "loss": 0.665, + "step": 281 + }, + { + "epoch": 0.02473575720363142, + "grad_norm": 6.959554022697295, + "learning_rate": 4.9972668146746995e-06, + "loss": 0.8175, + "step": 282 + }, + { + "epoch": 0.024823472654708127, + "grad_norm": 5.048396931572014, + "learning_rate": 4.997234185933879e-06, + "loss": 0.6961, + "step": 283 + }, + { + "epoch": 0.024911188105784833, + "grad_norm": 4.737474855724115, + "learning_rate": 4.997201363693972e-06, + "loss": 0.5337, + "step": 284 + }, + { + "epoch": 0.024998903556861542, + "grad_norm": 7.374843310231967, + "learning_rate": 4.997168347957521e-06, + "loss": 0.6791, + "step": 285 + }, + { + "epoch": 0.025086619007938248, + "grad_norm": 4.306967488515473, + "learning_rate": 4.997135138727081e-06, + "loss": 0.8791, + "step": 286 + }, + { + "epoch": 0.025174334459014954, + "grad_norm": 3.7949900410813737, + "learning_rate": 4.99710173600523e-06, + "loss": 0.7743, + "step": 287 + }, + { + "epoch": 0.025262049910091663, + "grad_norm": 
4.842604758031469, + "learning_rate": 4.997068139794554e-06, + "loss": 0.6602, + "step": 288 + }, + { + "epoch": 0.02534976536116837, + "grad_norm": 3.531764677671023, + "learning_rate": 4.9970343500976545e-06, + "loss": 0.6317, + "step": 289 + }, + { + "epoch": 0.02543748081224508, + "grad_norm": 5.68234167540357, + "learning_rate": 4.997000366917153e-06, + "loss": 0.7404, + "step": 290 + }, + { + "epoch": 0.025525196263321785, + "grad_norm": 4.623883782994243, + "learning_rate": 4.9969661902556804e-06, + "loss": 0.6093, + "step": 291 + }, + { + "epoch": 0.02561291171439849, + "grad_norm": 5.9956405593570175, + "learning_rate": 4.996931820115885e-06, + "loss": 0.6773, + "step": 292 + }, + { + "epoch": 0.0257006271654752, + "grad_norm": 5.06274620174889, + "learning_rate": 4.996897256500433e-06, + "loss": 0.7249, + "step": 293 + }, + { + "epoch": 0.025788342616551906, + "grad_norm": 5.989915075597491, + "learning_rate": 4.996862499411998e-06, + "loss": 0.7526, + "step": 294 + }, + { + "epoch": 0.02587605806762861, + "grad_norm": 4.58567195302804, + "learning_rate": 4.996827548853276e-06, + "loss": 0.6762, + "step": 295 + }, + { + "epoch": 0.02596377351870532, + "grad_norm": 4.097368677404026, + "learning_rate": 4.996792404826974e-06, + "loss": 0.6238, + "step": 296 + }, + { + "epoch": 0.026051488969782027, + "grad_norm": 4.021749832913485, + "learning_rate": 4.996757067335816e-06, + "loss": 0.7958, + "step": 297 + }, + { + "epoch": 0.026139204420858736, + "grad_norm": 4.679522912267575, + "learning_rate": 4.99672153638254e-06, + "loss": 0.6583, + "step": 298 + }, + { + "epoch": 0.026226919871935442, + "grad_norm": 4.256974035317045, + "learning_rate": 4.996685811969898e-06, + "loss": 0.6464, + "step": 299 + }, + { + "epoch": 0.026314635323012148, + "grad_norm": 4.4862335847168096, + "learning_rate": 4.996649894100659e-06, + "loss": 0.6116, + "step": 300 + } + ], + "logging_steps": 1, + "max_steps": 11400, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + 
"save_steps": 150, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7802380615680.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..6c49fc6 --- /dev/null +++ b/vocab.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910 +size 2776833