From 2ad30c6bc3866451403f79377098d6b1c18413bc Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Sun, 7 Jun 2026 12:08:13 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: Weyaxi/Einstein-v7-Qwen2-7B Source: Original Platform --- .gitattributes | 54 + README.md | 430 ++ added_tokens.json | 6 + checkpoint-500/added_tokens.json | 6 + checkpoint-500/config.json | 27 + checkpoint-500/generation_config.json | 7 + ..._zero_pp_rank_0_mp_rank_00_optim_states.pt | 3 + ..._zero_pp_rank_1_mp_rank_00_optim_states.pt | 3 + ..._zero_pp_rank_2_mp_rank_00_optim_states.pt | 3 + ..._zero_pp_rank_3_mp_rank_00_optim_states.pt | 3 + ..._zero_pp_rank_4_mp_rank_00_optim_states.pt | 3 + ..._zero_pp_rank_5_mp_rank_00_optim_states.pt | 3 + ..._zero_pp_rank_6_mp_rank_00_optim_states.pt | 3 + ..._zero_pp_rank_7_mp_rank_00_optim_states.pt | 3 + .../zero_pp_rank_0_mp_rank_00_model_states.pt | 3 + .../zero_pp_rank_1_mp_rank_00_model_states.pt | 3 + .../zero_pp_rank_2_mp_rank_00_model_states.pt | 3 + .../zero_pp_rank_3_mp_rank_00_model_states.pt | 3 + .../zero_pp_rank_4_mp_rank_00_model_states.pt | 3 + .../zero_pp_rank_5_mp_rank_00_model_states.pt | 3 + .../zero_pp_rank_6_mp_rank_00_model_states.pt | 3 + .../zero_pp_rank_7_mp_rank_00_model_states.pt | 3 + checkpoint-500/latest | 1 + checkpoint-500/merges.txt | 3 + .../model-00001-of-00004.safetensors | 3 + .../model-00002-of-00004.safetensors | 3 + .../model-00003-of-00004.safetensors | 3 + .../model-00004-of-00004.safetensors | 3 + checkpoint-500/model.safetensors.index.json | 346 ++ checkpoint-500/rng_state_0.pth | 3 + checkpoint-500/rng_state_1.pth | 3 + checkpoint-500/rng_state_2.pth | 3 + checkpoint-500/rng_state_3.pth | 3 + checkpoint-500/rng_state_4.pth | 3 + checkpoint-500/rng_state_5.pth | 3 + 
checkpoint-500/rng_state_6.pth | 3 + checkpoint-500/rng_state_7.pth | 3 + checkpoint-500/scheduler.pt | 3 + checkpoint-500/special_tokens_map.json | 20 + checkpoint-500/tokenizer.json | 3 + checkpoint-500/tokenizer_config.json | 51 + checkpoint-500/trainer_state.json | 3561 +++++++++++++++++ checkpoint-500/training_args.bin | 3 + checkpoint-500/vocab.json | 3 + checkpoint-500/zero_to_fp32.py | 604 +++ config.json | 27 + configuration.json | 1 + generation_config.json | 7 + merges.txt | 3 + model-00001-of-00004.safetensors | 3 + model-00002-of-00004.safetensors | 3 + model-00003-of-00004.safetensors | 3 + model-00004-of-00004.safetensors | 3 + model.safetensors.index.json | 346 ++ special_tokens_map.json | 20 + tokenizer.json | 3 + tokenizer_config.json | 51 + trainer_state.json | 3561 +++++++++++++++++ training_args.bin | 3 + vocab.json | 3 + 60 files changed, 9249 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 added_tokens.json create mode 100644 checkpoint-500/added_tokens.json create mode 100644 checkpoint-500/config.json create mode 100644 checkpoint-500/generation_config.json create mode 100644 checkpoint-500/global_step500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt create mode 100644 checkpoint-500/global_step500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt create mode 100644 checkpoint-500/global_step500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt create mode 100644 checkpoint-500/global_step500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt create mode 100644 checkpoint-500/global_step500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt create mode 100644 checkpoint-500/global_step500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt create mode 100644 checkpoint-500/global_step500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt create mode 100644 checkpoint-500/global_step500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt create mode 100644 
checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_model_states.pt create mode 100644 checkpoint-500/global_step500/zero_pp_rank_1_mp_rank_00_model_states.pt create mode 100644 checkpoint-500/global_step500/zero_pp_rank_2_mp_rank_00_model_states.pt create mode 100644 checkpoint-500/global_step500/zero_pp_rank_3_mp_rank_00_model_states.pt create mode 100644 checkpoint-500/global_step500/zero_pp_rank_4_mp_rank_00_model_states.pt create mode 100644 checkpoint-500/global_step500/zero_pp_rank_5_mp_rank_00_model_states.pt create mode 100644 checkpoint-500/global_step500/zero_pp_rank_6_mp_rank_00_model_states.pt create mode 100644 checkpoint-500/global_step500/zero_pp_rank_7_mp_rank_00_model_states.pt create mode 100644 checkpoint-500/latest create mode 100644 checkpoint-500/merges.txt create mode 100644 checkpoint-500/model-00001-of-00004.safetensors create mode 100644 checkpoint-500/model-00002-of-00004.safetensors create mode 100644 checkpoint-500/model-00003-of-00004.safetensors create mode 100644 checkpoint-500/model-00004-of-00004.safetensors create mode 100644 checkpoint-500/model.safetensors.index.json create mode 100644 checkpoint-500/rng_state_0.pth create mode 100644 checkpoint-500/rng_state_1.pth create mode 100644 checkpoint-500/rng_state_2.pth create mode 100644 checkpoint-500/rng_state_3.pth create mode 100644 checkpoint-500/rng_state_4.pth create mode 100644 checkpoint-500/rng_state_5.pth create mode 100644 checkpoint-500/rng_state_6.pth create mode 100644 checkpoint-500/rng_state_7.pth create mode 100644 checkpoint-500/scheduler.pt create mode 100644 checkpoint-500/special_tokens_map.json create mode 100644 checkpoint-500/tokenizer.json create mode 100644 checkpoint-500/tokenizer_config.json create mode 100644 checkpoint-500/trainer_state.json create mode 100644 checkpoint-500/training_args.bin create mode 100644 checkpoint-500/vocab.json create mode 100644 checkpoint-500/zero_to_fp32.py create mode 100644 config.json create mode 100644 
configuration.json create mode 100644 generation_config.json create mode 100644 merges.txt create mode 100644 model-00001-of-00004.safetensors create mode 100644 model-00002-of-00004.safetensors create mode 100644 model-00003-of-00004.safetensors create mode 100644 model-00004-of-00004.safetensors create mode 100644 model.safetensors.index.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 trainer_state.json create mode 100644 training_args.bin create mode 100644 vocab.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..73f0d7a --- /dev/null +++ b/.gitattributes @@ -0,0 +1,54 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text 
+**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +checkpoint-500/merges.txt filter=lfs diff=lfs merge=lfs -text +checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-500/vocab.json filter=lfs diff=lfs merge=lfs -text +merges.txt filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +vocab.json filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..b8d9a33 --- /dev/null +++ b/README.md @@ -0,0 +1,430 @@ +--- +language: +- en +license: other +tags: +- axolotl +- instruct +- finetune +- chatml +- gpt4 +- synthetic data +- science +- physics +- chemistry +- biology +- math +- qwen +- qwen2 +base_model: Qwen/Qwen2-7B +datasets: +- allenai/ai2_arc +- camel-ai/physics +- camel-ai/chemistry +- camel-ai/biology +- camel-ai/math +- metaeval/reclor +- openbookqa +- mandyyyyii/scibench +- derek-thomas/ScienceQA +- TIGER-Lab/ScienceEval +- jondurbin/airoboros-3.2 +- LDJnr/Capybara +- Cot-Alpaca-GPT4-From-OpenHermes-2.5 +- STEM-AI-mtl/Electrical-engineering +- knowrohit07/saraswati-stem +- sablo/oasst2_curated +- lmsys/lmsys-chat-1m +- TIGER-Lab/MathInstruct +- 
bigbio/med_qa +- meta-math/MetaMathQA-40K +- openbookqa +- piqa +- metaeval/reclor +- derek-thomas/ScienceQA +- scibench +- sciq +- Open-Orca/SlimOrca +- migtissera/Synthia-v1.3 +- TIGER-Lab/ScienceEval +- allenai/WildChat +- microsoft/orca-math-word-problems-200k +- openchat/openchat_sharegpt4_dataset +- teknium/GPTeacher-General-Instruct +- m-a-p/CodeFeedback-Filtered-Instruction +- totally-not-an-llm/EverythingLM-data-V3 +- HuggingFaceH4/no_robots +- OpenAssistant/oasst_top1_2023-08-25 +- WizardLM/WizardLM_evol_instruct_70k +- abacusai/SystemChat-1.1 +- H-D-T/Buzz-V1.2 +model-index: +- name: Einstein-v7-Qwen2-7B + results: + - task: + type: text-generation + name: Text Generation + dataset: + name: IFEval (0-Shot) + type: HuggingFaceH4/ifeval + args: + num_few_shot: 0 + metrics: + - type: inst_level_strict_acc and prompt_level_strict_acc + value: 41.0 + name: strict accuracy + source: + url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=Weyaxi/Einstein-v7-Qwen2-7B + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: BBH (3-Shot) + type: BBH + args: + num_few_shot: 3 + metrics: + - type: acc_norm + value: 32.84 + name: normalized accuracy + source: + url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=Weyaxi/Einstein-v7-Qwen2-7B + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: MATH Lvl 5 (4-Shot) + type: hendrycks/competition_math + args: + num_few_shot: 4 + metrics: + - type: exact_match + value: 15.18 + name: exact match + source: + url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=Weyaxi/Einstein-v7-Qwen2-7B + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: GPQA (0-shot) + type: Idavidrein/gpqa + args: + num_few_shot: 0 + metrics: + - type: acc_norm + value: 6.6 + name: acc_norm + source: + url: 
https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=Weyaxi/Einstein-v7-Qwen2-7B + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: MuSR (0-shot) + type: TAUR-Lab/MuSR + args: + num_few_shot: 0 + metrics: + - type: acc_norm + value: 14.06 + name: acc_norm + source: + url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=Weyaxi/Einstein-v7-Qwen2-7B + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: MMLU-PRO (5-shot) + type: TIGER-Lab/MMLU-Pro + config: main + split: test + args: + num_few_shot: 5 + metrics: + - type: acc + value: 34.4 + name: accuracy + source: + url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=Weyaxi/Einstein-v7-Qwen2-7B + name: Open LLM Leaderboard +--- +![image/png](https://cdn-uploads.huggingface.co/production/uploads/6468ce47e134d050a58aa89c/KLQP1jK-DIzpwHzYRIH-Q.png) + +# 🔬 Einstein-v7-Qwen2-7B + +This model is a full fine-tuned version of [Qwen/Qwen2-7B](https://huggingface.co/Qwen/Qwen2-7B) on diverse datasets. + +This model is finetuned using `8xMI300X` using [axolotl](https://github.com/OpenAccess-AI-Collective/axolotl). + +This model has been trained using compute resources from [TensorWave](https://tensorwave.com/). + +
See axolotl config + +axolotl version: `0.4.0` +```yaml +base_model: Qwen/Qwen2-7B +model_type: AutoModelForCausalLM +tokenizer_type: AutoTokenizer + +load_in_8bit: false +load_in_4bit: false +strict: false + +chat_template: chatml +datasets: + - path: data/airoboros_3.2_without_contextual_slimorca_orca_sharegpt.json + ds_type: json + type: sharegpt + conversation: chatml + + - path: data/allenai_wild_chat_gpt4_english_toxic_random_half_4k_sharegpt.json + ds_type: json + type: sharegpt + strict: false + conversation: chatml + + - path: data/buzz_unstacked_chosen_math_removed_filtered.json + ds_type: json + type: alpaca + conversation: chatml + + - path: data/capybara_sharegpt.json + ds_type: json + type: sharegpt + conversation: chatml + + - path: data/cot_alpaca_gpt4_extracted_openhermes_2.5_sharegpt.json + ds_type: json + type: sharegpt + conversation: chatml + + - path: data/everythinglm-data-v3_sharegpt.json + ds_type: json + type: sharegpt + strict: false + conversation: chatml + + - path: data/gpt4_data_lmys_1m_sharegpt.json + ds_type: json + type: sharegpt + conversation: chatml + + - path: data/gpteacher-instruct-special-alpaca.json + ds_type: json + type: gpteacher + conversation: chatml + + - path: data/merged_all.json + ds_type: json + type: alpaca + conversation: chatml + + - path: data/no_robots_sharegpt.json + ds_type: json + type: sharegpt + strict: false + conversation: chatml + + - path: data/oasst_top1_from_fusechatmixture_sharegpt.json + ds_type: json + type: sharegpt + strict: false + conversation: chatml + + - path: data/pippa_bagel_repo_3k_sharegpt.json + ds_type: json + type: sharegpt + conversation: chatml + + - path: data/rpguild_quarter_alignment_lab_sharegpt.json + ds_type: json + type: sharegpt + conversation: chatml + + - path: data/sharegpt_gpt4_english.json + ds_type: json + type: sharegpt + conversation: chatml + + - path: data/slimorca_dedup_filtered_95k_sharegpt.json + ds_type: json + type: sharegpt + conversation: chatml + + - 
path: data/soda_diaolog_longest_tenth_buzz_sharegpt.json + ds_type: json + type: sharegpt + conversation: chatml + + - path: data/synthia-v1.3_sharegpt_12500.json + ds_type: json + type: sharegpt + conversation: chatml + + - path: data/system_conversations_dolphin_sharegpt.json + ds_type: json + type: sharegpt + conversation: chatml + +dataset_prepared_path: last_run_prepared +val_set_size: 0.002 + +output_dir: ./Einstein-v7-Qwen2-7B-model + +sequence_len: 8192 +sample_packing: true +pad_to_sequence_len: true +eval_sample_packing: false + +wandb_project: Einstein +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: +hub_model_id: Weyaxi/Einstein-v7-Qwen2-7B + +gradient_accumulation_steps: 4 +micro_batch_size: 6 +num_epochs: 2 +optimizer: paged_adamw_8bit +lr_scheduler: cosine +learning_rate: 0.00001 # look + +train_on_inputs: false +group_by_length: false +bf16: auto +fp16: +tf32: false + +gradient_checkpointing: unsloth +gradient_checkpointing_kwargs: + use_reentrant: true # look +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +evals_per_epoch: 2 +eval_table_size: +eval_max_new_tokens: 128 +saves_per_epoch: 1 +debug: + +deepspeed: deepspeed_configs/zero3_bf16.json +weight_decay: 0.05 +fsdp: +fsdp_config: +special_tokens: + eos_token: "<|im_end|>" + pad_token: "<|end_of_text|>" +tokens: + - "<|im_start|>" + - "<|im_end|>" +``` + +

+ +# 💬 Prompt Template + +You can use the ChatML prompt template when using the model: + +### ChatML + +``` +<|im_start|>system +{system}<|im_end|> +<|im_start|>user +{user}<|im_end|> +<|im_start|>assistant +{assistant}<|im_end|> +``` + +This prompt template is available as a [chat template](https://huggingface.co/docs/transformers/main/chat_templating), which means you can format messages using the +`tokenizer.apply_chat_template()` method: + +```python +messages = [ + {"role": "system", "content": "You are a helpful AI assistant."}, + {"role": "user", "content": "Hello!"} +] +gen_input = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True) +model.generate(**gen_input) +``` + +# 📊 Datasets used in this model + +The datasets used to train this model are listed in the metadata section of the model card. + +Please note that certain datasets mentioned in the metadata may have undergone filtering based on various criteria. + +The results of this filtering process and its outcomes are in a different repository: + +[Weyaxi/sci-datasets/main](https://huggingface.co/datasets/Weyaxi/sci-datasets/tree/main) + +# 🔄 Quantized versions + +## GGUF [@bartowski](https://huggingface.co/bartowski) + +- https://huggingface.co/bartowski/Einstein-v7-Qwen2-7B-GGUF + +## ExLlamaV2 [@bartowski](https://huggingface.co/bartowski) + +- https://huggingface.co/bartowski/Einstein-v7-Qwen2-7B-exl2 + +# 🎯 [Open LLM Leaderboard v2 Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard) +Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_Weyaxi__Einstein-v7-Qwen2-7B) + +| Metric |Value| +|-------------------|----:| +|Avg. 
|24.01| +|IFEval (0-Shot) |41.00| +|BBH (3-Shot) |32.84| +|MATH Lvl 5 (4-Shot)|15.18| +|GPQA (0-shot) | 6.60| +|MuSR (0-shot) |14.06| +|MMLU-PRO (5-shot) |34.40| + +# 📚 Some resources, discussions and reviews about this model + +#### 🐦 Announcement tweet: + +- https://twitter.com/Weyaxi/status/1809644014515154961 + +#### 🔍 Reddit post in r/LocalLLaMA: + +- https://www.reddit.com/r/LocalLLaMA/comments/1dy6o4l/introducing_einstein_v7_based_on_the_qwen2_7b/ + +# 🤖 Additional information about training + +This model was fully fine-tuned for 2 epochs. + +The total number of steps was 500. + +
Loss graph + +![image/png](https://cdn-uploads.huggingface.co/production/uploads/6468ce47e134d050a58aa89c/bkJGgh_JUfKeRlTLo_ZcB.png) + +

+ +# 🤝 Acknowledgments + +Thanks to all the dataset authors mentioned in the datasets section. + +Thanks to [axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) for making the repository I used to make this model. + +Thanks to all open source AI community. + +[Built with Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) + +If you would like to support me: + +[☕ Buy Me a Coffee](https://www.buymeacoffee.com/weyaxi) diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..cf6e896 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,6 @@ +{ + "<|end_of_text|>": 151646, + "<|endoftext|>": 151643, + "<|im_end|>": 151645, + "<|im_start|>": 151644 +} diff --git a/checkpoint-500/added_tokens.json b/checkpoint-500/added_tokens.json new file mode 100644 index 0000000..cf6e896 --- /dev/null +++ b/checkpoint-500/added_tokens.json @@ -0,0 +1,6 @@ +{ + "<|end_of_text|>": 151646, + "<|endoftext|>": 151643, + "<|im_end|>": 151645, + "<|im_start|>": 151644 +} diff --git a/checkpoint-500/config.json b/checkpoint-500/config.json new file mode 100644 index 0000000..e858a09 --- /dev/null +++ b/checkpoint-500/config.json @@ -0,0 +1,27 @@ +{ + "_name_or_path": "Qwen/Qwen2-7B", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 131072, + "max_window_layers": 28, + "model_type": "qwen2", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.40.0.dev0", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 152064 +} diff --git a/checkpoint-500/generation_config.json b/checkpoint-500/generation_config.json new file mode 100644 index 0000000..85199a2 
--- /dev/null +++ b/checkpoint-500/generation_config.json @@ -0,0 +1,7 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": 151643, + "max_new_tokens": 2048, + "transformers_version": "4.40.0.dev0" +} diff --git a/checkpoint-500/global_step500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-500/global_step500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..e3ccb0a --- /dev/null +++ b/checkpoint-500/global_step500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:226020039fecd1670e02302ba8d2124ccc49580310fa41ce7984f538b3d16766 +size 5716169251 diff --git a/checkpoint-500/global_step500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-500/global_step500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..6d86153 --- /dev/null +++ b/checkpoint-500/global_step500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1199951eb26c1a658f27ecf4ea19136921370f8c2c36a6e499d12f0ae86c354 +size 5716169251 diff --git a/checkpoint-500/global_step500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-500/global_step500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..60e74f2 --- /dev/null +++ b/checkpoint-500/global_step500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83ae7b10c5ec2a2767c728b6985cc362f79fc621021130cce9eb82c7274bc90c +size 5716169251 diff --git a/checkpoint-500/global_step500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-500/global_step500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..2c97991 --- /dev/null +++ b/checkpoint-500/global_step500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d226b9dc33d0558acc1e7519686ba1fab16f952d4f5fc4be1712a5f6421e59bd +size 5716169251 diff --git a/checkpoint-500/global_step500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-500/global_step500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..d0d0d0e --- /dev/null +++ b/checkpoint-500/global_step500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65d4f2403432a6049565b7212b2eb226baab152ec0ce6e733b5c8954b38fc867 +size 5716169251 diff --git a/checkpoint-500/global_step500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-500/global_step500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..40b770a --- /dev/null +++ b/checkpoint-500/global_step500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eeece00898602ab131f81c4491caaaaf507351a3f40e7fd308d6767353485850 +size 5716169251 diff --git a/checkpoint-500/global_step500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-500/global_step500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..579239c --- /dev/null +++ b/checkpoint-500/global_step500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c05cead029cd470b5533740e9165be56e8efd224f5972f4ea8a780360b2ebdcf +size 5716169251 diff --git a/checkpoint-500/global_step500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-500/global_step500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..863922a --- /dev/null +++ b/checkpoint-500/global_step500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d342ac1ed4b7e6a3bd1e6a98e8c3921aba9e855b336a5680b0345da1173e993f +size 5716169251 diff --git 
a/checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000..3930701 --- /dev/null +++ b/checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a5f982290cc9a877338f69b8c20701b958a19e007ec5699307c0a8199264325 +size 171605 diff --git a/checkpoint-500/global_step500/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-500/global_step500/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000..018a4c0 --- /dev/null +++ b/checkpoint-500/global_step500/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:689394a6137430bef959c423ec75415ee05814e05a2fda78d892475556074cd2 +size 171605 diff --git a/checkpoint-500/global_step500/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-500/global_step500/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000..15b0898 --- /dev/null +++ b/checkpoint-500/global_step500/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae652cca71c22c7a58e86deab6e000979a9285be725fe2caafaae5314cfb2d85 +size 171605 diff --git a/checkpoint-500/global_step500/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-500/global_step500/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000..7723de3 --- /dev/null +++ b/checkpoint-500/global_step500/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:452532268dd0c41b5a07e3d8641bd75e8a54c90c420b074d606948c0a834f174 +size 171605 diff --git a/checkpoint-500/global_step500/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-500/global_step500/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000..26a299e --- /dev/null +++ 
b/checkpoint-500/global_step500/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92b5d1cb07d60aaba8b91ec8cba3adc3a2b3da4e12f103e63888c0f0a9a33807 +size 171605 diff --git a/checkpoint-500/global_step500/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-500/global_step500/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000..6510d67 --- /dev/null +++ b/checkpoint-500/global_step500/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6459e6cc0f3738c7f4bef34d23c484fad1ff5dcc2f123033eec0daedd73b354 +size 171605 diff --git a/checkpoint-500/global_step500/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-500/global_step500/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000..14ce64a --- /dev/null +++ b/checkpoint-500/global_step500/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dddf9aa1dbaf0833a7d190dc973e32da8b337f7e7d418f2ef56355cca12ae8a9 +size 171605 diff --git a/checkpoint-500/global_step500/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-500/global_step500/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000..2ca9fc0 --- /dev/null +++ b/checkpoint-500/global_step500/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50740920ed0f156846d230aacda455448a356949592f9fb78cd62d8388a2a2cd +size 171605 diff --git a/checkpoint-500/latest b/checkpoint-500/latest new file mode 100644 index 0000000..f0b47ce --- /dev/null +++ b/checkpoint-500/latest @@ -0,0 +1 @@ +global_step500 \ No newline at end of file diff --git a/checkpoint-500/merges.txt b/checkpoint-500/merges.txt new file mode 100644 index 0000000..80c1a19 --- /dev/null +++ b/checkpoint-500/merges.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5 +size 1671853 diff --git a/checkpoint-500/model-00001-of-00004.safetensors b/checkpoint-500/model-00001-of-00004.safetensors new file mode 100644 index 0000000..3fd8dab --- /dev/null +++ b/checkpoint-500/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f421710e83034813f0366192f32dd36a6005990365885d2b7b3fad1f95ee71a1 +size 4877660776 diff --git a/checkpoint-500/model-00002-of-00004.safetensors b/checkpoint-500/model-00002-of-00004.safetensors new file mode 100644 index 0000000..b7bc6db --- /dev/null +++ b/checkpoint-500/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90ee3add1c94133fccf2b4c5a11fdea6167e7de07c2f58f1cbfc8e7da0844518 +size 4932751008 diff --git a/checkpoint-500/model-00003-of-00004.safetensors b/checkpoint-500/model-00003-of-00004.safetensors new file mode 100644 index 0000000..9e3b0af --- /dev/null +++ b/checkpoint-500/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61e3e7d2c4b53ec95c1ad1e8a2c2770709a5ab20cb556486922f3722569615e8 +size 4330865200 diff --git a/checkpoint-500/model-00004-of-00004.safetensors b/checkpoint-500/model-00004-of-00004.safetensors new file mode 100644 index 0000000..7193b5f --- /dev/null +++ b/checkpoint-500/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7bb33e9d51c137141218117b751ad1da933bd40ac5c30ca729aea7e79c68ed7 +size 1089994880 diff --git a/checkpoint-500/model.safetensors.index.json b/checkpoint-500/model.safetensors.index.json new file mode 100644 index 0000000..6ca5084 --- /dev/null +++ b/checkpoint-500/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 15231233024 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": 
"model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": 
"model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + 
"model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", 
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": 
"model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + 
"model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.bias": 
"model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + 
"model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + 
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00003-of-00004.safetensors" + } +} diff --git a/checkpoint-500/rng_state_0.pth b/checkpoint-500/rng_state_0.pth new file mode 100644 index 0000000..7b7ff31 --- /dev/null +++ b/checkpoint-500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31a89b0384f9bb1822e4729d969cea6e7ee72e8284f449afe40d72529b41495f +size 15984 diff --git a/checkpoint-500/rng_state_1.pth b/checkpoint-500/rng_state_1.pth new file mode 100644 index 0000000..c690e78 --- /dev/null +++ b/checkpoint-500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1f63b4012f7c44010911fca257140455b8dcc0348facde0081110fb01a2f4b3 +size 15984 diff --git a/checkpoint-500/rng_state_2.pth b/checkpoint-500/rng_state_2.pth new file mode 100644 index 
0000000..76dc906 --- /dev/null +++ b/checkpoint-500/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c8269445fce1fb57423c61c2fbea5530b846483a0f14d361f41387cbc698ee0 +size 15984 diff --git a/checkpoint-500/rng_state_3.pth b/checkpoint-500/rng_state_3.pth new file mode 100644 index 0000000..1f33e20 --- /dev/null +++ b/checkpoint-500/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0e477ac830ad4ca17a759dbe236dae36761137b0b63d36efc7601491878041f +size 15984 diff --git a/checkpoint-500/rng_state_4.pth b/checkpoint-500/rng_state_4.pth new file mode 100644 index 0000000..036d901 --- /dev/null +++ b/checkpoint-500/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39949e929ac968a737922a436fd778057a7a82cfc15aee973b3a9ee99b62bd08 +size 15984 diff --git a/checkpoint-500/rng_state_5.pth b/checkpoint-500/rng_state_5.pth new file mode 100644 index 0000000..d0b0869 --- /dev/null +++ b/checkpoint-500/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aa4704a1dedee3f2891294940d4109acf4223e8c1c28954bd988ce0426e7c25 +size 15984 diff --git a/checkpoint-500/rng_state_6.pth b/checkpoint-500/rng_state_6.pth new file mode 100644 index 0000000..95c8764 --- /dev/null +++ b/checkpoint-500/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad2a762f21c2627156c6e10036844c58d0324aa751a1aa366677ddbc962fc5f7 +size 15984 diff --git a/checkpoint-500/rng_state_7.pth b/checkpoint-500/rng_state_7.pth new file mode 100644 index 0000000..ff6e897 --- /dev/null +++ b/checkpoint-500/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:041214fd2afb837bfa2045787db59e297593ae66156089783d80a17a7857e109 +size 15984 diff --git a/checkpoint-500/scheduler.pt b/checkpoint-500/scheduler.pt new file mode 100644 index 0000000..7aac709 --- /dev/null +++ b/checkpoint-500/scheduler.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bffd28705f667fd2d53cbf38bdbf3ad68a22d34ececb929729232ad695ef0953 +size 1064 diff --git a/checkpoint-500/special_tokens_map.json b/checkpoint-500/special_tokens_map.json new file mode 100644 index 0000000..2938465 --- /dev/null +++ b/checkpoint-500/special_tokens_map.json @@ -0,0 +1,20 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>" + ], + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-500/tokenizer.json b/checkpoint-500/tokenizer.json new file mode 100644 index 0000000..3ad2571 --- /dev/null +++ b/checkpoint-500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14af073bd227de69ecbeab6dc6afdcae303bbbc1887df836548350491ecf3b67 +size 7028209 diff --git a/checkpoint-500/tokenizer_config.json b/checkpoint-500/tokenizer_config.json new file mode 100644 index 0000000..186e976 --- /dev/null +++ b/checkpoint-500/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>" + ], + "bos_token": null, + 
"chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "model_max_length": 32768, + "pad_token": "<|end_of_text|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-500/trainer_state.json b/checkpoint-500/trainer_state.json new file mode 100644 index 0000000..3019e97 --- /dev/null +++ b/checkpoint-500/trainer_state.json @@ -0,0 +1,3561 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9740777666999003, + "eval_steps": 125, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 1.8329473944625845, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9189, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 0.8840048313140869, + "eval_runtime": 99.9262, + "eval_samples_per_second": 17.693, + "eval_steps_per_second": 0.37, + "step": 1 + }, + { + "epoch": 0.01, + "grad_norm": 1.7916344264608899, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8962, + "step": 2 + }, + { + "epoch": 0.01, + "grad_norm": 1.8909931480365287, + "learning_rate": 3e-06, + "loss": 0.8805, + "step": 3 + }, + { + "epoch": 0.02, + "grad_norm": 1.6318273112027453, + "learning_rate": 4.000000000000001e-06, + "loss": 0.913, + "step": 4 + }, + { + "epoch": 0.02, + "grad_norm": 1.2463401136319747, + "learning_rate": 5e-06, + "loss": 0.908, + "step": 5 + }, + { + "epoch": 0.02, + "grad_norm": 1.1463980681106876, + "learning_rate": 6e-06, + "loss": 0.8729, + "step": 6 + }, + { + "epoch": 0.03, + "grad_norm": 0.9477573494094379, + 
"learning_rate": 7e-06, + "loss": 0.8411, + "step": 7 + }, + { + "epoch": 0.03, + "grad_norm": 4.0165120162042935, + "learning_rate": 8.000000000000001e-06, + "loss": 1.0541, + "step": 8 + }, + { + "epoch": 0.04, + "grad_norm": 1.0713771331971476, + "learning_rate": 9e-06, + "loss": 0.8495, + "step": 9 + }, + { + "epoch": 0.04, + "grad_norm": 0.8667235558943894, + "learning_rate": 1e-05, + "loss": 0.8199, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 0.7411429457268661, + "learning_rate": 9.999897234791831e-06, + "loss": 0.7964, + "step": 11 + }, + { + "epoch": 0.05, + "grad_norm": 0.5729968036750446, + "learning_rate": 9.999588943391597e-06, + "loss": 0.8039, + "step": 12 + }, + { + "epoch": 0.05, + "grad_norm": 0.5402964207486183, + "learning_rate": 9.99907513847195e-06, + "loss": 0.8287, + "step": 13 + }, + { + "epoch": 0.06, + "grad_norm": 0.5633328266442124, + "learning_rate": 9.9983558411534e-06, + "loss": 0.7842, + "step": 14 + }, + { + "epoch": 0.06, + "grad_norm": 0.5412290905686791, + "learning_rate": 9.99743108100344e-06, + "loss": 0.8345, + "step": 15 + }, + { + "epoch": 0.06, + "grad_norm": 0.4895379189634968, + "learning_rate": 9.99630089603534e-06, + "loss": 0.7922, + "step": 16 + }, + { + "epoch": 0.07, + "grad_norm": 0.5088260537094976, + "learning_rate": 9.994965332706574e-06, + "loss": 0.7969, + "step": 17 + }, + { + "epoch": 0.07, + "grad_norm": 0.47075507205524136, + "learning_rate": 9.993424445916923e-06, + "loss": 0.7931, + "step": 18 + }, + { + "epoch": 0.08, + "grad_norm": 0.3878407143429931, + "learning_rate": 9.991678299006206e-06, + "loss": 0.8041, + "step": 19 + }, + { + "epoch": 0.08, + "grad_norm": 0.3873731682942636, + "learning_rate": 9.989726963751683e-06, + "loss": 0.8107, + "step": 20 + }, + { + "epoch": 0.08, + "grad_norm": 0.42417629604043083, + "learning_rate": 9.987570520365105e-06, + "loss": 0.7874, + "step": 21 + }, + { + "epoch": 0.09, + "grad_norm": 0.4324680733617199, + "learning_rate": 9.98520905748941e-06, + 
"loss": 0.8025, + "step": 22 + }, + { + "epoch": 0.09, + "grad_norm": 0.34546199757993784, + "learning_rate": 9.982642672195093e-06, + "loss": 0.8048, + "step": 23 + }, + { + "epoch": 0.1, + "grad_norm": 0.35958771648273496, + "learning_rate": 9.979871469976197e-06, + "loss": 0.831, + "step": 24 + }, + { + "epoch": 0.1, + "grad_norm": 0.3940074197908444, + "learning_rate": 9.976895564745993e-06, + "loss": 0.7944, + "step": 25 + }, + { + "epoch": 0.1, + "grad_norm": 0.3818406187889153, + "learning_rate": 9.973715078832288e-06, + "loss": 0.7936, + "step": 26 + }, + { + "epoch": 0.11, + "grad_norm": 10.237346743255186, + "learning_rate": 9.970330142972403e-06, + "loss": 1.0017, + "step": 27 + }, + { + "epoch": 0.11, + "grad_norm": 6.504690612414681, + "learning_rate": 9.966740896307791e-06, + "loss": 1.0329, + "step": 28 + }, + { + "epoch": 0.12, + "grad_norm": 0.4028775109425473, + "learning_rate": 9.962947486378325e-06, + "loss": 0.7702, + "step": 29 + }, + { + "epoch": 0.12, + "grad_norm": 0.39810277536967076, + "learning_rate": 9.95895006911623e-06, + "loss": 0.771, + "step": 30 + }, + { + "epoch": 0.12, + "grad_norm": 0.29862663396811506, + "learning_rate": 9.954748808839675e-06, + "loss": 0.7767, + "step": 31 + }, + { + "epoch": 0.13, + "grad_norm": 0.3106188272362696, + "learning_rate": 9.950343878246011e-06, + "loss": 0.7943, + "step": 32 + }, + { + "epoch": 0.13, + "grad_norm": 0.34702364911134964, + "learning_rate": 9.945735458404681e-06, + "loss": 0.7972, + "step": 33 + }, + { + "epoch": 0.14, + "grad_norm": 0.3216448960978253, + "learning_rate": 9.94092373874978e-06, + "loss": 0.7847, + "step": 34 + }, + { + "epoch": 0.14, + "grad_norm": 0.31232207978504006, + "learning_rate": 9.935908917072253e-06, + "loss": 0.7738, + "step": 35 + }, + { + "epoch": 0.14, + "grad_norm": 0.3004886604892709, + "learning_rate": 9.930691199511775e-06, + "loss": 0.7877, + "step": 36 + }, + { + "epoch": 0.15, + "grad_norm": 0.2870013960815822, + "learning_rate": 
9.925270800548285e-06, + "loss": 0.754, + "step": 37 + }, + { + "epoch": 0.15, + "grad_norm": 0.28322113595593756, + "learning_rate": 9.91964794299315e-06, + "loss": 0.7445, + "step": 38 + }, + { + "epoch": 0.16, + "grad_norm": 0.3065117198934518, + "learning_rate": 9.91382285798002e-06, + "loss": 0.787, + "step": 39 + }, + { + "epoch": 0.16, + "grad_norm": 0.2727693466806482, + "learning_rate": 9.907795784955327e-06, + "loss": 0.7865, + "step": 40 + }, + { + "epoch": 0.16, + "grad_norm": 0.2746198009076503, + "learning_rate": 9.901566971668437e-06, + "loss": 0.7755, + "step": 41 + }, + { + "epoch": 0.17, + "grad_norm": 0.2888207750688948, + "learning_rate": 9.895136674161466e-06, + "loss": 0.7789, + "step": 42 + }, + { + "epoch": 0.17, + "grad_norm": 0.26218141394209254, + "learning_rate": 9.888505156758758e-06, + "loss": 0.7781, + "step": 43 + }, + { + "epoch": 0.18, + "grad_norm": 0.27028128323788914, + "learning_rate": 9.881672692056022e-06, + "loss": 0.7596, + "step": 44 + }, + { + "epoch": 0.18, + "grad_norm": 0.301432889634355, + "learning_rate": 9.874639560909118e-06, + "loss": 0.7746, + "step": 45 + }, + { + "epoch": 0.18, + "grad_norm": 0.27768870163187315, + "learning_rate": 9.867406052422525e-06, + "loss": 0.7751, + "step": 46 + }, + { + "epoch": 0.19, + "grad_norm": 0.2638230079020965, + "learning_rate": 9.85997246393744e-06, + "loss": 0.8085, + "step": 47 + }, + { + "epoch": 0.19, + "grad_norm": 0.2826098837962784, + "learning_rate": 9.852339101019574e-06, + "loss": 0.7878, + "step": 48 + }, + { + "epoch": 0.2, + "grad_norm": 0.2673052298412088, + "learning_rate": 9.844506277446577e-06, + "loss": 0.7747, + "step": 49 + }, + { + "epoch": 0.2, + "grad_norm": 0.2589820555015507, + "learning_rate": 9.836474315195148e-06, + "loss": 0.7491, + "step": 50 + }, + { + "epoch": 0.2, + "grad_norm": 0.27744141325372174, + "learning_rate": 9.828243544427795e-06, + "loss": 0.771, + "step": 51 + }, + { + "epoch": 0.21, + "grad_norm": 0.25617202776049003, + 
"learning_rate": 9.819814303479268e-06, + "loss": 0.789, + "step": 52 + }, + { + "epoch": 0.21, + "grad_norm": 0.25777796417187593, + "learning_rate": 9.811186938842645e-06, + "loss": 0.7498, + "step": 53 + }, + { + "epoch": 0.22, + "grad_norm": 0.26356120702424557, + "learning_rate": 9.802361805155097e-06, + "loss": 0.7618, + "step": 54 + }, + { + "epoch": 0.22, + "grad_norm": 0.24594116238284844, + "learning_rate": 9.793339265183303e-06, + "loss": 0.7647, + "step": 55 + }, + { + "epoch": 0.22, + "grad_norm": 0.2766331605712476, + "learning_rate": 9.784119689808545e-06, + "loss": 0.7757, + "step": 56 + }, + { + "epoch": 0.23, + "grad_norm": 0.2674205732918615, + "learning_rate": 9.774703458011453e-06, + "loss": 0.7479, + "step": 57 + }, + { + "epoch": 0.23, + "grad_norm": 0.25100414008068433, + "learning_rate": 9.765090956856437e-06, + "loss": 0.7629, + "step": 58 + }, + { + "epoch": 0.24, + "grad_norm": 0.2558976905977626, + "learning_rate": 9.755282581475769e-06, + "loss": 0.7368, + "step": 59 + }, + { + "epoch": 0.24, + "grad_norm": 0.2816597522453804, + "learning_rate": 9.745278735053345e-06, + "loss": 0.7675, + "step": 60 + }, + { + "epoch": 0.24, + "grad_norm": 0.27864046582364604, + "learning_rate": 9.735079828808107e-06, + "loss": 0.7693, + "step": 61 + }, + { + "epoch": 0.25, + "grad_norm": 0.2537495381298166, + "learning_rate": 9.724686281977146e-06, + "loss": 0.7612, + "step": 62 + }, + { + "epoch": 0.25, + "grad_norm": 0.27161360619636454, + "learning_rate": 9.714098521798466e-06, + "loss": 0.7659, + "step": 63 + }, + { + "epoch": 0.26, + "grad_norm": 0.257282261183055, + "learning_rate": 9.703316983493414e-06, + "loss": 0.77, + "step": 64 + }, + { + "epoch": 0.26, + "grad_norm": 0.2598148868150837, + "learning_rate": 9.692342110248802e-06, + "loss": 0.7637, + "step": 65 + }, + { + "epoch": 0.26, + "grad_norm": 0.25319486577746536, + "learning_rate": 9.681174353198687e-06, + "loss": 0.7529, + "step": 66 + }, + { + "epoch": 0.27, + "grad_norm": 
0.2616187230129625, + "learning_rate": 9.669814171405818e-06, + "loss": 0.7482, + "step": 67 + }, + { + "epoch": 0.27, + "grad_norm": 0.2531735015293101, + "learning_rate": 9.658262031842772e-06, + "loss": 0.7507, + "step": 68 + }, + { + "epoch": 0.28, + "grad_norm": 0.2540031125746497, + "learning_rate": 9.64651840937276e-06, + "loss": 0.7573, + "step": 69 + }, + { + "epoch": 0.28, + "grad_norm": 0.26251145119756225, + "learning_rate": 9.63458378673011e-06, + "loss": 0.7617, + "step": 70 + }, + { + "epoch": 0.28, + "grad_norm": 11.659656865913, + "learning_rate": 9.622458654500408e-06, + "loss": 0.9807, + "step": 71 + }, + { + "epoch": 0.29, + "grad_norm": 43.33218979251603, + "learning_rate": 9.610143511100354e-06, + "loss": 1.0213, + "step": 72 + }, + { + "epoch": 0.29, + "grad_norm": 0.29661440448786996, + "learning_rate": 9.597638862757255e-06, + "loss": 0.7597, + "step": 73 + }, + { + "epoch": 0.3, + "grad_norm": 0.2674359864711363, + "learning_rate": 9.584945223488227e-06, + "loss": 0.7716, + "step": 74 + }, + { + "epoch": 0.3, + "grad_norm": 0.2587397735578842, + "learning_rate": 9.572063115079063e-06, + "loss": 0.7654, + "step": 75 + }, + { + "epoch": 0.3, + "grad_norm": 0.27326638279450294, + "learning_rate": 9.558993067062785e-06, + "loss": 0.7832, + "step": 76 + }, + { + "epoch": 0.31, + "grad_norm": 0.26424783232216553, + "learning_rate": 9.545735616697875e-06, + "loss": 0.7509, + "step": 77 + }, + { + "epoch": 0.31, + "grad_norm": 0.26894661215694415, + "learning_rate": 9.532291308946191e-06, + "loss": 0.7638, + "step": 78 + }, + { + "epoch": 0.32, + "grad_norm": 12.381149099110814, + "learning_rate": 9.518660696450567e-06, + "loss": 0.9726, + "step": 79 + }, + { + "epoch": 0.32, + "grad_norm": 63.276974873593076, + "learning_rate": 9.504844339512096e-06, + "loss": 0.961, + "step": 80 + }, + { + "epoch": 0.32, + "grad_norm": 0.34404347007223207, + "learning_rate": 9.490842806067095e-06, + "loss": 0.7366, + "step": 81 + }, + { + "epoch": 0.33, + 
"grad_norm": 0.2761892805994169, + "learning_rate": 9.476656671663766e-06, + "loss": 0.7565, + "step": 82 + }, + { + "epoch": 0.33, + "grad_norm": 0.2938700568825168, + "learning_rate": 9.462286519438531e-06, + "loss": 0.7586, + "step": 83 + }, + { + "epoch": 0.33, + "grad_norm": 0.30998708104141814, + "learning_rate": 9.44773294009206e-06, + "loss": 0.747, + "step": 84 + }, + { + "epoch": 0.34, + "grad_norm": 0.2789622879446074, + "learning_rate": 9.432996531865001e-06, + "loss": 0.7381, + "step": 85 + }, + { + "epoch": 0.34, + "grad_norm": 0.3043211841621936, + "learning_rate": 9.418077900513377e-06, + "loss": 0.7648, + "step": 86 + }, + { + "epoch": 0.35, + "grad_norm": 0.27269347275749684, + "learning_rate": 9.40297765928369e-06, + "loss": 0.7287, + "step": 87 + }, + { + "epoch": 0.35, + "grad_norm": 0.29165683068711035, + "learning_rate": 9.387696428887715e-06, + "loss": 0.7714, + "step": 88 + }, + { + "epoch": 0.35, + "grad_norm": 0.29093659611546846, + "learning_rate": 9.372234837476979e-06, + "loss": 0.754, + "step": 89 + }, + { + "epoch": 0.36, + "grad_norm": 0.2622000520062877, + "learning_rate": 9.356593520616948e-06, + "loss": 0.7604, + "step": 90 + }, + { + "epoch": 0.36, + "grad_norm": 0.29648676556314774, + "learning_rate": 9.340773121260893e-06, + "loss": 0.7677, + "step": 91 + }, + { + "epoch": 0.37, + "grad_norm": 0.2971691719126809, + "learning_rate": 9.324774289723469e-06, + "loss": 0.7826, + "step": 92 + }, + { + "epoch": 0.37, + "grad_norm": 0.2695147958164756, + "learning_rate": 9.308597683653976e-06, + "loss": 0.7675, + "step": 93 + }, + { + "epoch": 0.37, + "grad_norm": 0.2947547264550856, + "learning_rate": 9.292243968009332e-06, + "loss": 0.7611, + "step": 94 + }, + { + "epoch": 0.38, + "grad_norm": 0.2698951119585888, + "learning_rate": 9.275713815026732e-06, + "loss": 0.7437, + "step": 95 + }, + { + "epoch": 0.38, + "grad_norm": 0.2922871464880811, + "learning_rate": 9.259007904196023e-06, + "loss": 0.7716, + "step": 96 + }, + { + 
"epoch": 0.39, + "grad_norm": 13.342043215041077, + "learning_rate": 9.242126922231763e-06, + "loss": 1.0262, + "step": 97 + }, + { + "epoch": 0.39, + "grad_norm": 3.3348436860369577, + "learning_rate": 9.225071563045007e-06, + "loss": 0.9733, + "step": 98 + }, + { + "epoch": 0.39, + "grad_norm": 0.30515152013617763, + "learning_rate": 9.207842527714767e-06, + "loss": 0.7491, + "step": 99 + }, + { + "epoch": 0.4, + "grad_norm": 0.2797372956926758, + "learning_rate": 9.190440524459203e-06, + "loss": 0.7658, + "step": 100 + }, + { + "epoch": 0.4, + "grad_norm": 0.295442681103485, + "learning_rate": 9.172866268606514e-06, + "loss": 0.7359, + "step": 101 + }, + { + "epoch": 0.41, + "grad_norm": 0.2890934213055238, + "learning_rate": 9.15512048256552e-06, + "loss": 0.7783, + "step": 102 + }, + { + "epoch": 0.41, + "grad_norm": 0.2698602196909741, + "learning_rate": 9.137203895795983e-06, + "loss": 0.7476, + "step": 103 + }, + { + "epoch": 0.41, + "grad_norm": 0.30586672847809404, + "learning_rate": 9.119117244778609e-06, + "loss": 0.7494, + "step": 104 + }, + { + "epoch": 0.42, + "grad_norm": 0.28256137853789653, + "learning_rate": 9.10086127298478e-06, + "loss": 0.7347, + "step": 105 + }, + { + "epoch": 0.42, + "grad_norm": 0.2654565204147507, + "learning_rate": 9.082436730845993e-06, + "loss": 0.7282, + "step": 106 + }, + { + "epoch": 0.43, + "grad_norm": 0.3110313172517606, + "learning_rate": 9.063844375723014e-06, + "loss": 0.7442, + "step": 107 + }, + { + "epoch": 0.43, + "grad_norm": 0.30406144580285027, + "learning_rate": 9.045084971874738e-06, + "loss": 0.7365, + "step": 108 + }, + { + "epoch": 0.43, + "grad_norm": 0.2578097986689305, + "learning_rate": 9.026159290426782e-06, + "loss": 0.7644, + "step": 109 + }, + { + "epoch": 0.44, + "grad_norm": 0.2875228334986064, + "learning_rate": 9.007068109339783e-06, + "loss": 0.7359, + "step": 110 + }, + { + "epoch": 0.44, + "grad_norm": 0.3007292657335934, + "learning_rate": 8.987812213377423e-06, + "loss": 0.7571, + 
"step": 111 + }, + { + "epoch": 0.45, + "grad_norm": 0.2705781647990633, + "learning_rate": 8.968392394074164e-06, + "loss": 0.7321, + "step": 112 + }, + { + "epoch": 0.45, + "grad_norm": 7.709717121399015, + "learning_rate": 8.948809449702712e-06, + "loss": 1.0663, + "step": 113 + }, + { + "epoch": 0.45, + "grad_norm": 0.3159530858994423, + "learning_rate": 8.929064185241214e-06, + "loss": 0.7594, + "step": 114 + }, + { + "epoch": 0.46, + "grad_norm": 0.3001925080979955, + "learning_rate": 8.90915741234015e-06, + "loss": 0.7486, + "step": 115 + }, + { + "epoch": 0.46, + "grad_norm": 2.7719217922453914, + "learning_rate": 8.889089949288986e-06, + "loss": 1.0014, + "step": 116 + }, + { + "epoch": 0.47, + "grad_norm": 0.2980097580186808, + "learning_rate": 8.868862620982534e-06, + "loss": 0.7302, + "step": 117 + }, + { + "epoch": 0.47, + "grad_norm": 0.2674175783919389, + "learning_rate": 8.84847625888703e-06, + "loss": 0.7515, + "step": 118 + }, + { + "epoch": 0.47, + "grad_norm": 0.33089914745986465, + "learning_rate": 8.827931701005974e-06, + "loss": 0.7452, + "step": 119 + }, + { + "epoch": 0.48, + "grad_norm": 0.2657873873108844, + "learning_rate": 8.807229791845673e-06, + "loss": 0.7565, + "step": 120 + }, + { + "epoch": 0.48, + "grad_norm": 0.26891569095038903, + "learning_rate": 8.786371382380527e-06, + "loss": 0.7525, + "step": 121 + }, + { + "epoch": 0.49, + "grad_norm": 0.30828049821760906, + "learning_rate": 8.765357330018056e-06, + "loss": 0.7395, + "step": 122 + }, + { + "epoch": 0.49, + "grad_norm": 0.2703956947723179, + "learning_rate": 8.74418849856364e-06, + "loss": 0.7762, + "step": 123 + }, + { + "epoch": 0.49, + "grad_norm": 0.27665743831770573, + "learning_rate": 8.722865758185036e-06, + "loss": 0.7499, + "step": 124 + }, + { + "epoch": 0.5, + "grad_norm": 0.31328445024397394, + "learning_rate": 8.701389985376578e-06, + "loss": 0.7368, + "step": 125 + }, + { + "epoch": 0.5, + "eval_loss": 0.7192811369895935, + "eval_runtime": 97.0775, + 
"eval_samples_per_second": 18.212, + "eval_steps_per_second": 0.381, + "step": 125 + }, + { + "epoch": 0.5, + "grad_norm": 0.26565830658390915, + "learning_rate": 8.679762062923176e-06, + "loss": 0.7727, + "step": 126 + }, + { + "epoch": 0.51, + "grad_norm": 0.2768705145101062, + "learning_rate": 8.657982879864007e-06, + "loss": 0.7178, + "step": 127 + }, + { + "epoch": 0.51, + "grad_norm": 0.27658899618203814, + "learning_rate": 8.636053331455986e-06, + "loss": 0.7521, + "step": 128 + }, + { + "epoch": 0.51, + "grad_norm": 0.2687326456666238, + "learning_rate": 8.613974319136959e-06, + "loss": 0.7411, + "step": 129 + }, + { + "epoch": 0.52, + "grad_norm": 0.2618083386724651, + "learning_rate": 8.591746750488639e-06, + "loss": 0.7306, + "step": 130 + }, + { + "epoch": 0.52, + "grad_norm": 0.25666246646393165, + "learning_rate": 8.569371539199316e-06, + "loss": 0.7505, + "step": 131 + }, + { + "epoch": 0.53, + "grad_norm": 0.3203048983481449, + "learning_rate": 8.54684960502629e-06, + "loss": 0.7515, + "step": 132 + }, + { + "epoch": 0.53, + "grad_norm": 0.2521993776332652, + "learning_rate": 8.52418187375806e-06, + "loss": 0.7505, + "step": 133 + }, + { + "epoch": 0.53, + "grad_norm": 0.26591933789428035, + "learning_rate": 8.501369277176275e-06, + "loss": 0.7353, + "step": 134 + }, + { + "epoch": 0.54, + "grad_norm": 0.27603393300812845, + "learning_rate": 8.478412753017433e-06, + "loss": 0.7609, + "step": 135 + }, + { + "epoch": 0.54, + "grad_norm": 0.2668194745887302, + "learning_rate": 8.455313244934324e-06, + "loss": 0.7141, + "step": 136 + }, + { + "epoch": 0.55, + "grad_norm": 0.2707242484249978, + "learning_rate": 8.432071702457253e-06, + "loss": 0.7223, + "step": 137 + }, + { + "epoch": 0.55, + "grad_norm": 0.25511126409448154, + "learning_rate": 8.408689080954997e-06, + "loss": 0.7153, + "step": 138 + }, + { + "epoch": 0.55, + "grad_norm": 15.420395873510664, + "learning_rate": 8.38516634159555e-06, + "loss": 1.0042, + "step": 139 + }, + { + "epoch": 
0.56, + "grad_norm": 0.3012234081880187, + "learning_rate": 8.361504451306585e-06, + "loss": 0.7713, + "step": 140 + }, + { + "epoch": 0.56, + "grad_norm": 0.2598491249346932, + "learning_rate": 8.337704382735741e-06, + "loss": 0.7288, + "step": 141 + }, + { + "epoch": 0.57, + "grad_norm": 0.26256298238822373, + "learning_rate": 8.313767114210615e-06, + "loss": 0.7379, + "step": 142 + }, + { + "epoch": 0.57, + "grad_norm": 0.2945113973208366, + "learning_rate": 8.289693629698564e-06, + "loss": 0.7401, + "step": 143 + }, + { + "epoch": 0.57, + "grad_norm": 0.24981458420819586, + "learning_rate": 8.265484918766243e-06, + "loss": 0.7512, + "step": 144 + }, + { + "epoch": 0.58, + "grad_norm": 0.2678413297548206, + "learning_rate": 8.241141976538944e-06, + "loss": 0.7449, + "step": 145 + }, + { + "epoch": 0.58, + "grad_norm": 0.2623417103083203, + "learning_rate": 8.216665803659671e-06, + "loss": 0.7647, + "step": 146 + }, + { + "epoch": 0.59, + "grad_norm": 0.263785777979794, + "learning_rate": 8.192057406248028e-06, + "loss": 0.7725, + "step": 147 + }, + { + "epoch": 0.59, + "grad_norm": 0.2519540317965661, + "learning_rate": 8.16731779585885e-06, + "loss": 0.715, + "step": 148 + }, + { + "epoch": 0.59, + "grad_norm": 0.27015785362121375, + "learning_rate": 8.142447989440618e-06, + "loss": 0.7532, + "step": 149 + }, + { + "epoch": 0.6, + "grad_norm": 0.25863328277564784, + "learning_rate": 8.117449009293668e-06, + "loss": 0.7335, + "step": 150 + }, + { + "epoch": 0.6, + "grad_norm": 0.2550714525590909, + "learning_rate": 8.092321883028157e-06, + "loss": 0.7182, + "step": 151 + }, + { + "epoch": 0.61, + "grad_norm": 0.2752483825047847, + "learning_rate": 8.067067643521834e-06, + "loss": 0.772, + "step": 152 + }, + { + "epoch": 0.61, + "grad_norm": 0.2582002365542859, + "learning_rate": 8.041687328877566e-06, + "loss": 0.7284, + "step": 153 + }, + { + "epoch": 0.61, + "grad_norm": 0.24823845365447103, + "learning_rate": 8.016181982380682e-06, + "loss": 0.7467, + "step": 
154 + }, + { + "epoch": 0.62, + "grad_norm": 0.25644169647568194, + "learning_rate": 7.99055265245608e-06, + "loss": 0.7191, + "step": 155 + }, + { + "epoch": 0.62, + "grad_norm": 0.27463144458405375, + "learning_rate": 7.96480039262513e-06, + "loss": 0.7375, + "step": 156 + }, + { + "epoch": 0.63, + "grad_norm": 0.2642553744129046, + "learning_rate": 7.938926261462366e-06, + "loss": 0.7587, + "step": 157 + }, + { + "epoch": 0.63, + "grad_norm": 0.25679436337027056, + "learning_rate": 7.912931322551981e-06, + "loss": 0.7312, + "step": 158 + }, + { + "epoch": 0.63, + "grad_norm": 0.2784563408983632, + "learning_rate": 7.886816644444099e-06, + "loss": 0.7213, + "step": 159 + }, + { + "epoch": 0.64, + "grad_norm": 0.2638439438037005, + "learning_rate": 7.860583300610849e-06, + "loss": 0.7286, + "step": 160 + }, + { + "epoch": 0.64, + "grad_norm": 0.2599487424469649, + "learning_rate": 7.83423236940225e-06, + "loss": 0.7211, + "step": 161 + }, + { + "epoch": 0.65, + "grad_norm": 0.2620169279227201, + "learning_rate": 7.807764934001875e-06, + "loss": 0.7243, + "step": 162 + }, + { + "epoch": 0.65, + "grad_norm": 0.3455142385888707, + "learning_rate": 7.781182082382325e-06, + "loss": 0.7709, + "step": 163 + }, + { + "epoch": 0.65, + "grad_norm": 0.25797958121628417, + "learning_rate": 7.754484907260513e-06, + "loss": 0.7371, + "step": 164 + }, + { + "epoch": 0.66, + "grad_norm": 0.26245124486082283, + "learning_rate": 7.727674506052744e-06, + "loss": 0.7625, + "step": 165 + }, + { + "epoch": 0.66, + "grad_norm": 0.259525851556208, + "learning_rate": 7.700751980829601e-06, + "loss": 0.7785, + "step": 166 + }, + { + "epoch": 0.67, + "grad_norm": 0.25578616887441574, + "learning_rate": 7.673718438270649e-06, + "loss": 0.7349, + "step": 167 + }, + { + "epoch": 0.67, + "grad_norm": 0.26880908011952676, + "learning_rate": 7.646574989618938e-06, + "loss": 0.7423, + "step": 168 + }, + { + "epoch": 0.67, + "grad_norm": 0.28268284846763475, + "learning_rate": 
7.619322750635327e-06, + "loss": 0.8089, + "step": 169 + }, + { + "epoch": 0.68, + "grad_norm": 0.2565025440158926, + "learning_rate": 7.591962841552627e-06, + "loss": 0.708, + "step": 170 + }, + { + "epoch": 0.68, + "grad_norm": 0.2589330645555015, + "learning_rate": 7.564496387029532e-06, + "loss": 0.7276, + "step": 171 + }, + { + "epoch": 0.69, + "grad_norm": 4.072287518324262, + "learning_rate": 7.536924516104411e-06, + "loss": 0.963, + "step": 172 + }, + { + "epoch": 0.69, + "grad_norm": 5.236195816930884, + "learning_rate": 7.509248362148889e-06, + "loss": 0.9786, + "step": 173 + }, + { + "epoch": 0.69, + "grad_norm": 0.3208811366448085, + "learning_rate": 7.481469062821252e-06, + "loss": 0.7417, + "step": 174 + }, + { + "epoch": 0.7, + "grad_norm": 0.280850125752099, + "learning_rate": 7.453587760019691e-06, + "loss": 0.7249, + "step": 175 + }, + { + "epoch": 0.7, + "grad_norm": 0.26692675004354643, + "learning_rate": 7.42560559983536e-06, + "loss": 0.727, + "step": 176 + }, + { + "epoch": 0.71, + "grad_norm": 0.27369623001787907, + "learning_rate": 7.39752373250527e-06, + "loss": 0.7617, + "step": 177 + }, + { + "epoch": 0.71, + "grad_norm": 0.2784142807735342, + "learning_rate": 7.369343312364994e-06, + "loss": 0.7466, + "step": 178 + }, + { + "epoch": 0.71, + "grad_norm": 37.27250895412763, + "learning_rate": 7.34106549780123e-06, + "loss": 1.0667, + "step": 179 + }, + { + "epoch": 0.72, + "grad_norm": 0.2760061039631398, + "learning_rate": 7.312691451204178e-06, + "loss": 0.7244, + "step": 180 + }, + { + "epoch": 0.72, + "grad_norm": 0.25459829854169064, + "learning_rate": 7.284222338919758e-06, + "loss": 0.7364, + "step": 181 + }, + { + "epoch": 0.73, + "grad_norm": 0.26179305555253735, + "learning_rate": 7.255659331201673e-06, + "loss": 0.733, + "step": 182 + }, + { + "epoch": 0.73, + "grad_norm": 0.2604418741829541, + "learning_rate": 7.227003602163296e-06, + "loss": 0.7209, + "step": 183 + }, + { + "epoch": 0.73, + "grad_norm": 0.26185681215109163, + 
"learning_rate": 7.198256329729412e-06, + "loss": 0.7164, + "step": 184 + }, + { + "epoch": 0.74, + "grad_norm": 16.152387103951856, + "learning_rate": 7.169418695587791e-06, + "loss": 1.0372, + "step": 185 + }, + { + "epoch": 0.74, + "grad_norm": 21.228735850953576, + "learning_rate": 7.140491885140629e-06, + "loss": 1.0402, + "step": 186 + }, + { + "epoch": 0.75, + "grad_norm": 0.28404810159037286, + "learning_rate": 7.1114770874558e-06, + "loss": 0.7006, + "step": 187 + }, + { + "epoch": 0.75, + "grad_norm": 0.26609034714435736, + "learning_rate": 7.082375495217996e-06, + "loss": 0.7537, + "step": 188 + }, + { + "epoch": 0.75, + "grad_norm": 0.25765084929276133, + "learning_rate": 7.053188304679691e-06, + "loss": 0.7302, + "step": 189 + }, + { + "epoch": 0.76, + "grad_norm": 0.26384158886834463, + "learning_rate": 7.023916715611969e-06, + "loss": 0.712, + "step": 190 + }, + { + "epoch": 0.76, + "grad_norm": 0.27151931787506317, + "learning_rate": 6.994561931255209e-06, + "loss": 0.7502, + "step": 191 + }, + { + "epoch": 0.77, + "grad_norm": 0.27031492068457535, + "learning_rate": 6.965125158269619e-06, + "loss": 0.7179, + "step": 192 + }, + { + "epoch": 0.77, + "grad_norm": 0.26995073084719196, + "learning_rate": 6.935607606685642e-06, + "loss": 0.7624, + "step": 193 + }, + { + "epoch": 0.77, + "grad_norm": 0.25666755324587454, + "learning_rate": 6.906010489854209e-06, + "loss": 0.7426, + "step": 194 + }, + { + "epoch": 0.78, + "grad_norm": 0.2764461509009301, + "learning_rate": 6.876335024396872e-06, + "loss": 0.723, + "step": 195 + }, + { + "epoch": 0.78, + "grad_norm": 0.2597906002555833, + "learning_rate": 6.846582430155783e-06, + "loss": 0.7407, + "step": 196 + }, + { + "epoch": 0.79, + "grad_norm": 0.26409742487438864, + "learning_rate": 6.816753930143558e-06, + "loss": 0.7206, + "step": 197 + }, + { + "epoch": 0.79, + "grad_norm": 0.25320169233675405, + "learning_rate": 6.786850750493006e-06, + "loss": 0.7437, + "step": 198 + }, + { + "epoch": 0.79, + 
"grad_norm": 0.2708696048462205, + "learning_rate": 6.7568741204067145e-06, + "loss": 0.7422, + "step": 199 + }, + { + "epoch": 0.8, + "grad_norm": 0.26542323915181154, + "learning_rate": 6.726825272106539e-06, + "loss": 0.7514, + "step": 200 + }, + { + "epoch": 0.8, + "grad_norm": 0.26307166597433396, + "learning_rate": 6.696705440782939e-06, + "loss": 0.7509, + "step": 201 + }, + { + "epoch": 0.81, + "grad_norm": 0.26671446872754456, + "learning_rate": 6.66651586454421e-06, + "loss": 0.7465, + "step": 202 + }, + { + "epoch": 0.81, + "grad_norm": 0.2720083369272757, + "learning_rate": 6.636257784365585e-06, + "loss": 0.7349, + "step": 203 + }, + { + "epoch": 0.81, + "grad_norm": 0.2652218770116059, + "learning_rate": 6.605932444038229e-06, + "loss": 0.7348, + "step": 204 + }, + { + "epoch": 0.82, + "grad_norm": 0.26402314109149694, + "learning_rate": 6.575541090118105e-06, + "loss": 0.7495, + "step": 205 + }, + { + "epoch": 0.82, + "grad_norm": 0.2639803511821082, + "learning_rate": 6.545084971874738e-06, + "loss": 0.7138, + "step": 206 + }, + { + "epoch": 0.83, + "grad_norm": 0.2673567043268493, + "learning_rate": 6.514565341239861e-06, + "loss": 0.7341, + "step": 207 + }, + { + "epoch": 0.83, + "grad_norm": 74.0236556664021, + "learning_rate": 6.483983452755953e-06, + "loss": 1.084, + "step": 208 + }, + { + "epoch": 0.83, + "grad_norm": 0.2694930198888902, + "learning_rate": 6.4533405635246696e-06, + "loss": 0.7422, + "step": 209 + }, + { + "epoch": 0.84, + "grad_norm": 0.2808089325365656, + "learning_rate": 6.4226379331551625e-06, + "loss": 0.7543, + "step": 210 + }, + { + "epoch": 0.84, + "grad_norm": 0.24629087243802783, + "learning_rate": 6.3918768237123175e-06, + "loss": 0.7088, + "step": 211 + }, + { + "epoch": 0.85, + "grad_norm": 0.2605148206784209, + "learning_rate": 6.361058499664856e-06, + "loss": 0.7434, + "step": 212 + }, + { + "epoch": 0.85, + "grad_norm": 0.26199687024127344, + "learning_rate": 6.330184227833376e-06, + "loss": 0.7369, + "step": 
213 + }, + { + "epoch": 0.85, + "grad_norm": 0.2693080928270376, + "learning_rate": 6.299255277338265e-06, + "loss": 0.7337, + "step": 214 + }, + { + "epoch": 0.86, + "grad_norm": 0.2573571779304293, + "learning_rate": 6.268272919547537e-06, + "loss": 0.7366, + "step": 215 + }, + { + "epoch": 0.86, + "grad_norm": 0.25347655388671, + "learning_rate": 6.237238428024573e-06, + "loss": 0.7392, + "step": 216 + }, + { + "epoch": 0.87, + "grad_norm": 0.254807709796356, + "learning_rate": 6.2061530784757625e-06, + "loss": 0.7709, + "step": 217 + }, + { + "epoch": 0.87, + "grad_norm": 0.25435065962054804, + "learning_rate": 6.175018148698077e-06, + "loss": 0.7472, + "step": 218 + }, + { + "epoch": 0.87, + "grad_norm": 0.25856868944475736, + "learning_rate": 6.143834918526528e-06, + "loss": 0.7442, + "step": 219 + }, + { + "epoch": 0.88, + "grad_norm": 0.24960062893507637, + "learning_rate": 6.112604669781572e-06, + "loss": 0.7163, + "step": 220 + }, + { + "epoch": 0.88, + "grad_norm": 0.2544024553733407, + "learning_rate": 6.0813286862164175e-06, + "loss": 0.7236, + "step": 221 + }, + { + "epoch": 0.89, + "grad_norm": 0.2532920039697931, + "learning_rate": 6.050008253464247e-06, + "loss": 0.7427, + "step": 222 + }, + { + "epoch": 0.89, + "grad_norm": 0.25372808971698796, + "learning_rate": 6.018644658985378e-06, + "loss": 0.7286, + "step": 223 + }, + { + "epoch": 0.89, + "grad_norm": 0.2570514856547558, + "learning_rate": 5.987239192014336e-06, + "loss": 0.7349, + "step": 224 + }, + { + "epoch": 0.9, + "grad_norm": 0.2578576277551542, + "learning_rate": 5.955793143506863e-06, + "loss": 0.7266, + "step": 225 + }, + { + "epoch": 0.9, + "grad_norm": 0.26312215832145636, + "learning_rate": 5.9243078060868445e-06, + "loss": 0.7389, + "step": 226 + }, + { + "epoch": 0.91, + "grad_norm": 0.26518617358808877, + "learning_rate": 5.892784473993184e-06, + "loss": 0.7108, + "step": 227 + }, + { + "epoch": 0.91, + "grad_norm": 0.25620517627113376, + "learning_rate": 
5.861224443026595e-06, + "loss": 0.7232, + "step": 228 + }, + { + "epoch": 0.91, + "grad_norm": 28.36402580586963, + "learning_rate": 5.82962901049634e-06, + "loss": 0.9734, + "step": 229 + }, + { + "epoch": 0.92, + "grad_norm": 0.2807037939514787, + "learning_rate": 5.797999475166897e-06, + "loss": 0.7341, + "step": 230 + }, + { + "epoch": 0.92, + "grad_norm": 8.208344028346868, + "learning_rate": 5.766337137204579e-06, + "loss": 0.938, + "step": 231 + }, + { + "epoch": 0.93, + "grad_norm": 0.26445130385050997, + "learning_rate": 5.734643298124091e-06, + "loss": 0.7316, + "step": 232 + }, + { + "epoch": 0.93, + "grad_norm": 0.251567451954335, + "learning_rate": 5.702919260735015e-06, + "loss": 0.6966, + "step": 233 + }, + { + "epoch": 0.93, + "grad_norm": 0.26329080787916564, + "learning_rate": 5.671166329088278e-06, + "loss": 0.7319, + "step": 234 + }, + { + "epoch": 0.94, + "grad_norm": 0.2566777339661679, + "learning_rate": 5.6393858084225305e-06, + "loss": 0.7529, + "step": 235 + }, + { + "epoch": 0.94, + "grad_norm": 0.2710815554700812, + "learning_rate": 5.6075790051105025e-06, + "loss": 0.7515, + "step": 236 + }, + { + "epoch": 0.95, + "grad_norm": 0.27096961550302734, + "learning_rate": 5.575747226605298e-06, + "loss": 0.7073, + "step": 237 + }, + { + "epoch": 0.95, + "grad_norm": 0.2509131795738037, + "learning_rate": 5.543891781386655e-06, + "loss": 0.7513, + "step": 238 + }, + { + "epoch": 0.95, + "grad_norm": 0.26210506205941153, + "learning_rate": 5.512013978907157e-06, + "loss": 0.7569, + "step": 239 + }, + { + "epoch": 0.96, + "grad_norm": 0.25123130642497177, + "learning_rate": 5.480115129538409e-06, + "loss": 0.7239, + "step": 240 + }, + { + "epoch": 0.96, + "grad_norm": 0.2596821607229612, + "learning_rate": 5.448196544517168e-06, + "loss": 0.7256, + "step": 241 + }, + { + "epoch": 0.97, + "grad_norm": 0.2714818563550966, + "learning_rate": 5.4162595358914475e-06, + "loss": 0.7329, + "step": 242 + }, + { + "epoch": 0.97, + "grad_norm": 
0.260503064108439, + "learning_rate": 5.384305416466584e-06, + "loss": 0.7112, + "step": 243 + }, + { + "epoch": 0.97, + "grad_norm": 0.2661267608396215, + "learning_rate": 5.35233549975127e-06, + "loss": 0.7534, + "step": 244 + }, + { + "epoch": 0.98, + "grad_norm": 0.27502743208671454, + "learning_rate": 5.320351099903565e-06, + "loss": 0.7355, + "step": 245 + }, + { + "epoch": 0.98, + "grad_norm": 0.2598641343680277, + "learning_rate": 5.288353531676873e-06, + "loss": 0.7476, + "step": 246 + }, + { + "epoch": 0.99, + "grad_norm": 0.2629788348419056, + "learning_rate": 5.256344110365896e-06, + "loss": 0.7523, + "step": 247 + }, + { + "epoch": 0.99, + "grad_norm": 0.256123432185156, + "learning_rate": 5.224324151752575e-06, + "loss": 0.7479, + "step": 248 + }, + { + "epoch": 0.99, + "grad_norm": 0.2592695095071067, + "learning_rate": 5.192294972051992e-06, + "loss": 0.7586, + "step": 249 + }, + { + "epoch": 1.0, + "grad_norm": 0.26264999139615697, + "learning_rate": 5.160257887858278e-06, + "loss": 0.7406, + "step": 250 + }, + { + "epoch": 1.0, + "eval_loss": 0.7036678791046143, + "eval_runtime": 96.3087, + "eval_samples_per_second": 18.358, + "eval_steps_per_second": 0.384, + "step": 250 + }, + { + "epoch": 1.0, + "grad_norm": 0.2614921961545108, + "learning_rate": 5.128214216090478e-06, + "loss": 0.7488, + "step": 251 + }, + { + "epoch": 1.0, + "grad_norm": 0.2605743808221771, + "learning_rate": 5.0961652739384356e-06, + "loss": 0.7338, + "step": 252 + }, + { + "epoch": 1.01, + "grad_norm": 2.904417203670962, + "learning_rate": 5.064112378808636e-06, + "loss": 0.9738, + "step": 253 + }, + { + "epoch": 1.01, + "grad_norm": 0.2581985759494367, + "learning_rate": 5.032056848270056e-06, + "loss": 0.7693, + "step": 254 + }, + { + "epoch": 1.02, + "grad_norm": 0.25102446332314765, + "learning_rate": 5e-06, + "loss": 0.7213, + "step": 255 + }, + { + "epoch": 1.0, + "grad_norm": 1.4216598983588058, + "learning_rate": 4.967943151729945e-06, + "loss": 0.9193, + "step": 
256 + }, + { + "epoch": 1.0, + "grad_norm": 0.32982276331099014, + "learning_rate": 4.935887621191364e-06, + "loss": 0.6842, + "step": 257 + }, + { + "epoch": 1.01, + "grad_norm": 0.29043411467478625, + "learning_rate": 4.903834726061565e-06, + "loss": 0.7087, + "step": 258 + }, + { + "epoch": 1.01, + "grad_norm": 0.25986254756592664, + "learning_rate": 4.871785783909523e-06, + "loss": 0.6741, + "step": 259 + }, + { + "epoch": 1.02, + "grad_norm": 0.30049816553828484, + "learning_rate": 4.839742112141725e-06, + "loss": 0.7063, + "step": 260 + }, + { + "epoch": 1.02, + "grad_norm": 0.2895999616622155, + "learning_rate": 4.807705027948008e-06, + "loss": 0.7146, + "step": 261 + }, + { + "epoch": 1.02, + "grad_norm": 0.30041272052643164, + "learning_rate": 4.775675848247427e-06, + "loss": 0.7134, + "step": 262 + }, + { + "epoch": 1.03, + "grad_norm": 0.27518299819790887, + "learning_rate": 4.743655889634105e-06, + "loss": 0.692, + "step": 263 + }, + { + "epoch": 1.03, + "grad_norm": 0.26955160521446175, + "learning_rate": 4.711646468323129e-06, + "loss": 0.658, + "step": 264 + }, + { + "epoch": 1.04, + "grad_norm": 0.27535739664976405, + "learning_rate": 4.679648900096436e-06, + "loss": 0.6908, + "step": 265 + }, + { + "epoch": 1.04, + "grad_norm": 0.26763089124769246, + "learning_rate": 4.64766450024873e-06, + "loss": 0.6861, + "step": 266 + }, + { + "epoch": 1.04, + "grad_norm": 74.38254133611925, + "learning_rate": 4.615694583533418e-06, + "loss": 0.9994, + "step": 267 + }, + { + "epoch": 1.05, + "grad_norm": 0.2986551656205794, + "learning_rate": 4.583740464108554e-06, + "loss": 0.7075, + "step": 268 + }, + { + "epoch": 1.05, + "grad_norm": 0.27619714658975547, + "learning_rate": 4.551803455482833e-06, + "loss": 0.6596, + "step": 269 + }, + { + "epoch": 1.06, + "grad_norm": 0.25242413092583954, + "learning_rate": 4.5198848704615915e-06, + "loss": 0.6628, + "step": 270 + }, + { + "epoch": 1.06, + "grad_norm": 0.26017582720690735, + "learning_rate": 
4.487986021092844e-06, + "loss": 0.6916, + "step": 271 + }, + { + "epoch": 1.06, + "grad_norm": 0.2719334401383232, + "learning_rate": 4.456108218613346e-06, + "loss": 0.6935, + "step": 272 + }, + { + "epoch": 1.07, + "grad_norm": 0.2874168693732095, + "learning_rate": 4.424252773394704e-06, + "loss": 0.7013, + "step": 273 + }, + { + "epoch": 1.07, + "grad_norm": 0.27088215577908004, + "learning_rate": 4.392420994889498e-06, + "loss": 0.693, + "step": 274 + }, + { + "epoch": 1.08, + "grad_norm": 0.2532812233498042, + "learning_rate": 4.3606141915774695e-06, + "loss": 0.6762, + "step": 275 + }, + { + "epoch": 1.08, + "grad_norm": 0.263089719520046, + "learning_rate": 4.3288336709117246e-06, + "loss": 0.6677, + "step": 276 + }, + { + "epoch": 1.08, + "grad_norm": 0.2544227492529696, + "learning_rate": 4.297080739264987e-06, + "loss": 0.6744, + "step": 277 + }, + { + "epoch": 1.09, + "grad_norm": 0.2645751047762965, + "learning_rate": 4.265356701875911e-06, + "loss": 0.7047, + "step": 278 + }, + { + "epoch": 1.09, + "grad_norm": 0.2602397068309733, + "learning_rate": 4.23366286279542e-06, + "loss": 0.6792, + "step": 279 + }, + { + "epoch": 1.1, + "grad_norm": 0.27012218470061766, + "learning_rate": 4.2020005248331056e-06, + "loss": 0.6914, + "step": 280 + }, + { + "epoch": 1.1, + "grad_norm": 0.2645729945558582, + "learning_rate": 4.170370989503662e-06, + "loss": 0.6812, + "step": 281 + }, + { + "epoch": 1.1, + "grad_norm": 0.26158176244234604, + "learning_rate": 4.138775556973406e-06, + "loss": 0.6545, + "step": 282 + }, + { + "epoch": 1.11, + "grad_norm": 0.2609966416888788, + "learning_rate": 4.107215526006818e-06, + "loss": 0.6534, + "step": 283 + }, + { + "epoch": 1.11, + "grad_norm": 0.2633177456953443, + "learning_rate": 4.075692193913156e-06, + "loss": 0.6617, + "step": 284 + }, + { + "epoch": 1.12, + "grad_norm": 0.2711366514748812, + "learning_rate": 4.04420685649314e-06, + "loss": 0.7026, + "step": 285 + }, + { + "epoch": 1.12, + "grad_norm": 
0.26016920693531187, + "learning_rate": 4.012760807985665e-06, + "loss": 0.685, + "step": 286 + }, + { + "epoch": 1.12, + "grad_norm": 0.2634077734164485, + "learning_rate": 3.9813553410146225e-06, + "loss": 0.6732, + "step": 287 + }, + { + "epoch": 1.13, + "grad_norm": 0.2630739058989318, + "learning_rate": 3.949991746535753e-06, + "loss": 0.6898, + "step": 288 + }, + { + "epoch": 1.13, + "grad_norm": 0.26810735877032293, + "learning_rate": 3.918671313783583e-06, + "loss": 0.6739, + "step": 289 + }, + { + "epoch": 1.14, + "grad_norm": 0.2667138269733132, + "learning_rate": 3.887395330218429e-06, + "loss": 0.6649, + "step": 290 + }, + { + "epoch": 1.14, + "grad_norm": 0.2563222658817468, + "learning_rate": 3.856165081473474e-06, + "loss": 0.708, + "step": 291 + }, + { + "epoch": 1.14, + "grad_norm": 0.26451201369218524, + "learning_rate": 3.824981851301924e-06, + "loss": 0.6715, + "step": 292 + }, + { + "epoch": 1.15, + "grad_norm": 0.2598494927634439, + "learning_rate": 3.7938469215242374e-06, + "loss": 0.6955, + "step": 293 + }, + { + "epoch": 1.15, + "grad_norm": 0.25396403307478727, + "learning_rate": 3.7627615719754294e-06, + "loss": 0.6676, + "step": 294 + }, + { + "epoch": 1.16, + "grad_norm": 0.2532765659210287, + "learning_rate": 3.731727080452464e-06, + "loss": 0.6748, + "step": 295 + }, + { + "epoch": 1.16, + "grad_norm": 0.26061271523616963, + "learning_rate": 3.7007447226617367e-06, + "loss": 0.7058, + "step": 296 + }, + { + "epoch": 1.16, + "grad_norm": 0.25801226716086006, + "learning_rate": 3.669815772166625e-06, + "loss": 0.6717, + "step": 297 + }, + { + "epoch": 1.17, + "grad_norm": 0.2678578983084559, + "learning_rate": 3.638941500335145e-06, + "loss": 0.6785, + "step": 298 + }, + { + "epoch": 1.17, + "grad_norm": 0.2629273311111566, + "learning_rate": 3.608123176287685e-06, + "loss": 0.6846, + "step": 299 + }, + { + "epoch": 1.18, + "grad_norm": 0.26372287416738016, + "learning_rate": 3.5773620668448384e-06, + "loss": 0.7155, + "step": 300 + }, 
+ { + "epoch": 1.18, + "grad_norm": 0.2705437923133382, + "learning_rate": 3.5466594364753325e-06, + "loss": 0.6723, + "step": 301 + }, + { + "epoch": 1.18, + "grad_norm": 0.2839291053250124, + "learning_rate": 3.516016547244047e-06, + "loss": 0.7035, + "step": 302 + }, + { + "epoch": 1.19, + "grad_norm": 0.2760313377640414, + "learning_rate": 3.48543465876014e-06, + "loss": 0.6751, + "step": 303 + }, + { + "epoch": 1.19, + "grad_norm": 0.25394626546919435, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.6795, + "step": 304 + }, + { + "epoch": 1.2, + "grad_norm": 0.2591221216055477, + "learning_rate": 3.424458909881897e-06, + "loss": 0.6766, + "step": 305 + }, + { + "epoch": 1.2, + "grad_norm": 0.2691782924782941, + "learning_rate": 3.3940675559617724e-06, + "loss": 0.6895, + "step": 306 + }, + { + "epoch": 1.2, + "grad_norm": 0.26815145183562566, + "learning_rate": 3.363742215634416e-06, + "loss": 0.6671, + "step": 307 + }, + { + "epoch": 1.21, + "grad_norm": 0.26601253229287064, + "learning_rate": 3.3334841354557923e-06, + "loss": 0.6902, + "step": 308 + }, + { + "epoch": 1.21, + "grad_norm": 0.2759140499002526, + "learning_rate": 3.303294559217063e-06, + "loss": 0.7011, + "step": 309 + }, + { + "epoch": 1.22, + "grad_norm": 0.2532152571509874, + "learning_rate": 3.273174727893463e-06, + "loss": 0.6631, + "step": 310 + }, + { + "epoch": 1.22, + "grad_norm": 0.2587732106097895, + "learning_rate": 3.2431258795932863e-06, + "loss": 0.6964, + "step": 311 + }, + { + "epoch": 1.22, + "grad_norm": 0.26154429819114283, + "learning_rate": 3.213149249506997e-06, + "loss": 0.7018, + "step": 312 + }, + { + "epoch": 1.23, + "grad_norm": 0.2640699829932556, + "learning_rate": 3.183246069856443e-06, + "loss": 0.6809, + "step": 313 + }, + { + "epoch": 1.23, + "grad_norm": 0.26350081604751235, + "learning_rate": 3.1534175698442194e-06, + "loss": 0.655, + "step": 314 + }, + { + "epoch": 1.24, + "grad_norm": 0.2620810766791341, + "learning_rate": 3.12366497560313e-06, + 
"loss": 0.7034, + "step": 315 + }, + { + "epoch": 1.24, + "grad_norm": 0.26759545817238656, + "learning_rate": 3.093989510145792e-06, + "loss": 0.7238, + "step": 316 + }, + { + "epoch": 1.24, + "grad_norm": 0.26779990918516644, + "learning_rate": 3.0643923933143603e-06, + "loss": 0.6733, + "step": 317 + }, + { + "epoch": 1.25, + "grad_norm": 0.266460909749917, + "learning_rate": 3.0348748417303826e-06, + "loss": 0.6708, + "step": 318 + }, + { + "epoch": 1.25, + "grad_norm": 0.26133722668937404, + "learning_rate": 3.005438068744792e-06, + "loss": 0.6838, + "step": 319 + }, + { + "epoch": 1.26, + "grad_norm": 0.26893527319559857, + "learning_rate": 2.976083284388031e-06, + "loss": 0.662, + "step": 320 + }, + { + "epoch": 1.26, + "grad_norm": 0.2699706439018165, + "learning_rate": 2.9468116953203107e-06, + "loss": 0.6867, + "step": 321 + }, + { + "epoch": 1.26, + "grad_norm": 0.2660357193246072, + "learning_rate": 2.9176245047820064e-06, + "loss": 0.6802, + "step": 322 + }, + { + "epoch": 1.27, + "grad_norm": 0.2676372441332764, + "learning_rate": 2.8885229125442022e-06, + "loss": 0.7143, + "step": 323 + }, + { + "epoch": 1.27, + "grad_norm": 0.26172654998349437, + "learning_rate": 2.859508114859374e-06, + "loss": 0.6688, + "step": 324 + }, + { + "epoch": 1.28, + "grad_norm": 0.26622148926216194, + "learning_rate": 2.83058130441221e-06, + "loss": 0.671, + "step": 325 + }, + { + "epoch": 1.28, + "grad_norm": 0.28436198488003145, + "learning_rate": 2.80174367027059e-06, + "loss": 0.7157, + "step": 326 + }, + { + "epoch": 1.28, + "grad_norm": 0.2637348527869296, + "learning_rate": 2.772996397836704e-06, + "loss": 0.6893, + "step": 327 + }, + { + "epoch": 1.29, + "grad_norm": 0.27094660677035604, + "learning_rate": 2.7443406687983267e-06, + "loss": 0.7149, + "step": 328 + }, + { + "epoch": 1.29, + "grad_norm": 0.26943858696681655, + "learning_rate": 2.7157776610802416e-06, + "loss": 0.6756, + "step": 329 + }, + { + "epoch": 1.3, + "grad_norm": 0.2637431948145577, + 
"learning_rate": 2.687308548795825e-06, + "loss": 0.6731, + "step": 330 + }, + { + "epoch": 1.3, + "grad_norm": 0.2605764257900338, + "learning_rate": 2.6589345021987725e-06, + "loss": 0.6601, + "step": 331 + }, + { + "epoch": 1.3, + "grad_norm": 0.2691370320929689, + "learning_rate": 2.6306566876350072e-06, + "loss": 0.6747, + "step": 332 + }, + { + "epoch": 1.31, + "grad_norm": 0.26505353096492007, + "learning_rate": 2.6024762674947313e-06, + "loss": 0.6355, + "step": 333 + }, + { + "epoch": 1.31, + "grad_norm": 0.2771254154185314, + "learning_rate": 2.5743944001646394e-06, + "loss": 0.6679, + "step": 334 + }, + { + "epoch": 1.32, + "grad_norm": 0.2734747453519109, + "learning_rate": 2.5464122399803126e-06, + "loss": 0.6842, + "step": 335 + }, + { + "epoch": 1.32, + "grad_norm": 0.26644275829657593, + "learning_rate": 2.5185309371787515e-06, + "loss": 0.6986, + "step": 336 + }, + { + "epoch": 1.32, + "grad_norm": 2.921169787601039, + "learning_rate": 2.4907516378511137e-06, + "loss": 0.9339, + "step": 337 + }, + { + "epoch": 1.33, + "grad_norm": 0.25960894123414624, + "learning_rate": 2.46307548389559e-06, + "loss": 0.6743, + "step": 338 + }, + { + "epoch": 1.33, + "grad_norm": 0.2637156946948773, + "learning_rate": 2.43550361297047e-06, + "loss": 0.682, + "step": 339 + }, + { + "epoch": 1.34, + "grad_norm": 0.26481304373722364, + "learning_rate": 2.408037158447375e-06, + "loss": 0.6838, + "step": 340 + }, + { + "epoch": 1.34, + "grad_norm": 0.3098032445823631, + "learning_rate": 2.3806772493646725e-06, + "loss": 0.6569, + "step": 341 + }, + { + "epoch": 1.34, + "grad_norm": 0.26219269959571206, + "learning_rate": 2.353425010381063e-06, + "loss": 0.6761, + "step": 342 + }, + { + "epoch": 1.35, + "grad_norm": 0.2595963563694489, + "learning_rate": 2.3262815617293517e-06, + "loss": 0.6705, + "step": 343 + }, + { + "epoch": 1.35, + "grad_norm": 6.834107729255299, + "learning_rate": 2.2992480191704003e-06, + "loss": 0.9304, + "step": 344 + }, + { + "epoch": 1.36, + 
"grad_norm": 0.27845487671808766, + "learning_rate": 2.272325493947257e-06, + "loss": 0.7032, + "step": 345 + }, + { + "epoch": 1.36, + "grad_norm": 0.27640918477115356, + "learning_rate": 2.245515092739488e-06, + "loss": 0.6782, + "step": 346 + }, + { + "epoch": 1.36, + "grad_norm": 0.2807587758407648, + "learning_rate": 2.2188179176176767e-06, + "loss": 0.6932, + "step": 347 + }, + { + "epoch": 1.37, + "grad_norm": 0.27767259525555504, + "learning_rate": 2.1922350659981262e-06, + "loss": 0.6466, + "step": 348 + }, + { + "epoch": 1.37, + "grad_norm": 0.267658673895331, + "learning_rate": 2.165767630597752e-06, + "loss": 0.7089, + "step": 349 + }, + { + "epoch": 1.38, + "grad_norm": 0.2745228363539692, + "learning_rate": 2.139416699389153e-06, + "loss": 0.6778, + "step": 350 + }, + { + "epoch": 1.38, + "grad_norm": 0.26408657536920843, + "learning_rate": 2.1131833555559037e-06, + "loss": 0.693, + "step": 351 + }, + { + "epoch": 1.38, + "grad_norm": 0.27582062474841573, + "learning_rate": 2.08706867744802e-06, + "loss": 0.6896, + "step": 352 + }, + { + "epoch": 1.39, + "grad_norm": 0.26826511006390147, + "learning_rate": 2.061073738537635e-06, + "loss": 0.6796, + "step": 353 + }, + { + "epoch": 1.39, + "grad_norm": 0.26630216372896987, + "learning_rate": 2.0351996073748713e-06, + "loss": 0.664, + "step": 354 + }, + { + "epoch": 1.4, + "grad_norm": 0.2809694093215496, + "learning_rate": 2.00944734754392e-06, + "loss": 0.7044, + "step": 355 + }, + { + "epoch": 1.4, + "grad_norm": 1.2984772108139377, + "learning_rate": 1.983818017619318e-06, + "loss": 0.9348, + "step": 356 + }, + { + "epoch": 1.4, + "grad_norm": 0.2721943586746493, + "learning_rate": 1.9583126711224342e-06, + "loss": 0.6918, + "step": 357 + }, + { + "epoch": 1.41, + "grad_norm": 0.27480703238103743, + "learning_rate": 1.932932356478168e-06, + "loss": 0.6854, + "step": 358 + }, + { + "epoch": 1.41, + "grad_norm": 0.27867368393846137, + "learning_rate": 1.9076781169718426e-06, + "loss": 0.6892, + "step": 
359 + }, + { + "epoch": 1.42, + "grad_norm": 0.2747868621778029, + "learning_rate": 1.8825509907063328e-06, + "loss": 0.6947, + "step": 360 + }, + { + "epoch": 1.42, + "grad_norm": 0.2819831023326237, + "learning_rate": 1.857552010559382e-06, + "loss": 0.7059, + "step": 361 + }, + { + "epoch": 1.42, + "grad_norm": 0.27392909848006375, + "learning_rate": 1.8326822041411524e-06, + "loss": 0.6909, + "step": 362 + }, + { + "epoch": 1.43, + "grad_norm": 0.2743865258535757, + "learning_rate": 1.8079425937519729e-06, + "loss": 0.679, + "step": 363 + }, + { + "epoch": 1.43, + "grad_norm": 2.5387012983068686, + "learning_rate": 1.7833341963403312e-06, + "loss": 0.8855, + "step": 364 + }, + { + "epoch": 1.44, + "grad_norm": 0.265533513891752, + "learning_rate": 1.7588580234610592e-06, + "loss": 0.6915, + "step": 365 + }, + { + "epoch": 1.44, + "grad_norm": 0.27359948413194535, + "learning_rate": 1.7345150812337564e-06, + "loss": 0.6983, + "step": 366 + }, + { + "epoch": 1.44, + "grad_norm": 0.2742529435490673, + "learning_rate": 1.7103063703014372e-06, + "loss": 0.6712, + "step": 367 + }, + { + "epoch": 1.45, + "grad_norm": 0.27199602044830407, + "learning_rate": 1.6862328857893856e-06, + "loss": 0.6929, + "step": 368 + }, + { + "epoch": 1.45, + "grad_norm": 0.26752081718353027, + "learning_rate": 1.6622956172642601e-06, + "loss": 0.6693, + "step": 369 + }, + { + "epoch": 1.46, + "grad_norm": 0.2729904749450422, + "learning_rate": 1.6384955486934157e-06, + "loss": 0.6545, + "step": 370 + }, + { + "epoch": 1.46, + "grad_norm": 0.27686659493721505, + "learning_rate": 1.6148336584044539e-06, + "loss": 0.6957, + "step": 371 + }, + { + "epoch": 1.46, + "grad_norm": 0.27203211099316454, + "learning_rate": 1.5913109190450033e-06, + "loss": 0.6709, + "step": 372 + }, + { + "epoch": 1.47, + "grad_norm": 0.2748872025880921, + "learning_rate": 1.567928297542749e-06, + "loss": 0.6648, + "step": 373 + }, + { + "epoch": 1.47, + "grad_norm": 0.28544562477258, + "learning_rate": 
1.544686755065677e-06, + "loss": 0.6937, + "step": 374 + }, + { + "epoch": 1.48, + "grad_norm": 0.27110159404128226, + "learning_rate": 1.5215872469825682e-06, + "loss": 0.6593, + "step": 375 + }, + { + "epoch": 1.48, + "eval_loss": 0.6996302008628845, + "eval_runtime": 96.9399, + "eval_samples_per_second": 18.238, + "eval_steps_per_second": 0.382, + "step": 375 + }, + { + "epoch": 1.48, + "grad_norm": 0.28648432605335455, + "learning_rate": 1.4986307228237268e-06, + "loss": 0.6883, + "step": 376 + }, + { + "epoch": 1.48, + "grad_norm": 0.2695264027977092, + "learning_rate": 1.4758181262419425e-06, + "loss": 0.6696, + "step": 377 + }, + { + "epoch": 1.49, + "grad_norm": 0.2786040566891135, + "learning_rate": 1.4531503949737107e-06, + "loss": 0.6768, + "step": 378 + }, + { + "epoch": 1.49, + "grad_norm": 0.2730138945863401, + "learning_rate": 1.4306284608006837e-06, + "loss": 0.699, + "step": 379 + }, + { + "epoch": 1.5, + "grad_norm": 0.28711818986138005, + "learning_rate": 1.4082532495113627e-06, + "loss": 0.6961, + "step": 380 + }, + { + "epoch": 1.5, + "grad_norm": 0.27838100192134935, + "learning_rate": 1.3860256808630429e-06, + "loss": 0.6589, + "step": 381 + }, + { + "epoch": 1.5, + "grad_norm": 0.2798399005913698, + "learning_rate": 1.3639466685440133e-06, + "loss": 0.6924, + "step": 382 + }, + { + "epoch": 1.51, + "grad_norm": 0.2801056534585314, + "learning_rate": 1.3420171201359933e-06, + "loss": 0.7047, + "step": 383 + }, + { + "epoch": 1.51, + "grad_norm": 0.28576658015544487, + "learning_rate": 1.3202379370768254e-06, + "loss": 0.6617, + "step": 384 + }, + { + "epoch": 1.52, + "grad_norm": 1.3100321738765892, + "learning_rate": 1.298610014623423e-06, + "loss": 0.9236, + "step": 385 + }, + { + "epoch": 1.52, + "grad_norm": 0.303191205875695, + "learning_rate": 1.2771342418149658e-06, + "loss": 0.6896, + "step": 386 + }, + { + "epoch": 1.52, + "grad_norm": 0.29469211064765755, + "learning_rate": 1.2558115014363592e-06, + "loss": 0.6804, + "step": 387 + 
}, + { + "epoch": 1.53, + "grad_norm": 0.2772656639598833, + "learning_rate": 1.234642669981946e-06, + "loss": 0.7043, + "step": 388 + }, + { + "epoch": 1.53, + "grad_norm": 0.28874341170670975, + "learning_rate": 1.2136286176194744e-06, + "loss": 0.6839, + "step": 389 + }, + { + "epoch": 1.54, + "grad_norm": 0.29443238526351323, + "learning_rate": 1.1927702081543279e-06, + "loss": 0.6852, + "step": 390 + }, + { + "epoch": 1.54, + "grad_norm": 0.28011985365824127, + "learning_rate": 1.1720682989940264e-06, + "loss": 0.7019, + "step": 391 + }, + { + "epoch": 1.54, + "grad_norm": 0.2987249208096274, + "learning_rate": 1.1515237411129698e-06, + "loss": 0.6625, + "step": 392 + }, + { + "epoch": 1.55, + "grad_norm": 0.30150125882304396, + "learning_rate": 1.1311373790174656e-06, + "loss": 0.7102, + "step": 393 + }, + { + "epoch": 1.55, + "grad_norm": 0.28396138619493894, + "learning_rate": 1.1109100507110133e-06, + "loss": 0.6538, + "step": 394 + }, + { + "epoch": 1.56, + "grad_norm": 0.28445333602874173, + "learning_rate": 1.0908425876598512e-06, + "loss": 0.6719, + "step": 395 + }, + { + "epoch": 1.56, + "grad_norm": 0.2914051878027514, + "learning_rate": 1.0709358147587883e-06, + "loss": 0.6803, + "step": 396 + }, + { + "epoch": 1.56, + "grad_norm": 0.2969873740157493, + "learning_rate": 1.0511905502972885e-06, + "loss": 0.6845, + "step": 397 + }, + { + "epoch": 1.57, + "grad_norm": 0.27955221055069657, + "learning_rate": 1.031607605925839e-06, + "loss": 0.6819, + "step": 398 + }, + { + "epoch": 1.57, + "grad_norm": 0.2840904640426781, + "learning_rate": 1.0121877866225783e-06, + "loss": 0.6685, + "step": 399 + }, + { + "epoch": 1.58, + "grad_norm": 0.2866769315662431, + "learning_rate": 9.929318906602176e-07, + "loss": 0.7126, + "step": 400 + }, + { + "epoch": 1.58, + "grad_norm": 0.28635331619928306, + "learning_rate": 9.738407095732195e-07, + "loss": 0.6825, + "step": 401 + }, + { + "epoch": 1.58, + "grad_norm": 0.29612030431665504, + "learning_rate": 
9.549150281252633e-07, + "loss": 0.6889, + "step": 402 + }, + { + "epoch": 1.59, + "grad_norm": 0.2800350134356635, + "learning_rate": 9.361556242769871e-07, + "loss": 0.6902, + "step": 403 + }, + { + "epoch": 1.59, + "grad_norm": 0.303825841465723, + "learning_rate": 9.175632691540065e-07, + "loss": 0.6949, + "step": 404 + }, + { + "epoch": 1.6, + "grad_norm": 0.2869303466691903, + "learning_rate": 8.991387270152202e-07, + "loss": 0.6953, + "step": 405 + }, + { + "epoch": 1.6, + "grad_norm": 0.2822567891211309, + "learning_rate": 8.808827552213917e-07, + "loss": 0.6733, + "step": 406 + }, + { + "epoch": 1.6, + "grad_norm": 0.29613652418881947, + "learning_rate": 8.627961042040183e-07, + "loss": 0.6721, + "step": 407 + }, + { + "epoch": 1.61, + "grad_norm": 0.2904383828418472, + "learning_rate": 8.448795174344803e-07, + "loss": 0.6849, + "step": 408 + }, + { + "epoch": 1.61, + "grad_norm": 0.28422518396116003, + "learning_rate": 8.271337313934869e-07, + "loss": 0.676, + "step": 409 + }, + { + "epoch": 1.62, + "grad_norm": 0.2998884374161429, + "learning_rate": 8.095594755407971e-07, + "loss": 0.72, + "step": 410 + }, + { + "epoch": 1.62, + "grad_norm": 0.2877123964038153, + "learning_rate": 7.921574722852343e-07, + "loss": 0.686, + "step": 411 + }, + { + "epoch": 1.62, + "grad_norm": 0.2848351263069502, + "learning_rate": 7.749284369549954e-07, + "loss": 0.6755, + "step": 412 + }, + { + "epoch": 1.63, + "grad_norm": 3.762805837262192, + "learning_rate": 7.578730777682386e-07, + "loss": 0.9037, + "step": 413 + }, + { + "epoch": 1.63, + "grad_norm": 0.2782769592021476, + "learning_rate": 7.409920958039795e-07, + "loss": 0.6686, + "step": 414 + }, + { + "epoch": 1.64, + "grad_norm": 0.28369386272785363, + "learning_rate": 7.242861849732696e-07, + "loss": 0.6772, + "step": 415 + }, + { + "epoch": 1.64, + "grad_norm": 0.28870985589458403, + "learning_rate": 7.077560319906696e-07, + "loss": 0.6665, + "step": 416 + }, + { + "epoch": 1.64, + "grad_norm": 
0.2880267458624612, + "learning_rate": 6.914023163460248e-07, + "loss": 0.6767, + "step": 417 + }, + { + "epoch": 1.65, + "grad_norm": 0.2879073116640725, + "learning_rate": 6.752257102765325e-07, + "loss": 0.6733, + "step": 418 + }, + { + "epoch": 1.65, + "grad_norm": 0.2978223401759706, + "learning_rate": 6.592268787391077e-07, + "loss": 0.707, + "step": 419 + }, + { + "epoch": 1.66, + "grad_norm": 0.2781074725093229, + "learning_rate": 6.43406479383053e-07, + "loss": 0.6962, + "step": 420 + }, + { + "epoch": 1.66, + "grad_norm": 0.29577562012306474, + "learning_rate": 6.277651625230219e-07, + "loss": 0.6772, + "step": 421 + }, + { + "epoch": 1.66, + "grad_norm": 0.2848699679509908, + "learning_rate": 6.12303571112286e-07, + "loss": 0.7008, + "step": 422 + }, + { + "epoch": 1.67, + "grad_norm": 0.2728708533046375, + "learning_rate": 5.9702234071631e-07, + "loss": 0.6994, + "step": 423 + }, + { + "epoch": 1.67, + "grad_norm": 0.2971147397482144, + "learning_rate": 5.819220994866237e-07, + "loss": 0.6784, + "step": 424 + }, + { + "epoch": 1.67, + "grad_norm": 0.2918077307247773, + "learning_rate": 5.670034681349995e-07, + "loss": 0.6798, + "step": 425 + }, + { + "epoch": 1.68, + "grad_norm": 0.2766527969263755, + "learning_rate": 5.522670599079416e-07, + "loss": 0.692, + "step": 426 + }, + { + "epoch": 1.68, + "grad_norm": 0.2896023594106076, + "learning_rate": 5.377134805614714e-07, + "loss": 0.6885, + "step": 427 + }, + { + "epoch": 1.69, + "grad_norm": 0.29226174571780184, + "learning_rate": 5.233433283362349e-07, + "loss": 0.6609, + "step": 428 + }, + { + "epoch": 1.69, + "grad_norm": 0.30313380575685006, + "learning_rate": 5.091571939329049e-07, + "loss": 0.6559, + "step": 429 + }, + { + "epoch": 1.69, + "grad_norm": 0.2851579977806079, + "learning_rate": 4.951556604879049e-07, + "loss": 0.6862, + "step": 430 + }, + { + "epoch": 1.7, + "grad_norm": 0.28828722620682673, + "learning_rate": 4.813393035494329e-07, + "loss": 0.673, + "step": 431 + }, + { + "epoch": 
1.7, + "grad_norm": 0.283083619090625, + "learning_rate": 4.677086910538092e-07, + "loss": 0.6477, + "step": 432 + }, + { + "epoch": 1.71, + "grad_norm": 0.29809618255032516, + "learning_rate": 4.542643833021254e-07, + "loss": 0.7054, + "step": 433 + }, + { + "epoch": 1.71, + "grad_norm": 0.3086409841744957, + "learning_rate": 4.410069329372152e-07, + "loss": 0.6763, + "step": 434 + }, + { + "epoch": 1.71, + "grad_norm": 0.3016981542599131, + "learning_rate": 4.279368849209381e-07, + "loss": 0.6843, + "step": 435 + }, + { + "epoch": 1.72, + "grad_norm": 0.27698126686942587, + "learning_rate": 4.150547765117746e-07, + "loss": 0.6891, + "step": 436 + }, + { + "epoch": 1.72, + "grad_norm": 0.2793586730481018, + "learning_rate": 4.0236113724274716e-07, + "loss": 0.6796, + "step": 437 + }, + { + "epoch": 1.73, + "grad_norm": 0.28666974827878056, + "learning_rate": 3.8985648889964755e-07, + "loss": 0.6648, + "step": 438 + }, + { + "epoch": 1.73, + "grad_norm": 0.28527293041641727, + "learning_rate": 3.77541345499593e-07, + "loss": 0.7071, + "step": 439 + }, + { + "epoch": 1.73, + "grad_norm": 0.29843069047155474, + "learning_rate": 3.6541621326989183e-07, + "loss": 0.6803, + "step": 440 + }, + { + "epoch": 1.74, + "grad_norm": 0.3001228333782996, + "learning_rate": 3.534815906272404e-07, + "loss": 0.7176, + "step": 441 + }, + { + "epoch": 1.74, + "grad_norm": 0.2928174202718284, + "learning_rate": 3.417379681572297e-07, + "loss": 0.6747, + "step": 442 + }, + { + "epoch": 1.75, + "grad_norm": 0.2951128501223636, + "learning_rate": 3.301858285941845e-07, + "loss": 0.7046, + "step": 443 + }, + { + "epoch": 1.75, + "grad_norm": 0.2931876464433515, + "learning_rate": 3.18825646801314e-07, + "loss": 0.6734, + "step": 444 + }, + { + "epoch": 1.75, + "grad_norm": 0.29349560436630445, + "learning_rate": 3.076578897511978e-07, + "loss": 0.6852, + "step": 445 + }, + { + "epoch": 1.76, + "grad_norm": 0.7457047514934827, + "learning_rate": 2.966830165065876e-07, + "loss": 0.9017, + 
"step": 446 + }, + { + "epoch": 1.76, + "grad_norm": 0.2813462622367945, + "learning_rate": 2.8590147820153513e-07, + "loss": 0.6969, + "step": 447 + }, + { + "epoch": 1.77, + "grad_norm": 0.30904433658187347, + "learning_rate": 2.7531371802285436e-07, + "loss": 0.6829, + "step": 448 + }, + { + "epoch": 1.77, + "grad_norm": 0.28837856935691286, + "learning_rate": 2.6492017119189415e-07, + "loss": 0.6527, + "step": 449 + }, + { + "epoch": 1.77, + "grad_norm": 0.2940858357017207, + "learning_rate": 2.547212649466568e-07, + "loss": 0.6696, + "step": 450 + }, + { + "epoch": 1.78, + "grad_norm": 0.27956534271888384, + "learning_rate": 2.447174185242324e-07, + "loss": 0.7048, + "step": 451 + }, + { + "epoch": 1.78, + "grad_norm": 0.28259810422391984, + "learning_rate": 2.3490904314356412e-07, + "loss": 0.6772, + "step": 452 + }, + { + "epoch": 1.79, + "grad_norm": 0.2846763408984384, + "learning_rate": 2.2529654198854834e-07, + "loss": 0.7507, + "step": 453 + }, + { + "epoch": 1.79, + "grad_norm": 0.2784656918785677, + "learning_rate": 2.1588031019145638e-07, + "loss": 0.7072, + "step": 454 + }, + { + "epoch": 1.79, + "grad_norm": 0.28593045031546543, + "learning_rate": 2.0666073481669714e-07, + "loss": 0.6944, + "step": 455 + }, + { + "epoch": 1.8, + "grad_norm": 0.296176705173677, + "learning_rate": 1.9763819484490353e-07, + "loss": 0.6691, + "step": 456 + }, + { + "epoch": 1.8, + "grad_norm": 0.2923471158891105, + "learning_rate": 1.8881306115735632e-07, + "loss": 0.705, + "step": 457 + }, + { + "epoch": 1.81, + "grad_norm": 0.2769134760770555, + "learning_rate": 1.801856965207338e-07, + "loss": 0.6845, + "step": 458 + }, + { + "epoch": 1.81, + "grad_norm": 0.30153347516541434, + "learning_rate": 1.7175645557220567e-07, + "loss": 0.6935, + "step": 459 + }, + { + "epoch": 1.81, + "grad_norm": 0.29938078931561396, + "learning_rate": 1.6352568480485277e-07, + "loss": 0.6822, + "step": 460 + }, + { + "epoch": 1.82, + "grad_norm": 0.2909845977727433, + "learning_rate": 
1.5549372255342367e-07, + "loss": 0.6959, + "step": 461 + }, + { + "epoch": 1.82, + "grad_norm": 0.2851155920589762, + "learning_rate": 1.4766089898042678e-07, + "loss": 0.6909, + "step": 462 + }, + { + "epoch": 1.83, + "grad_norm": 3.590844633307425, + "learning_rate": 1.4002753606256082e-07, + "loss": 0.9279, + "step": 463 + }, + { + "epoch": 1.83, + "grad_norm": 0.289769942223222, + "learning_rate": 1.3259394757747678e-07, + "loss": 0.6664, + "step": 464 + }, + { + "epoch": 1.83, + "grad_norm": 1.4756345980091514, + "learning_rate": 1.253604390908819e-07, + "loss": 0.9066, + "step": 465 + }, + { + "epoch": 1.84, + "grad_norm": 0.2905542054012534, + "learning_rate": 1.1832730794397951e-07, + "loss": 0.6989, + "step": 466 + }, + { + "epoch": 1.84, + "grad_norm": 0.3056790622208962, + "learning_rate": 1.1149484324124326e-07, + "loss": 0.64, + "step": 467 + }, + { + "epoch": 1.85, + "grad_norm": 0.2915224050071343, + "learning_rate": 1.0486332583853565e-07, + "loss": 0.6411, + "step": 468 + }, + { + "epoch": 1.85, + "grad_norm": 0.2947477867782055, + "learning_rate": 9.843302833156377e-08, + "loss": 0.6901, + "step": 469 + }, + { + "epoch": 1.85, + "grad_norm": 0.6968434274277037, + "learning_rate": 9.22042150446728e-08, + "loss": 0.9234, + "step": 470 + }, + { + "epoch": 1.86, + "grad_norm": 0.28365980605268926, + "learning_rate": 8.617714201998084e-08, + "loss": 0.6871, + "step": 471 + }, + { + "epoch": 1.86, + "grad_norm": 0.29456041125148436, + "learning_rate": 8.035205700685167e-08, + "loss": 0.6841, + "step": 472 + }, + { + "epoch": 1.87, + "grad_norm": 0.3143240746562965, + "learning_rate": 7.47291994517163e-08, + "loss": 0.6793, + "step": 473 + }, + { + "epoch": 1.87, + "grad_norm": 0.3002190997085016, + "learning_rate": 6.930880048822531e-08, + "loss": 0.6909, + "step": 474 + }, + { + "epoch": 1.87, + "grad_norm": 0.28198233683666907, + "learning_rate": 6.409108292774912e-08, + "loss": 0.6677, + "step": 475 + }, + { + "epoch": 1.88, + "grad_norm": 
0.2974351663902672, + "learning_rate": 5.907626125022159e-08, + "loss": 0.6863, + "step": 476 + }, + { + "epoch": 1.88, + "grad_norm": 0.3167342201027022, + "learning_rate": 5.426454159531913e-08, + "loss": 0.6728, + "step": 477 + }, + { + "epoch": 1.89, + "grad_norm": 0.2838121481556639, + "learning_rate": 4.9656121753990924e-08, + "loss": 0.6765, + "step": 478 + }, + { + "epoch": 1.89, + "grad_norm": 0.28117138518414553, + "learning_rate": 4.52511911603265e-08, + "loss": 0.6827, + "step": 479 + }, + { + "epoch": 1.89, + "grad_norm": 0.29399576903346636, + "learning_rate": 4.104993088376974e-08, + "loss": 0.6933, + "step": 480 + }, + { + "epoch": 1.9, + "grad_norm": 0.29636385882946953, + "learning_rate": 3.705251362167484e-08, + "loss": 0.6641, + "step": 481 + }, + { + "epoch": 1.9, + "grad_norm": 0.2913085312105263, + "learning_rate": 3.325910369220975e-08, + "loss": 0.6973, + "step": 482 + }, + { + "epoch": 1.91, + "grad_norm": 0.29080057862930364, + "learning_rate": 2.966985702759828e-08, + "loss": 0.6678, + "step": 483 + }, + { + "epoch": 1.91, + "grad_norm": 0.2952306166117519, + "learning_rate": 2.6284921167712975e-08, + "loss": 0.7017, + "step": 484 + }, + { + "epoch": 1.91, + "grad_norm": 0.2959396595629797, + "learning_rate": 2.3104435254008852e-08, + "loss": 0.6569, + "step": 485 + }, + { + "epoch": 1.92, + "grad_norm": 0.28791733733513286, + "learning_rate": 2.012853002380466e-08, + "loss": 0.6573, + "step": 486 + }, + { + "epoch": 1.92, + "grad_norm": 0.2923526176448832, + "learning_rate": 1.735732780490884e-08, + "loss": 0.6903, + "step": 487 + }, + { + "epoch": 1.93, + "grad_norm": 0.2908268098513957, + "learning_rate": 1.4790942510590767e-08, + "loss": 0.6756, + "step": 488 + }, + { + "epoch": 1.93, + "grad_norm": 0.29069043095745606, + "learning_rate": 1.2429479634897268e-08, + "loss": 0.6722, + "step": 489 + }, + { + "epoch": 1.93, + "grad_norm": 0.28912174671892155, + "learning_rate": 1.0273036248318325e-08, + "loss": 0.6927, + "step": 490 + }, 
+ { + "epoch": 1.94, + "grad_norm": 0.29773909318477504, + "learning_rate": 8.321700993795812e-09, + "loss": 0.6703, + "step": 491 + }, + { + "epoch": 1.94, + "grad_norm": 0.2846360921300275, + "learning_rate": 6.575554083078084e-09, + "loss": 0.6915, + "step": 492 + }, + { + "epoch": 1.95, + "grad_norm": 0.3040183654289367, + "learning_rate": 5.034667293427053e-09, + "loss": 0.6836, + "step": 493 + }, + { + "epoch": 1.95, + "grad_norm": 0.29012455377167307, + "learning_rate": 3.6991039646616657e-09, + "loss": 0.6844, + "step": 494 + }, + { + "epoch": 1.95, + "grad_norm": 0.2778518633390048, + "learning_rate": 2.568918996560532e-09, + "loss": 0.6779, + "step": 495 + }, + { + "epoch": 1.96, + "grad_norm": 0.29541155663074187, + "learning_rate": 1.6441588466009627e-09, + "loss": 0.6979, + "step": 496 + }, + { + "epoch": 1.96, + "grad_norm": 0.28364315270086676, + "learning_rate": 9.248615280499362e-10, + "loss": 0.6792, + "step": 497 + }, + { + "epoch": 1.97, + "grad_norm": 0.2865142006406564, + "learning_rate": 4.1105660840368154e-10, + "loss": 0.7034, + "step": 498 + }, + { + "epoch": 1.97, + "grad_norm": 0.2854424675916699, + "learning_rate": 1.0276520816976388e-10, + "loss": 0.6747, + "step": 499 + }, + { + "epoch": 1.97, + "grad_norm": 0.29666466368391803, + "learning_rate": 0.0, + "loss": 0.6754, + "step": 500 + }, + { + "epoch": 1.97, + "eval_loss": 0.6983408331871033, + "eval_runtime": 93.907, + "eval_samples_per_second": 18.827, + "eval_steps_per_second": 0.394, + "step": 500 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 250, + "total_flos": 1571976955035648.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-500/training_args.bin b/checkpoint-500/training_args.bin new file mode 100644 index 0000000..f514ebe --- /dev/null +++ b/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:65ef0124c2666d0d78b18fb6fcae7801286925d17b161abe928ae1b45915ae68 +size 7736 diff --git a/checkpoint-500/vocab.json b/checkpoint-500/vocab.json new file mode 100644 index 0000000..6c49fc6 --- /dev/null +++ b/checkpoint-500/vocab.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910 +size 2776833 diff --git a/checkpoint-500/zero_to_fp32.py b/checkpoint-500/zero_to_fp32.py new file mode 100644 index 0000000..24cc342 --- /dev/null +++ b/checkpoint-500/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + 
partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + 
state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. 
you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. 
Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/config.json b/config.json new file mode 100644 index 0000000..e858a09 --- /dev/null +++ b/config.json @@ -0,0 +1,27 @@ +{ + "_name_or_path": "Qwen/Qwen2-7B", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 131072, + "max_window_layers": 28, + "model_type": "qwen2", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.40.0.dev0", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 152064 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..85199a2 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,7 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": 151643, + "max_new_tokens": 2048, + "transformers_version": "4.40.0.dev0" +} diff --git a/merges.txt b/merges.txt new file mode 100644 index 0000000..80c1a19 --- /dev/null +++ b/merges.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5 +size 1671853 diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..3fd8dab --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f421710e83034813f0366192f32dd36a6005990365885d2b7b3fad1f95ee71a1 +size 4877660776 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..b7bc6db --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90ee3add1c94133fccf2b4c5a11fdea6167e7de07c2f58f1cbfc8e7da0844518 +size 4932751008 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..9e3b0af --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61e3e7d2c4b53ec95c1ad1e8a2c2770709a5ab20cb556486922f3722569615e8 +size 4330865200 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..7193b5f --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7bb33e9d51c137141218117b751ad1da933bd40ac5c30ca729aea7e79c68ed7 +size 1089994880 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..6ca5084 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 15231233024 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": 
"model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + 
"model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.bias": 
"model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.bias": 
"model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + 
"model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + 
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00003-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..2938465 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,20 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>" + ], + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..3ad2571 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14af073bd227de69ecbeab6dc6afdcae303bbbc1887df836548350491ecf3b67 +size 7028209 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..186e976 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_prefix_space": false, + 
"added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>" + ], + "bos_token": null, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "model_max_length": 32768, + "pad_token": "<|end_of_text|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..3019e97 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,3561 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9740777666999003, + "eval_steps": 125, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 1.8329473944625845, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9189, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 0.8840048313140869, + "eval_runtime": 99.9262, + "eval_samples_per_second": 17.693, + "eval_steps_per_second": 0.37, + "step": 
1 + }, + { + "epoch": 0.01, + "grad_norm": 1.7916344264608899, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8962, + "step": 2 + }, + { + "epoch": 0.01, + "grad_norm": 1.8909931480365287, + "learning_rate": 3e-06, + "loss": 0.8805, + "step": 3 + }, + { + "epoch": 0.02, + "grad_norm": 1.6318273112027453, + "learning_rate": 4.000000000000001e-06, + "loss": 0.913, + "step": 4 + }, + { + "epoch": 0.02, + "grad_norm": 1.2463401136319747, + "learning_rate": 5e-06, + "loss": 0.908, + "step": 5 + }, + { + "epoch": 0.02, + "grad_norm": 1.1463980681106876, + "learning_rate": 6e-06, + "loss": 0.8729, + "step": 6 + }, + { + "epoch": 0.03, + "grad_norm": 0.9477573494094379, + "learning_rate": 7e-06, + "loss": 0.8411, + "step": 7 + }, + { + "epoch": 0.03, + "grad_norm": 4.0165120162042935, + "learning_rate": 8.000000000000001e-06, + "loss": 1.0541, + "step": 8 + }, + { + "epoch": 0.04, + "grad_norm": 1.0713771331971476, + "learning_rate": 9e-06, + "loss": 0.8495, + "step": 9 + }, + { + "epoch": 0.04, + "grad_norm": 0.8667235558943894, + "learning_rate": 1e-05, + "loss": 0.8199, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 0.7411429457268661, + "learning_rate": 9.999897234791831e-06, + "loss": 0.7964, + "step": 11 + }, + { + "epoch": 0.05, + "grad_norm": 0.5729968036750446, + "learning_rate": 9.999588943391597e-06, + "loss": 0.8039, + "step": 12 + }, + { + "epoch": 0.05, + "grad_norm": 0.5402964207486183, + "learning_rate": 9.99907513847195e-06, + "loss": 0.8287, + "step": 13 + }, + { + "epoch": 0.06, + "grad_norm": 0.5633328266442124, + "learning_rate": 9.9983558411534e-06, + "loss": 0.7842, + "step": 14 + }, + { + "epoch": 0.06, + "grad_norm": 0.5412290905686791, + "learning_rate": 9.99743108100344e-06, + "loss": 0.8345, + "step": 15 + }, + { + "epoch": 0.06, + "grad_norm": 0.4895379189634968, + "learning_rate": 9.99630089603534e-06, + "loss": 0.7922, + "step": 16 + }, + { + "epoch": 0.07, + "grad_norm": 0.5088260537094976, + "learning_rate": 
9.994965332706574e-06, + "loss": 0.7969, + "step": 17 + }, + { + "epoch": 0.07, + "grad_norm": 0.47075507205524136, + "learning_rate": 9.993424445916923e-06, + "loss": 0.7931, + "step": 18 + }, + { + "epoch": 0.08, + "grad_norm": 0.3878407143429931, + "learning_rate": 9.991678299006206e-06, + "loss": 0.8041, + "step": 19 + }, + { + "epoch": 0.08, + "grad_norm": 0.3873731682942636, + "learning_rate": 9.989726963751683e-06, + "loss": 0.8107, + "step": 20 + }, + { + "epoch": 0.08, + "grad_norm": 0.42417629604043083, + "learning_rate": 9.987570520365105e-06, + "loss": 0.7874, + "step": 21 + }, + { + "epoch": 0.09, + "grad_norm": 0.4324680733617199, + "learning_rate": 9.98520905748941e-06, + "loss": 0.8025, + "step": 22 + }, + { + "epoch": 0.09, + "grad_norm": 0.34546199757993784, + "learning_rate": 9.982642672195093e-06, + "loss": 0.8048, + "step": 23 + }, + { + "epoch": 0.1, + "grad_norm": 0.35958771648273496, + "learning_rate": 9.979871469976197e-06, + "loss": 0.831, + "step": 24 + }, + { + "epoch": 0.1, + "grad_norm": 0.3940074197908444, + "learning_rate": 9.976895564745993e-06, + "loss": 0.7944, + "step": 25 + }, + { + "epoch": 0.1, + "grad_norm": 0.3818406187889153, + "learning_rate": 9.973715078832288e-06, + "loss": 0.7936, + "step": 26 + }, + { + "epoch": 0.11, + "grad_norm": 10.237346743255186, + "learning_rate": 9.970330142972403e-06, + "loss": 1.0017, + "step": 27 + }, + { + "epoch": 0.11, + "grad_norm": 6.504690612414681, + "learning_rate": 9.966740896307791e-06, + "loss": 1.0329, + "step": 28 + }, + { + "epoch": 0.12, + "grad_norm": 0.4028775109425473, + "learning_rate": 9.962947486378325e-06, + "loss": 0.7702, + "step": 29 + }, + { + "epoch": 0.12, + "grad_norm": 0.39810277536967076, + "learning_rate": 9.95895006911623e-06, + "loss": 0.771, + "step": 30 + }, + { + "epoch": 0.12, + "grad_norm": 0.29862663396811506, + "learning_rate": 9.954748808839675e-06, + "loss": 0.7767, + "step": 31 + }, + { + "epoch": 0.13, + "grad_norm": 0.3106188272362696, + 
"learning_rate": 9.950343878246011e-06, + "loss": 0.7943, + "step": 32 + }, + { + "epoch": 0.13, + "grad_norm": 0.34702364911134964, + "learning_rate": 9.945735458404681e-06, + "loss": 0.7972, + "step": 33 + }, + { + "epoch": 0.14, + "grad_norm": 0.3216448960978253, + "learning_rate": 9.94092373874978e-06, + "loss": 0.7847, + "step": 34 + }, + { + "epoch": 0.14, + "grad_norm": 0.31232207978504006, + "learning_rate": 9.935908917072253e-06, + "loss": 0.7738, + "step": 35 + }, + { + "epoch": 0.14, + "grad_norm": 0.3004886604892709, + "learning_rate": 9.930691199511775e-06, + "loss": 0.7877, + "step": 36 + }, + { + "epoch": 0.15, + "grad_norm": 0.2870013960815822, + "learning_rate": 9.925270800548285e-06, + "loss": 0.754, + "step": 37 + }, + { + "epoch": 0.15, + "grad_norm": 0.28322113595593756, + "learning_rate": 9.91964794299315e-06, + "loss": 0.7445, + "step": 38 + }, + { + "epoch": 0.16, + "grad_norm": 0.3065117198934518, + "learning_rate": 9.91382285798002e-06, + "loss": 0.787, + "step": 39 + }, + { + "epoch": 0.16, + "grad_norm": 0.2727693466806482, + "learning_rate": 9.907795784955327e-06, + "loss": 0.7865, + "step": 40 + }, + { + "epoch": 0.16, + "grad_norm": 0.2746198009076503, + "learning_rate": 9.901566971668437e-06, + "loss": 0.7755, + "step": 41 + }, + { + "epoch": 0.17, + "grad_norm": 0.2888207750688948, + "learning_rate": 9.895136674161466e-06, + "loss": 0.7789, + "step": 42 + }, + { + "epoch": 0.17, + "grad_norm": 0.26218141394209254, + "learning_rate": 9.888505156758758e-06, + "loss": 0.7781, + "step": 43 + }, + { + "epoch": 0.18, + "grad_norm": 0.27028128323788914, + "learning_rate": 9.881672692056022e-06, + "loss": 0.7596, + "step": 44 + }, + { + "epoch": 0.18, + "grad_norm": 0.301432889634355, + "learning_rate": 9.874639560909118e-06, + "loss": 0.7746, + "step": 45 + }, + { + "epoch": 0.18, + "grad_norm": 0.27768870163187315, + "learning_rate": 9.867406052422525e-06, + "loss": 0.7751, + "step": 46 + }, + { + "epoch": 0.19, + "grad_norm": 
0.2638230079020965, + "learning_rate": 9.85997246393744e-06, + "loss": 0.8085, + "step": 47 + }, + { + "epoch": 0.19, + "grad_norm": 0.2826098837962784, + "learning_rate": 9.852339101019574e-06, + "loss": 0.7878, + "step": 48 + }, + { + "epoch": 0.2, + "grad_norm": 0.2673052298412088, + "learning_rate": 9.844506277446577e-06, + "loss": 0.7747, + "step": 49 + }, + { + "epoch": 0.2, + "grad_norm": 0.2589820555015507, + "learning_rate": 9.836474315195148e-06, + "loss": 0.7491, + "step": 50 + }, + { + "epoch": 0.2, + "grad_norm": 0.27744141325372174, + "learning_rate": 9.828243544427795e-06, + "loss": 0.771, + "step": 51 + }, + { + "epoch": 0.21, + "grad_norm": 0.25617202776049003, + "learning_rate": 9.819814303479268e-06, + "loss": 0.789, + "step": 52 + }, + { + "epoch": 0.21, + "grad_norm": 0.25777796417187593, + "learning_rate": 9.811186938842645e-06, + "loss": 0.7498, + "step": 53 + }, + { + "epoch": 0.22, + "grad_norm": 0.26356120702424557, + "learning_rate": 9.802361805155097e-06, + "loss": 0.7618, + "step": 54 + }, + { + "epoch": 0.22, + "grad_norm": 0.24594116238284844, + "learning_rate": 9.793339265183303e-06, + "loss": 0.7647, + "step": 55 + }, + { + "epoch": 0.22, + "grad_norm": 0.2766331605712476, + "learning_rate": 9.784119689808545e-06, + "loss": 0.7757, + "step": 56 + }, + { + "epoch": 0.23, + "grad_norm": 0.2674205732918615, + "learning_rate": 9.774703458011453e-06, + "loss": 0.7479, + "step": 57 + }, + { + "epoch": 0.23, + "grad_norm": 0.25100414008068433, + "learning_rate": 9.765090956856437e-06, + "loss": 0.7629, + "step": 58 + }, + { + "epoch": 0.24, + "grad_norm": 0.2558976905977626, + "learning_rate": 9.755282581475769e-06, + "loss": 0.7368, + "step": 59 + }, + { + "epoch": 0.24, + "grad_norm": 0.2816597522453804, + "learning_rate": 9.745278735053345e-06, + "loss": 0.7675, + "step": 60 + }, + { + "epoch": 0.24, + "grad_norm": 0.27864046582364604, + "learning_rate": 9.735079828808107e-06, + "loss": 0.7693, + "step": 61 + }, + { + "epoch": 0.25, + 
"grad_norm": 0.2537495381298166, + "learning_rate": 9.724686281977146e-06, + "loss": 0.7612, + "step": 62 + }, + { + "epoch": 0.25, + "grad_norm": 0.27161360619636454, + "learning_rate": 9.714098521798466e-06, + "loss": 0.7659, + "step": 63 + }, + { + "epoch": 0.26, + "grad_norm": 0.257282261183055, + "learning_rate": 9.703316983493414e-06, + "loss": 0.77, + "step": 64 + }, + { + "epoch": 0.26, + "grad_norm": 0.2598148868150837, + "learning_rate": 9.692342110248802e-06, + "loss": 0.7637, + "step": 65 + }, + { + "epoch": 0.26, + "grad_norm": 0.25319486577746536, + "learning_rate": 9.681174353198687e-06, + "loss": 0.7529, + "step": 66 + }, + { + "epoch": 0.27, + "grad_norm": 0.2616187230129625, + "learning_rate": 9.669814171405818e-06, + "loss": 0.7482, + "step": 67 + }, + { + "epoch": 0.27, + "grad_norm": 0.2531735015293101, + "learning_rate": 9.658262031842772e-06, + "loss": 0.7507, + "step": 68 + }, + { + "epoch": 0.28, + "grad_norm": 0.2540031125746497, + "learning_rate": 9.64651840937276e-06, + "loss": 0.7573, + "step": 69 + }, + { + "epoch": 0.28, + "grad_norm": 0.26251145119756225, + "learning_rate": 9.63458378673011e-06, + "loss": 0.7617, + "step": 70 + }, + { + "epoch": 0.28, + "grad_norm": 11.659656865913, + "learning_rate": 9.622458654500408e-06, + "loss": 0.9807, + "step": 71 + }, + { + "epoch": 0.29, + "grad_norm": 43.33218979251603, + "learning_rate": 9.610143511100354e-06, + "loss": 1.0213, + "step": 72 + }, + { + "epoch": 0.29, + "grad_norm": 0.29661440448786996, + "learning_rate": 9.597638862757255e-06, + "loss": 0.7597, + "step": 73 + }, + { + "epoch": 0.3, + "grad_norm": 0.2674359864711363, + "learning_rate": 9.584945223488227e-06, + "loss": 0.7716, + "step": 74 + }, + { + "epoch": 0.3, + "grad_norm": 0.2587397735578842, + "learning_rate": 9.572063115079063e-06, + "loss": 0.7654, + "step": 75 + }, + { + "epoch": 0.3, + "grad_norm": 0.27326638279450294, + "learning_rate": 9.558993067062785e-06, + "loss": 0.7832, + "step": 76 + }, + { + "epoch": 
0.31, + "grad_norm": 0.26424783232216553, + "learning_rate": 9.545735616697875e-06, + "loss": 0.7509, + "step": 77 + }, + { + "epoch": 0.31, + "grad_norm": 0.26894661215694415, + "learning_rate": 9.532291308946191e-06, + "loss": 0.7638, + "step": 78 + }, + { + "epoch": 0.32, + "grad_norm": 12.381149099110814, + "learning_rate": 9.518660696450567e-06, + "loss": 0.9726, + "step": 79 + }, + { + "epoch": 0.32, + "grad_norm": 63.276974873593076, + "learning_rate": 9.504844339512096e-06, + "loss": 0.961, + "step": 80 + }, + { + "epoch": 0.32, + "grad_norm": 0.34404347007223207, + "learning_rate": 9.490842806067095e-06, + "loss": 0.7366, + "step": 81 + }, + { + "epoch": 0.33, + "grad_norm": 0.2761892805994169, + "learning_rate": 9.476656671663766e-06, + "loss": 0.7565, + "step": 82 + }, + { + "epoch": 0.33, + "grad_norm": 0.2938700568825168, + "learning_rate": 9.462286519438531e-06, + "loss": 0.7586, + "step": 83 + }, + { + "epoch": 0.33, + "grad_norm": 0.30998708104141814, + "learning_rate": 9.44773294009206e-06, + "loss": 0.747, + "step": 84 + }, + { + "epoch": 0.34, + "grad_norm": 0.2789622879446074, + "learning_rate": 9.432996531865001e-06, + "loss": 0.7381, + "step": 85 + }, + { + "epoch": 0.34, + "grad_norm": 0.3043211841621936, + "learning_rate": 9.418077900513377e-06, + "loss": 0.7648, + "step": 86 + }, + { + "epoch": 0.35, + "grad_norm": 0.27269347275749684, + "learning_rate": 9.40297765928369e-06, + "loss": 0.7287, + "step": 87 + }, + { + "epoch": 0.35, + "grad_norm": 0.29165683068711035, + "learning_rate": 9.387696428887715e-06, + "loss": 0.7714, + "step": 88 + }, + { + "epoch": 0.35, + "grad_norm": 0.29093659611546846, + "learning_rate": 9.372234837476979e-06, + "loss": 0.754, + "step": 89 + }, + { + "epoch": 0.36, + "grad_norm": 0.2622000520062877, + "learning_rate": 9.356593520616948e-06, + "loss": 0.7604, + "step": 90 + }, + { + "epoch": 0.36, + "grad_norm": 0.29648676556314774, + "learning_rate": 9.340773121260893e-06, + "loss": 0.7677, + "step": 91 + }, + 
{ + "epoch": 0.37, + "grad_norm": 0.2971691719126809, + "learning_rate": 9.324774289723469e-06, + "loss": 0.7826, + "step": 92 + }, + { + "epoch": 0.37, + "grad_norm": 0.2695147958164756, + "learning_rate": 9.308597683653976e-06, + "loss": 0.7675, + "step": 93 + }, + { + "epoch": 0.37, + "grad_norm": 0.2947547264550856, + "learning_rate": 9.292243968009332e-06, + "loss": 0.7611, + "step": 94 + }, + { + "epoch": 0.38, + "grad_norm": 0.2698951119585888, + "learning_rate": 9.275713815026732e-06, + "loss": 0.7437, + "step": 95 + }, + { + "epoch": 0.38, + "grad_norm": 0.2922871464880811, + "learning_rate": 9.259007904196023e-06, + "loss": 0.7716, + "step": 96 + }, + { + "epoch": 0.39, + "grad_norm": 13.342043215041077, + "learning_rate": 9.242126922231763e-06, + "loss": 1.0262, + "step": 97 + }, + { + "epoch": 0.39, + "grad_norm": 3.3348436860369577, + "learning_rate": 9.225071563045007e-06, + "loss": 0.9733, + "step": 98 + }, + { + "epoch": 0.39, + "grad_norm": 0.30515152013617763, + "learning_rate": 9.207842527714767e-06, + "loss": 0.7491, + "step": 99 + }, + { + "epoch": 0.4, + "grad_norm": 0.2797372956926758, + "learning_rate": 9.190440524459203e-06, + "loss": 0.7658, + "step": 100 + }, + { + "epoch": 0.4, + "grad_norm": 0.295442681103485, + "learning_rate": 9.172866268606514e-06, + "loss": 0.7359, + "step": 101 + }, + { + "epoch": 0.41, + "grad_norm": 0.2890934213055238, + "learning_rate": 9.15512048256552e-06, + "loss": 0.7783, + "step": 102 + }, + { + "epoch": 0.41, + "grad_norm": 0.2698602196909741, + "learning_rate": 9.137203895795983e-06, + "loss": 0.7476, + "step": 103 + }, + { + "epoch": 0.41, + "grad_norm": 0.30586672847809404, + "learning_rate": 9.119117244778609e-06, + "loss": 0.7494, + "step": 104 + }, + { + "epoch": 0.42, + "grad_norm": 0.28256137853789653, + "learning_rate": 9.10086127298478e-06, + "loss": 0.7347, + "step": 105 + }, + { + "epoch": 0.42, + "grad_norm": 0.2654565204147507, + "learning_rate": 9.082436730845993e-06, + "loss": 0.7282, + 
"step": 106 + }, + { + "epoch": 0.43, + "grad_norm": 0.3110313172517606, + "learning_rate": 9.063844375723014e-06, + "loss": 0.7442, + "step": 107 + }, + { + "epoch": 0.43, + "grad_norm": 0.30406144580285027, + "learning_rate": 9.045084971874738e-06, + "loss": 0.7365, + "step": 108 + }, + { + "epoch": 0.43, + "grad_norm": 0.2578097986689305, + "learning_rate": 9.026159290426782e-06, + "loss": 0.7644, + "step": 109 + }, + { + "epoch": 0.44, + "grad_norm": 0.2875228334986064, + "learning_rate": 9.007068109339783e-06, + "loss": 0.7359, + "step": 110 + }, + { + "epoch": 0.44, + "grad_norm": 0.3007292657335934, + "learning_rate": 8.987812213377423e-06, + "loss": 0.7571, + "step": 111 + }, + { + "epoch": 0.45, + "grad_norm": 0.2705781647990633, + "learning_rate": 8.968392394074164e-06, + "loss": 0.7321, + "step": 112 + }, + { + "epoch": 0.45, + "grad_norm": 7.709717121399015, + "learning_rate": 8.948809449702712e-06, + "loss": 1.0663, + "step": 113 + }, + { + "epoch": 0.45, + "grad_norm": 0.3159530858994423, + "learning_rate": 8.929064185241214e-06, + "loss": 0.7594, + "step": 114 + }, + { + "epoch": 0.46, + "grad_norm": 0.3001925080979955, + "learning_rate": 8.90915741234015e-06, + "loss": 0.7486, + "step": 115 + }, + { + "epoch": 0.46, + "grad_norm": 2.7719217922453914, + "learning_rate": 8.889089949288986e-06, + "loss": 1.0014, + "step": 116 + }, + { + "epoch": 0.47, + "grad_norm": 0.2980097580186808, + "learning_rate": 8.868862620982534e-06, + "loss": 0.7302, + "step": 117 + }, + { + "epoch": 0.47, + "grad_norm": 0.2674175783919389, + "learning_rate": 8.84847625888703e-06, + "loss": 0.7515, + "step": 118 + }, + { + "epoch": 0.47, + "grad_norm": 0.33089914745986465, + "learning_rate": 8.827931701005974e-06, + "loss": 0.7452, + "step": 119 + }, + { + "epoch": 0.48, + "grad_norm": 0.2657873873108844, + "learning_rate": 8.807229791845673e-06, + "loss": 0.7565, + "step": 120 + }, + { + "epoch": 0.48, + "grad_norm": 0.26891569095038903, + "learning_rate": 
8.786371382380527e-06, + "loss": 0.7525, + "step": 121 + }, + { + "epoch": 0.49, + "grad_norm": 0.30828049821760906, + "learning_rate": 8.765357330018056e-06, + "loss": 0.7395, + "step": 122 + }, + { + "epoch": 0.49, + "grad_norm": 0.2703956947723179, + "learning_rate": 8.74418849856364e-06, + "loss": 0.7762, + "step": 123 + }, + { + "epoch": 0.49, + "grad_norm": 0.27665743831770573, + "learning_rate": 8.722865758185036e-06, + "loss": 0.7499, + "step": 124 + }, + { + "epoch": 0.5, + "grad_norm": 0.31328445024397394, + "learning_rate": 8.701389985376578e-06, + "loss": 0.7368, + "step": 125 + }, + { + "epoch": 0.5, + "eval_loss": 0.7192811369895935, + "eval_runtime": 97.0775, + "eval_samples_per_second": 18.212, + "eval_steps_per_second": 0.381, + "step": 125 + }, + { + "epoch": 0.5, + "grad_norm": 0.26565830658390915, + "learning_rate": 8.679762062923176e-06, + "loss": 0.7727, + "step": 126 + }, + { + "epoch": 0.51, + "grad_norm": 0.2768705145101062, + "learning_rate": 8.657982879864007e-06, + "loss": 0.7178, + "step": 127 + }, + { + "epoch": 0.51, + "grad_norm": 0.27658899618203814, + "learning_rate": 8.636053331455986e-06, + "loss": 0.7521, + "step": 128 + }, + { + "epoch": 0.51, + "grad_norm": 0.2687326456666238, + "learning_rate": 8.613974319136959e-06, + "loss": 0.7411, + "step": 129 + }, + { + "epoch": 0.52, + "grad_norm": 0.2618083386724651, + "learning_rate": 8.591746750488639e-06, + "loss": 0.7306, + "step": 130 + }, + { + "epoch": 0.52, + "grad_norm": 0.25666246646393165, + "learning_rate": 8.569371539199316e-06, + "loss": 0.7505, + "step": 131 + }, + { + "epoch": 0.53, + "grad_norm": 0.3203048983481449, + "learning_rate": 8.54684960502629e-06, + "loss": 0.7515, + "step": 132 + }, + { + "epoch": 0.53, + "grad_norm": 0.2521993776332652, + "learning_rate": 8.52418187375806e-06, + "loss": 0.7505, + "step": 133 + }, + { + "epoch": 0.53, + "grad_norm": 0.26591933789428035, + "learning_rate": 8.501369277176275e-06, + "loss": 0.7353, + "step": 134 + }, + { + 
"epoch": 0.54, + "grad_norm": 0.27603393300812845, + "learning_rate": 8.478412753017433e-06, + "loss": 0.7609, + "step": 135 + }, + { + "epoch": 0.54, + "grad_norm": 0.2668194745887302, + "learning_rate": 8.455313244934324e-06, + "loss": 0.7141, + "step": 136 + }, + { + "epoch": 0.55, + "grad_norm": 0.2707242484249978, + "learning_rate": 8.432071702457253e-06, + "loss": 0.7223, + "step": 137 + }, + { + "epoch": 0.55, + "grad_norm": 0.25511126409448154, + "learning_rate": 8.408689080954997e-06, + "loss": 0.7153, + "step": 138 + }, + { + "epoch": 0.55, + "grad_norm": 15.420395873510664, + "learning_rate": 8.38516634159555e-06, + "loss": 1.0042, + "step": 139 + }, + { + "epoch": 0.56, + "grad_norm": 0.3012234081880187, + "learning_rate": 8.361504451306585e-06, + "loss": 0.7713, + "step": 140 + }, + { + "epoch": 0.56, + "grad_norm": 0.2598491249346932, + "learning_rate": 8.337704382735741e-06, + "loss": 0.7288, + "step": 141 + }, + { + "epoch": 0.57, + "grad_norm": 0.26256298238822373, + "learning_rate": 8.313767114210615e-06, + "loss": 0.7379, + "step": 142 + }, + { + "epoch": 0.57, + "grad_norm": 0.2945113973208366, + "learning_rate": 8.289693629698564e-06, + "loss": 0.7401, + "step": 143 + }, + { + "epoch": 0.57, + "grad_norm": 0.24981458420819586, + "learning_rate": 8.265484918766243e-06, + "loss": 0.7512, + "step": 144 + }, + { + "epoch": 0.58, + "grad_norm": 0.2678413297548206, + "learning_rate": 8.241141976538944e-06, + "loss": 0.7449, + "step": 145 + }, + { + "epoch": 0.58, + "grad_norm": 0.2623417103083203, + "learning_rate": 8.216665803659671e-06, + "loss": 0.7647, + "step": 146 + }, + { + "epoch": 0.59, + "grad_norm": 0.263785777979794, + "learning_rate": 8.192057406248028e-06, + "loss": 0.7725, + "step": 147 + }, + { + "epoch": 0.59, + "grad_norm": 0.2519540317965661, + "learning_rate": 8.16731779585885e-06, + "loss": 0.715, + "step": 148 + }, + { + "epoch": 0.59, + "grad_norm": 0.27015785362121375, + "learning_rate": 8.142447989440618e-06, + "loss": 
0.7532, + "step": 149 + }, + { + "epoch": 0.6, + "grad_norm": 0.25863328277564784, + "learning_rate": 8.117449009293668e-06, + "loss": 0.7335, + "step": 150 + }, + { + "epoch": 0.6, + "grad_norm": 0.2550714525590909, + "learning_rate": 8.092321883028157e-06, + "loss": 0.7182, + "step": 151 + }, + { + "epoch": 0.61, + "grad_norm": 0.2752483825047847, + "learning_rate": 8.067067643521834e-06, + "loss": 0.772, + "step": 152 + }, + { + "epoch": 0.61, + "grad_norm": 0.2582002365542859, + "learning_rate": 8.041687328877566e-06, + "loss": 0.7284, + "step": 153 + }, + { + "epoch": 0.61, + "grad_norm": 0.24823845365447103, + "learning_rate": 8.016181982380682e-06, + "loss": 0.7467, + "step": 154 + }, + { + "epoch": 0.62, + "grad_norm": 0.25644169647568194, + "learning_rate": 7.99055265245608e-06, + "loss": 0.7191, + "step": 155 + }, + { + "epoch": 0.62, + "grad_norm": 0.27463144458405375, + "learning_rate": 7.96480039262513e-06, + "loss": 0.7375, + "step": 156 + }, + { + "epoch": 0.63, + "grad_norm": 0.2642553744129046, + "learning_rate": 7.938926261462366e-06, + "loss": 0.7587, + "step": 157 + }, + { + "epoch": 0.63, + "grad_norm": 0.25679436337027056, + "learning_rate": 7.912931322551981e-06, + "loss": 0.7312, + "step": 158 + }, + { + "epoch": 0.63, + "grad_norm": 0.2784563408983632, + "learning_rate": 7.886816644444099e-06, + "loss": 0.7213, + "step": 159 + }, + { + "epoch": 0.64, + "grad_norm": 0.2638439438037005, + "learning_rate": 7.860583300610849e-06, + "loss": 0.7286, + "step": 160 + }, + { + "epoch": 0.64, + "grad_norm": 0.2599487424469649, + "learning_rate": 7.83423236940225e-06, + "loss": 0.7211, + "step": 161 + }, + { + "epoch": 0.65, + "grad_norm": 0.2620169279227201, + "learning_rate": 7.807764934001875e-06, + "loss": 0.7243, + "step": 162 + }, + { + "epoch": 0.65, + "grad_norm": 0.3455142385888707, + "learning_rate": 7.781182082382325e-06, + "loss": 0.7709, + "step": 163 + }, + { + "epoch": 0.65, + "grad_norm": 0.25797958121628417, + "learning_rate": 
7.754484907260513e-06, + "loss": 0.7371, + "step": 164 + }, + { + "epoch": 0.66, + "grad_norm": 0.26245124486082283, + "learning_rate": 7.727674506052744e-06, + "loss": 0.7625, + "step": 165 + }, + { + "epoch": 0.66, + "grad_norm": 0.259525851556208, + "learning_rate": 7.700751980829601e-06, + "loss": 0.7785, + "step": 166 + }, + { + "epoch": 0.67, + "grad_norm": 0.25578616887441574, + "learning_rate": 7.673718438270649e-06, + "loss": 0.7349, + "step": 167 + }, + { + "epoch": 0.67, + "grad_norm": 0.26880908011952676, + "learning_rate": 7.646574989618938e-06, + "loss": 0.7423, + "step": 168 + }, + { + "epoch": 0.67, + "grad_norm": 0.28268284846763475, + "learning_rate": 7.619322750635327e-06, + "loss": 0.8089, + "step": 169 + }, + { + "epoch": 0.68, + "grad_norm": 0.2565025440158926, + "learning_rate": 7.591962841552627e-06, + "loss": 0.708, + "step": 170 + }, + { + "epoch": 0.68, + "grad_norm": 0.2589330645555015, + "learning_rate": 7.564496387029532e-06, + "loss": 0.7276, + "step": 171 + }, + { + "epoch": 0.69, + "grad_norm": 4.072287518324262, + "learning_rate": 7.536924516104411e-06, + "loss": 0.963, + "step": 172 + }, + { + "epoch": 0.69, + "grad_norm": 5.236195816930884, + "learning_rate": 7.509248362148889e-06, + "loss": 0.9786, + "step": 173 + }, + { + "epoch": 0.69, + "grad_norm": 0.3208811366448085, + "learning_rate": 7.481469062821252e-06, + "loss": 0.7417, + "step": 174 + }, + { + "epoch": 0.7, + "grad_norm": 0.280850125752099, + "learning_rate": 7.453587760019691e-06, + "loss": 0.7249, + "step": 175 + }, + { + "epoch": 0.7, + "grad_norm": 0.26692675004354643, + "learning_rate": 7.42560559983536e-06, + "loss": 0.727, + "step": 176 + }, + { + "epoch": 0.71, + "grad_norm": 0.27369623001787907, + "learning_rate": 7.39752373250527e-06, + "loss": 0.7617, + "step": 177 + }, + { + "epoch": 0.71, + "grad_norm": 0.2784142807735342, + "learning_rate": 7.369343312364994e-06, + "loss": 0.7466, + "step": 178 + }, + { + "epoch": 0.71, + "grad_norm": 37.27250895412763, 
+ "learning_rate": 7.34106549780123e-06, + "loss": 1.0667, + "step": 179 + }, + { + "epoch": 0.72, + "grad_norm": 0.2760061039631398, + "learning_rate": 7.312691451204178e-06, + "loss": 0.7244, + "step": 180 + }, + { + "epoch": 0.72, + "grad_norm": 0.25459829854169064, + "learning_rate": 7.284222338919758e-06, + "loss": 0.7364, + "step": 181 + }, + { + "epoch": 0.73, + "grad_norm": 0.26179305555253735, + "learning_rate": 7.255659331201673e-06, + "loss": 0.733, + "step": 182 + }, + { + "epoch": 0.73, + "grad_norm": 0.2604418741829541, + "learning_rate": 7.227003602163296e-06, + "loss": 0.7209, + "step": 183 + }, + { + "epoch": 0.73, + "grad_norm": 0.26185681215109163, + "learning_rate": 7.198256329729412e-06, + "loss": 0.7164, + "step": 184 + }, + { + "epoch": 0.74, + "grad_norm": 16.152387103951856, + "learning_rate": 7.169418695587791e-06, + "loss": 1.0372, + "step": 185 + }, + { + "epoch": 0.74, + "grad_norm": 21.228735850953576, + "learning_rate": 7.140491885140629e-06, + "loss": 1.0402, + "step": 186 + }, + { + "epoch": 0.75, + "grad_norm": 0.28404810159037286, + "learning_rate": 7.1114770874558e-06, + "loss": 0.7006, + "step": 187 + }, + { + "epoch": 0.75, + "grad_norm": 0.26609034714435736, + "learning_rate": 7.082375495217996e-06, + "loss": 0.7537, + "step": 188 + }, + { + "epoch": 0.75, + "grad_norm": 0.25765084929276133, + "learning_rate": 7.053188304679691e-06, + "loss": 0.7302, + "step": 189 + }, + { + "epoch": 0.76, + "grad_norm": 0.26384158886834463, + "learning_rate": 7.023916715611969e-06, + "loss": 0.712, + "step": 190 + }, + { + "epoch": 0.76, + "grad_norm": 0.27151931787506317, + "learning_rate": 6.994561931255209e-06, + "loss": 0.7502, + "step": 191 + }, + { + "epoch": 0.77, + "grad_norm": 0.27031492068457535, + "learning_rate": 6.965125158269619e-06, + "loss": 0.7179, + "step": 192 + }, + { + "epoch": 0.77, + "grad_norm": 0.26995073084719196, + "learning_rate": 6.935607606685642e-06, + "loss": 0.7624, + "step": 193 + }, + { + "epoch": 0.77, + 
"grad_norm": 0.25666755324587454, + "learning_rate": 6.906010489854209e-06, + "loss": 0.7426, + "step": 194 + }, + { + "epoch": 0.78, + "grad_norm": 0.2764461509009301, + "learning_rate": 6.876335024396872e-06, + "loss": 0.723, + "step": 195 + }, + { + "epoch": 0.78, + "grad_norm": 0.2597906002555833, + "learning_rate": 6.846582430155783e-06, + "loss": 0.7407, + "step": 196 + }, + { + "epoch": 0.79, + "grad_norm": 0.26409742487438864, + "learning_rate": 6.816753930143558e-06, + "loss": 0.7206, + "step": 197 + }, + { + "epoch": 0.79, + "grad_norm": 0.25320169233675405, + "learning_rate": 6.786850750493006e-06, + "loss": 0.7437, + "step": 198 + }, + { + "epoch": 0.79, + "grad_norm": 0.2708696048462205, + "learning_rate": 6.7568741204067145e-06, + "loss": 0.7422, + "step": 199 + }, + { + "epoch": 0.8, + "grad_norm": 0.26542323915181154, + "learning_rate": 6.726825272106539e-06, + "loss": 0.7514, + "step": 200 + }, + { + "epoch": 0.8, + "grad_norm": 0.26307166597433396, + "learning_rate": 6.696705440782939e-06, + "loss": 0.7509, + "step": 201 + }, + { + "epoch": 0.81, + "grad_norm": 0.26671446872754456, + "learning_rate": 6.66651586454421e-06, + "loss": 0.7465, + "step": 202 + }, + { + "epoch": 0.81, + "grad_norm": 0.2720083369272757, + "learning_rate": 6.636257784365585e-06, + "loss": 0.7349, + "step": 203 + }, + { + "epoch": 0.81, + "grad_norm": 0.2652218770116059, + "learning_rate": 6.605932444038229e-06, + "loss": 0.7348, + "step": 204 + }, + { + "epoch": 0.82, + "grad_norm": 0.26402314109149694, + "learning_rate": 6.575541090118105e-06, + "loss": 0.7495, + "step": 205 + }, + { + "epoch": 0.82, + "grad_norm": 0.2639803511821082, + "learning_rate": 6.545084971874738e-06, + "loss": 0.7138, + "step": 206 + }, + { + "epoch": 0.83, + "grad_norm": 0.2673567043268493, + "learning_rate": 6.514565341239861e-06, + "loss": 0.7341, + "step": 207 + }, + { + "epoch": 0.83, + "grad_norm": 74.0236556664021, + "learning_rate": 6.483983452755953e-06, + "loss": 1.084, + "step": 208 + 
}, + { + "epoch": 0.83, + "grad_norm": 0.2694930198888902, + "learning_rate": 6.4533405635246696e-06, + "loss": 0.7422, + "step": 209 + }, + { + "epoch": 0.84, + "grad_norm": 0.2808089325365656, + "learning_rate": 6.4226379331551625e-06, + "loss": 0.7543, + "step": 210 + }, + { + "epoch": 0.84, + "grad_norm": 0.24629087243802783, + "learning_rate": 6.3918768237123175e-06, + "loss": 0.7088, + "step": 211 + }, + { + "epoch": 0.85, + "grad_norm": 0.2605148206784209, + "learning_rate": 6.361058499664856e-06, + "loss": 0.7434, + "step": 212 + }, + { + "epoch": 0.85, + "grad_norm": 0.26199687024127344, + "learning_rate": 6.330184227833376e-06, + "loss": 0.7369, + "step": 213 + }, + { + "epoch": 0.85, + "grad_norm": 0.2693080928270376, + "learning_rate": 6.299255277338265e-06, + "loss": 0.7337, + "step": 214 + }, + { + "epoch": 0.86, + "grad_norm": 0.2573571779304293, + "learning_rate": 6.268272919547537e-06, + "loss": 0.7366, + "step": 215 + }, + { + "epoch": 0.86, + "grad_norm": 0.25347655388671, + "learning_rate": 6.237238428024573e-06, + "loss": 0.7392, + "step": 216 + }, + { + "epoch": 0.87, + "grad_norm": 0.254807709796356, + "learning_rate": 6.2061530784757625e-06, + "loss": 0.7709, + "step": 217 + }, + { + "epoch": 0.87, + "grad_norm": 0.25435065962054804, + "learning_rate": 6.175018148698077e-06, + "loss": 0.7472, + "step": 218 + }, + { + "epoch": 0.87, + "grad_norm": 0.25856868944475736, + "learning_rate": 6.143834918526528e-06, + "loss": 0.7442, + "step": 219 + }, + { + "epoch": 0.88, + "grad_norm": 0.24960062893507637, + "learning_rate": 6.112604669781572e-06, + "loss": 0.7163, + "step": 220 + }, + { + "epoch": 0.88, + "grad_norm": 0.2544024553733407, + "learning_rate": 6.0813286862164175e-06, + "loss": 0.7236, + "step": 221 + }, + { + "epoch": 0.89, + "grad_norm": 0.2532920039697931, + "learning_rate": 6.050008253464247e-06, + "loss": 0.7427, + "step": 222 + }, + { + "epoch": 0.89, + "grad_norm": 0.25372808971698796, + "learning_rate": 6.018644658985378e-06, 
+ "loss": 0.7286, + "step": 223 + }, + { + "epoch": 0.89, + "grad_norm": 0.2570514856547558, + "learning_rate": 5.987239192014336e-06, + "loss": 0.7349, + "step": 224 + }, + { + "epoch": 0.9, + "grad_norm": 0.2578576277551542, + "learning_rate": 5.955793143506863e-06, + "loss": 0.7266, + "step": 225 + }, + { + "epoch": 0.9, + "grad_norm": 0.26312215832145636, + "learning_rate": 5.9243078060868445e-06, + "loss": 0.7389, + "step": 226 + }, + { + "epoch": 0.91, + "grad_norm": 0.26518617358808877, + "learning_rate": 5.892784473993184e-06, + "loss": 0.7108, + "step": 227 + }, + { + "epoch": 0.91, + "grad_norm": 0.25620517627113376, + "learning_rate": 5.861224443026595e-06, + "loss": 0.7232, + "step": 228 + }, + { + "epoch": 0.91, + "grad_norm": 28.36402580586963, + "learning_rate": 5.82962901049634e-06, + "loss": 0.9734, + "step": 229 + }, + { + "epoch": 0.92, + "grad_norm": 0.2807037939514787, + "learning_rate": 5.797999475166897e-06, + "loss": 0.7341, + "step": 230 + }, + { + "epoch": 0.92, + "grad_norm": 8.208344028346868, + "learning_rate": 5.766337137204579e-06, + "loss": 0.938, + "step": 231 + }, + { + "epoch": 0.93, + "grad_norm": 0.26445130385050997, + "learning_rate": 5.734643298124091e-06, + "loss": 0.7316, + "step": 232 + }, + { + "epoch": 0.93, + "grad_norm": 0.251567451954335, + "learning_rate": 5.702919260735015e-06, + "loss": 0.6966, + "step": 233 + }, + { + "epoch": 0.93, + "grad_norm": 0.26329080787916564, + "learning_rate": 5.671166329088278e-06, + "loss": 0.7319, + "step": 234 + }, + { + "epoch": 0.94, + "grad_norm": 0.2566777339661679, + "learning_rate": 5.6393858084225305e-06, + "loss": 0.7529, + "step": 235 + }, + { + "epoch": 0.94, + "grad_norm": 0.2710815554700812, + "learning_rate": 5.6075790051105025e-06, + "loss": 0.7515, + "step": 236 + }, + { + "epoch": 0.95, + "grad_norm": 0.27096961550302734, + "learning_rate": 5.575747226605298e-06, + "loss": 0.7073, + "step": 237 + }, + { + "epoch": 0.95, + "grad_norm": 0.2509131795738037, + 
"learning_rate": 5.543891781386655e-06, + "loss": 0.7513, + "step": 238 + }, + { + "epoch": 0.95, + "grad_norm": 0.26210506205941153, + "learning_rate": 5.512013978907157e-06, + "loss": 0.7569, + "step": 239 + }, + { + "epoch": 0.96, + "grad_norm": 0.25123130642497177, + "learning_rate": 5.480115129538409e-06, + "loss": 0.7239, + "step": 240 + }, + { + "epoch": 0.96, + "grad_norm": 0.2596821607229612, + "learning_rate": 5.448196544517168e-06, + "loss": 0.7256, + "step": 241 + }, + { + "epoch": 0.97, + "grad_norm": 0.2714818563550966, + "learning_rate": 5.4162595358914475e-06, + "loss": 0.7329, + "step": 242 + }, + { + "epoch": 0.97, + "grad_norm": 0.260503064108439, + "learning_rate": 5.384305416466584e-06, + "loss": 0.7112, + "step": 243 + }, + { + "epoch": 0.97, + "grad_norm": 0.2661267608396215, + "learning_rate": 5.35233549975127e-06, + "loss": 0.7534, + "step": 244 + }, + { + "epoch": 0.98, + "grad_norm": 0.27502743208671454, + "learning_rate": 5.320351099903565e-06, + "loss": 0.7355, + "step": 245 + }, + { + "epoch": 0.98, + "grad_norm": 0.2598641343680277, + "learning_rate": 5.288353531676873e-06, + "loss": 0.7476, + "step": 246 + }, + { + "epoch": 0.99, + "grad_norm": 0.2629788348419056, + "learning_rate": 5.256344110365896e-06, + "loss": 0.7523, + "step": 247 + }, + { + "epoch": 0.99, + "grad_norm": 0.256123432185156, + "learning_rate": 5.224324151752575e-06, + "loss": 0.7479, + "step": 248 + }, + { + "epoch": 0.99, + "grad_norm": 0.2592695095071067, + "learning_rate": 5.192294972051992e-06, + "loss": 0.7586, + "step": 249 + }, + { + "epoch": 1.0, + "grad_norm": 0.26264999139615697, + "learning_rate": 5.160257887858278e-06, + "loss": 0.7406, + "step": 250 + }, + { + "epoch": 1.0, + "eval_loss": 0.7036678791046143, + "eval_runtime": 96.3087, + "eval_samples_per_second": 18.358, + "eval_steps_per_second": 0.384, + "step": 250 + }, + { + "epoch": 1.0, + "grad_norm": 0.2614921961545108, + "learning_rate": 5.128214216090478e-06, + "loss": 0.7488, + "step": 251 
+ }, + { + "epoch": 1.0, + "grad_norm": 0.2605743808221771, + "learning_rate": 5.0961652739384356e-06, + "loss": 0.7338, + "step": 252 + }, + { + "epoch": 1.01, + "grad_norm": 2.904417203670962, + "learning_rate": 5.064112378808636e-06, + "loss": 0.9738, + "step": 253 + }, + { + "epoch": 1.01, + "grad_norm": 0.2581985759494367, + "learning_rate": 5.032056848270056e-06, + "loss": 0.7693, + "step": 254 + }, + { + "epoch": 1.02, + "grad_norm": 0.25102446332314765, + "learning_rate": 5e-06, + "loss": 0.7213, + "step": 255 + }, + { + "epoch": 1.0, + "grad_norm": 1.4216598983588058, + "learning_rate": 4.967943151729945e-06, + "loss": 0.9193, + "step": 256 + }, + { + "epoch": 1.0, + "grad_norm": 0.32982276331099014, + "learning_rate": 4.935887621191364e-06, + "loss": 0.6842, + "step": 257 + }, + { + "epoch": 1.01, + "grad_norm": 0.29043411467478625, + "learning_rate": 4.903834726061565e-06, + "loss": 0.7087, + "step": 258 + }, + { + "epoch": 1.01, + "grad_norm": 0.25986254756592664, + "learning_rate": 4.871785783909523e-06, + "loss": 0.6741, + "step": 259 + }, + { + "epoch": 1.02, + "grad_norm": 0.30049816553828484, + "learning_rate": 4.839742112141725e-06, + "loss": 0.7063, + "step": 260 + }, + { + "epoch": 1.02, + "grad_norm": 0.2895999616622155, + "learning_rate": 4.807705027948008e-06, + "loss": 0.7146, + "step": 261 + }, + { + "epoch": 1.02, + "grad_norm": 0.30041272052643164, + "learning_rate": 4.775675848247427e-06, + "loss": 0.7134, + "step": 262 + }, + { + "epoch": 1.03, + "grad_norm": 0.27518299819790887, + "learning_rate": 4.743655889634105e-06, + "loss": 0.692, + "step": 263 + }, + { + "epoch": 1.03, + "grad_norm": 0.26955160521446175, + "learning_rate": 4.711646468323129e-06, + "loss": 0.658, + "step": 264 + }, + { + "epoch": 1.04, + "grad_norm": 0.27535739664976405, + "learning_rate": 4.679648900096436e-06, + "loss": 0.6908, + "step": 265 + }, + { + "epoch": 1.04, + "grad_norm": 0.26763089124769246, + "learning_rate": 4.64766450024873e-06, + "loss": 0.6861, 
+ "step": 266 + }, + { + "epoch": 1.04, + "grad_norm": 74.38254133611925, + "learning_rate": 4.615694583533418e-06, + "loss": 0.9994, + "step": 267 + }, + { + "epoch": 1.05, + "grad_norm": 0.2986551656205794, + "learning_rate": 4.583740464108554e-06, + "loss": 0.7075, + "step": 268 + }, + { + "epoch": 1.05, + "grad_norm": 0.27619714658975547, + "learning_rate": 4.551803455482833e-06, + "loss": 0.6596, + "step": 269 + }, + { + "epoch": 1.06, + "grad_norm": 0.25242413092583954, + "learning_rate": 4.5198848704615915e-06, + "loss": 0.6628, + "step": 270 + }, + { + "epoch": 1.06, + "grad_norm": 0.26017582720690735, + "learning_rate": 4.487986021092844e-06, + "loss": 0.6916, + "step": 271 + }, + { + "epoch": 1.06, + "grad_norm": 0.2719334401383232, + "learning_rate": 4.456108218613346e-06, + "loss": 0.6935, + "step": 272 + }, + { + "epoch": 1.07, + "grad_norm": 0.2874168693732095, + "learning_rate": 4.424252773394704e-06, + "loss": 0.7013, + "step": 273 + }, + { + "epoch": 1.07, + "grad_norm": 0.27088215577908004, + "learning_rate": 4.392420994889498e-06, + "loss": 0.693, + "step": 274 + }, + { + "epoch": 1.08, + "grad_norm": 0.2532812233498042, + "learning_rate": 4.3606141915774695e-06, + "loss": 0.6762, + "step": 275 + }, + { + "epoch": 1.08, + "grad_norm": 0.263089719520046, + "learning_rate": 4.3288336709117246e-06, + "loss": 0.6677, + "step": 276 + }, + { + "epoch": 1.08, + "grad_norm": 0.2544227492529696, + "learning_rate": 4.297080739264987e-06, + "loss": 0.6744, + "step": 277 + }, + { + "epoch": 1.09, + "grad_norm": 0.2645751047762965, + "learning_rate": 4.265356701875911e-06, + "loss": 0.7047, + "step": 278 + }, + { + "epoch": 1.09, + "grad_norm": 0.2602397068309733, + "learning_rate": 4.23366286279542e-06, + "loss": 0.6792, + "step": 279 + }, + { + "epoch": 1.1, + "grad_norm": 0.27012218470061766, + "learning_rate": 4.2020005248331056e-06, + "loss": 0.6914, + "step": 280 + }, + { + "epoch": 1.1, + "grad_norm": 0.2645729945558582, + "learning_rate": 
4.170370989503662e-06, + "loss": 0.6812, + "step": 281 + }, + { + "epoch": 1.1, + "grad_norm": 0.26158176244234604, + "learning_rate": 4.138775556973406e-06, + "loss": 0.6545, + "step": 282 + }, + { + "epoch": 1.11, + "grad_norm": 0.2609966416888788, + "learning_rate": 4.107215526006818e-06, + "loss": 0.6534, + "step": 283 + }, + { + "epoch": 1.11, + "grad_norm": 0.2633177456953443, + "learning_rate": 4.075692193913156e-06, + "loss": 0.6617, + "step": 284 + }, + { + "epoch": 1.12, + "grad_norm": 0.2711366514748812, + "learning_rate": 4.04420685649314e-06, + "loss": 0.7026, + "step": 285 + }, + { + "epoch": 1.12, + "grad_norm": 0.26016920693531187, + "learning_rate": 4.012760807985665e-06, + "loss": 0.685, + "step": 286 + }, + { + "epoch": 1.12, + "grad_norm": 0.2634077734164485, + "learning_rate": 3.9813553410146225e-06, + "loss": 0.6732, + "step": 287 + }, + { + "epoch": 1.13, + "grad_norm": 0.2630739058989318, + "learning_rate": 3.949991746535753e-06, + "loss": 0.6898, + "step": 288 + }, + { + "epoch": 1.13, + "grad_norm": 0.26810735877032293, + "learning_rate": 3.918671313783583e-06, + "loss": 0.6739, + "step": 289 + }, + { + "epoch": 1.14, + "grad_norm": 0.2667138269733132, + "learning_rate": 3.887395330218429e-06, + "loss": 0.6649, + "step": 290 + }, + { + "epoch": 1.14, + "grad_norm": 0.2563222658817468, + "learning_rate": 3.856165081473474e-06, + "loss": 0.708, + "step": 291 + }, + { + "epoch": 1.14, + "grad_norm": 0.26451201369218524, + "learning_rate": 3.824981851301924e-06, + "loss": 0.6715, + "step": 292 + }, + { + "epoch": 1.15, + "grad_norm": 0.2598494927634439, + "learning_rate": 3.7938469215242374e-06, + "loss": 0.6955, + "step": 293 + }, + { + "epoch": 1.15, + "grad_norm": 0.25396403307478727, + "learning_rate": 3.7627615719754294e-06, + "loss": 0.6676, + "step": 294 + }, + { + "epoch": 1.16, + "grad_norm": 0.2532765659210287, + "learning_rate": 3.731727080452464e-06, + "loss": 0.6748, + "step": 295 + }, + { + "epoch": 1.16, + "grad_norm": 
0.26061271523616963, + "learning_rate": 3.7007447226617367e-06, + "loss": 0.7058, + "step": 296 + }, + { + "epoch": 1.16, + "grad_norm": 0.25801226716086006, + "learning_rate": 3.669815772166625e-06, + "loss": 0.6717, + "step": 297 + }, + { + "epoch": 1.17, + "grad_norm": 0.2678578983084559, + "learning_rate": 3.638941500335145e-06, + "loss": 0.6785, + "step": 298 + }, + { + "epoch": 1.17, + "grad_norm": 0.2629273311111566, + "learning_rate": 3.608123176287685e-06, + "loss": 0.6846, + "step": 299 + }, + { + "epoch": 1.18, + "grad_norm": 0.26372287416738016, + "learning_rate": 3.5773620668448384e-06, + "loss": 0.7155, + "step": 300 + }, + { + "epoch": 1.18, + "grad_norm": 0.2705437923133382, + "learning_rate": 3.5466594364753325e-06, + "loss": 0.6723, + "step": 301 + }, + { + "epoch": 1.18, + "grad_norm": 0.2839291053250124, + "learning_rate": 3.516016547244047e-06, + "loss": 0.7035, + "step": 302 + }, + { + "epoch": 1.19, + "grad_norm": 0.2760313377640414, + "learning_rate": 3.48543465876014e-06, + "loss": 0.6751, + "step": 303 + }, + { + "epoch": 1.19, + "grad_norm": 0.25394626546919435, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.6795, + "step": 304 + }, + { + "epoch": 1.2, + "grad_norm": 0.2591221216055477, + "learning_rate": 3.424458909881897e-06, + "loss": 0.6766, + "step": 305 + }, + { + "epoch": 1.2, + "grad_norm": 0.2691782924782941, + "learning_rate": 3.3940675559617724e-06, + "loss": 0.6895, + "step": 306 + }, + { + "epoch": 1.2, + "grad_norm": 0.26815145183562566, + "learning_rate": 3.363742215634416e-06, + "loss": 0.6671, + "step": 307 + }, + { + "epoch": 1.21, + "grad_norm": 0.26601253229287064, + "learning_rate": 3.3334841354557923e-06, + "loss": 0.6902, + "step": 308 + }, + { + "epoch": 1.21, + "grad_norm": 0.2759140499002526, + "learning_rate": 3.303294559217063e-06, + "loss": 0.7011, + "step": 309 + }, + { + "epoch": 1.22, + "grad_norm": 0.2532152571509874, + "learning_rate": 3.273174727893463e-06, + "loss": 0.6631, + "step": 310 + }, + 
{ + "epoch": 1.22, + "grad_norm": 0.2587732106097895, + "learning_rate": 3.2431258795932863e-06, + "loss": 0.6964, + "step": 311 + }, + { + "epoch": 1.22, + "grad_norm": 0.26154429819114283, + "learning_rate": 3.213149249506997e-06, + "loss": 0.7018, + "step": 312 + }, + { + "epoch": 1.23, + "grad_norm": 0.2640699829932556, + "learning_rate": 3.183246069856443e-06, + "loss": 0.6809, + "step": 313 + }, + { + "epoch": 1.23, + "grad_norm": 0.26350081604751235, + "learning_rate": 3.1534175698442194e-06, + "loss": 0.655, + "step": 314 + }, + { + "epoch": 1.24, + "grad_norm": 0.2620810766791341, + "learning_rate": 3.12366497560313e-06, + "loss": 0.7034, + "step": 315 + }, + { + "epoch": 1.24, + "grad_norm": 0.26759545817238656, + "learning_rate": 3.093989510145792e-06, + "loss": 0.7238, + "step": 316 + }, + { + "epoch": 1.24, + "grad_norm": 0.26779990918516644, + "learning_rate": 3.0643923933143603e-06, + "loss": 0.6733, + "step": 317 + }, + { + "epoch": 1.25, + "grad_norm": 0.266460909749917, + "learning_rate": 3.0348748417303826e-06, + "loss": 0.6708, + "step": 318 + }, + { + "epoch": 1.25, + "grad_norm": 0.26133722668937404, + "learning_rate": 3.005438068744792e-06, + "loss": 0.6838, + "step": 319 + }, + { + "epoch": 1.26, + "grad_norm": 0.26893527319559857, + "learning_rate": 2.976083284388031e-06, + "loss": 0.662, + "step": 320 + }, + { + "epoch": 1.26, + "grad_norm": 0.2699706439018165, + "learning_rate": 2.9468116953203107e-06, + "loss": 0.6867, + "step": 321 + }, + { + "epoch": 1.26, + "grad_norm": 0.2660357193246072, + "learning_rate": 2.9176245047820064e-06, + "loss": 0.6802, + "step": 322 + }, + { + "epoch": 1.27, + "grad_norm": 0.2676372441332764, + "learning_rate": 2.8885229125442022e-06, + "loss": 0.7143, + "step": 323 + }, + { + "epoch": 1.27, + "grad_norm": 0.26172654998349437, + "learning_rate": 2.859508114859374e-06, + "loss": 0.6688, + "step": 324 + }, + { + "epoch": 1.28, + "grad_norm": 0.26622148926216194, + "learning_rate": 2.83058130441221e-06, + 
"loss": 0.671, + "step": 325 + }, + { + "epoch": 1.28, + "grad_norm": 0.28436198488003145, + "learning_rate": 2.80174367027059e-06, + "loss": 0.7157, + "step": 326 + }, + { + "epoch": 1.28, + "grad_norm": 0.2637348527869296, + "learning_rate": 2.772996397836704e-06, + "loss": 0.6893, + "step": 327 + }, + { + "epoch": 1.29, + "grad_norm": 0.27094660677035604, + "learning_rate": 2.7443406687983267e-06, + "loss": 0.7149, + "step": 328 + }, + { + "epoch": 1.29, + "grad_norm": 0.26943858696681655, + "learning_rate": 2.7157776610802416e-06, + "loss": 0.6756, + "step": 329 + }, + { + "epoch": 1.3, + "grad_norm": 0.2637431948145577, + "learning_rate": 2.687308548795825e-06, + "loss": 0.6731, + "step": 330 + }, + { + "epoch": 1.3, + "grad_norm": 0.2605764257900338, + "learning_rate": 2.6589345021987725e-06, + "loss": 0.6601, + "step": 331 + }, + { + "epoch": 1.3, + "grad_norm": 0.2691370320929689, + "learning_rate": 2.6306566876350072e-06, + "loss": 0.6747, + "step": 332 + }, + { + "epoch": 1.31, + "grad_norm": 0.26505353096492007, + "learning_rate": 2.6024762674947313e-06, + "loss": 0.6355, + "step": 333 + }, + { + "epoch": 1.31, + "grad_norm": 0.2771254154185314, + "learning_rate": 2.5743944001646394e-06, + "loss": 0.6679, + "step": 334 + }, + { + "epoch": 1.32, + "grad_norm": 0.2734747453519109, + "learning_rate": 2.5464122399803126e-06, + "loss": 0.6842, + "step": 335 + }, + { + "epoch": 1.32, + "grad_norm": 0.26644275829657593, + "learning_rate": 2.5185309371787515e-06, + "loss": 0.6986, + "step": 336 + }, + { + "epoch": 1.32, + "grad_norm": 2.921169787601039, + "learning_rate": 2.4907516378511137e-06, + "loss": 0.9339, + "step": 337 + }, + { + "epoch": 1.33, + "grad_norm": 0.25960894123414624, + "learning_rate": 2.46307548389559e-06, + "loss": 0.6743, + "step": 338 + }, + { + "epoch": 1.33, + "grad_norm": 0.2637156946948773, + "learning_rate": 2.43550361297047e-06, + "loss": 0.682, + "step": 339 + }, + { + "epoch": 1.34, + "grad_norm": 0.26481304373722364, + 
"learning_rate": 2.408037158447375e-06, + "loss": 0.6838, + "step": 340 + }, + { + "epoch": 1.34, + "grad_norm": 0.3098032445823631, + "learning_rate": 2.3806772493646725e-06, + "loss": 0.6569, + "step": 341 + }, + { + "epoch": 1.34, + "grad_norm": 0.26219269959571206, + "learning_rate": 2.353425010381063e-06, + "loss": 0.6761, + "step": 342 + }, + { + "epoch": 1.35, + "grad_norm": 0.2595963563694489, + "learning_rate": 2.3262815617293517e-06, + "loss": 0.6705, + "step": 343 + }, + { + "epoch": 1.35, + "grad_norm": 6.834107729255299, + "learning_rate": 2.2992480191704003e-06, + "loss": 0.9304, + "step": 344 + }, + { + "epoch": 1.36, + "grad_norm": 0.27845487671808766, + "learning_rate": 2.272325493947257e-06, + "loss": 0.7032, + "step": 345 + }, + { + "epoch": 1.36, + "grad_norm": 0.27640918477115356, + "learning_rate": 2.245515092739488e-06, + "loss": 0.6782, + "step": 346 + }, + { + "epoch": 1.36, + "grad_norm": 0.2807587758407648, + "learning_rate": 2.2188179176176767e-06, + "loss": 0.6932, + "step": 347 + }, + { + "epoch": 1.37, + "grad_norm": 0.27767259525555504, + "learning_rate": 2.1922350659981262e-06, + "loss": 0.6466, + "step": 348 + }, + { + "epoch": 1.37, + "grad_norm": 0.267658673895331, + "learning_rate": 2.165767630597752e-06, + "loss": 0.7089, + "step": 349 + }, + { + "epoch": 1.38, + "grad_norm": 0.2745228363539692, + "learning_rate": 2.139416699389153e-06, + "loss": 0.6778, + "step": 350 + }, + { + "epoch": 1.38, + "grad_norm": 0.26408657536920843, + "learning_rate": 2.1131833555559037e-06, + "loss": 0.693, + "step": 351 + }, + { + "epoch": 1.38, + "grad_norm": 0.27582062474841573, + "learning_rate": 2.08706867744802e-06, + "loss": 0.6896, + "step": 352 + }, + { + "epoch": 1.39, + "grad_norm": 0.26826511006390147, + "learning_rate": 2.061073738537635e-06, + "loss": 0.6796, + "step": 353 + }, + { + "epoch": 1.39, + "grad_norm": 0.26630216372896987, + "learning_rate": 2.0351996073748713e-06, + "loss": 0.664, + "step": 354 + }, + { + "epoch": 1.4, + 
"grad_norm": 0.2809694093215496, + "learning_rate": 2.00944734754392e-06, + "loss": 0.7044, + "step": 355 + }, + { + "epoch": 1.4, + "grad_norm": 1.2984772108139377, + "learning_rate": 1.983818017619318e-06, + "loss": 0.9348, + "step": 356 + }, + { + "epoch": 1.4, + "grad_norm": 0.2721943586746493, + "learning_rate": 1.9583126711224342e-06, + "loss": 0.6918, + "step": 357 + }, + { + "epoch": 1.41, + "grad_norm": 0.27480703238103743, + "learning_rate": 1.932932356478168e-06, + "loss": 0.6854, + "step": 358 + }, + { + "epoch": 1.41, + "grad_norm": 0.27867368393846137, + "learning_rate": 1.9076781169718426e-06, + "loss": 0.6892, + "step": 359 + }, + { + "epoch": 1.42, + "grad_norm": 0.2747868621778029, + "learning_rate": 1.8825509907063328e-06, + "loss": 0.6947, + "step": 360 + }, + { + "epoch": 1.42, + "grad_norm": 0.2819831023326237, + "learning_rate": 1.857552010559382e-06, + "loss": 0.7059, + "step": 361 + }, + { + "epoch": 1.42, + "grad_norm": 0.27392909848006375, + "learning_rate": 1.8326822041411524e-06, + "loss": 0.6909, + "step": 362 + }, + { + "epoch": 1.43, + "grad_norm": 0.2743865258535757, + "learning_rate": 1.8079425937519729e-06, + "loss": 0.679, + "step": 363 + }, + { + "epoch": 1.43, + "grad_norm": 2.5387012983068686, + "learning_rate": 1.7833341963403312e-06, + "loss": 0.8855, + "step": 364 + }, + { + "epoch": 1.44, + "grad_norm": 0.265533513891752, + "learning_rate": 1.7588580234610592e-06, + "loss": 0.6915, + "step": 365 + }, + { + "epoch": 1.44, + "grad_norm": 0.27359948413194535, + "learning_rate": 1.7345150812337564e-06, + "loss": 0.6983, + "step": 366 + }, + { + "epoch": 1.44, + "grad_norm": 0.2742529435490673, + "learning_rate": 1.7103063703014372e-06, + "loss": 0.6712, + "step": 367 + }, + { + "epoch": 1.45, + "grad_norm": 0.27199602044830407, + "learning_rate": 1.6862328857893856e-06, + "loss": 0.6929, + "step": 368 + }, + { + "epoch": 1.45, + "grad_norm": 0.26752081718353027, + "learning_rate": 1.6622956172642601e-06, + "loss": 0.6693, + 
"step": 369 + }, + { + "epoch": 1.46, + "grad_norm": 0.2729904749450422, + "learning_rate": 1.6384955486934157e-06, + "loss": 0.6545, + "step": 370 + }, + { + "epoch": 1.46, + "grad_norm": 0.27686659493721505, + "learning_rate": 1.6148336584044539e-06, + "loss": 0.6957, + "step": 371 + }, + { + "epoch": 1.46, + "grad_norm": 0.27203211099316454, + "learning_rate": 1.5913109190450033e-06, + "loss": 0.6709, + "step": 372 + }, + { + "epoch": 1.47, + "grad_norm": 0.2748872025880921, + "learning_rate": 1.567928297542749e-06, + "loss": 0.6648, + "step": 373 + }, + { + "epoch": 1.47, + "grad_norm": 0.28544562477258, + "learning_rate": 1.544686755065677e-06, + "loss": 0.6937, + "step": 374 + }, + { + "epoch": 1.48, + "grad_norm": 0.27110159404128226, + "learning_rate": 1.5215872469825682e-06, + "loss": 0.6593, + "step": 375 + }, + { + "epoch": 1.48, + "eval_loss": 0.6996302008628845, + "eval_runtime": 96.9399, + "eval_samples_per_second": 18.238, + "eval_steps_per_second": 0.382, + "step": 375 + }, + { + "epoch": 1.48, + "grad_norm": 0.28648432605335455, + "learning_rate": 1.4986307228237268e-06, + "loss": 0.6883, + "step": 376 + }, + { + "epoch": 1.48, + "grad_norm": 0.2695264027977092, + "learning_rate": 1.4758181262419425e-06, + "loss": 0.6696, + "step": 377 + }, + { + "epoch": 1.49, + "grad_norm": 0.2786040566891135, + "learning_rate": 1.4531503949737107e-06, + "loss": 0.6768, + "step": 378 + }, + { + "epoch": 1.49, + "grad_norm": 0.2730138945863401, + "learning_rate": 1.4306284608006837e-06, + "loss": 0.699, + "step": 379 + }, + { + "epoch": 1.5, + "grad_norm": 0.28711818986138005, + "learning_rate": 1.4082532495113627e-06, + "loss": 0.6961, + "step": 380 + }, + { + "epoch": 1.5, + "grad_norm": 0.27838100192134935, + "learning_rate": 1.3860256808630429e-06, + "loss": 0.6589, + "step": 381 + }, + { + "epoch": 1.5, + "grad_norm": 0.2798399005913698, + "learning_rate": 1.3639466685440133e-06, + "loss": 0.6924, + "step": 382 + }, + { + "epoch": 1.51, + "grad_norm": 
0.2801056534585314, + "learning_rate": 1.3420171201359933e-06, + "loss": 0.7047, + "step": 383 + }, + { + "epoch": 1.51, + "grad_norm": 0.28576658015544487, + "learning_rate": 1.3202379370768254e-06, + "loss": 0.6617, + "step": 384 + }, + { + "epoch": 1.52, + "grad_norm": 1.3100321738765892, + "learning_rate": 1.298610014623423e-06, + "loss": 0.9236, + "step": 385 + }, + { + "epoch": 1.52, + "grad_norm": 0.303191205875695, + "learning_rate": 1.2771342418149658e-06, + "loss": 0.6896, + "step": 386 + }, + { + "epoch": 1.52, + "grad_norm": 0.29469211064765755, + "learning_rate": 1.2558115014363592e-06, + "loss": 0.6804, + "step": 387 + }, + { + "epoch": 1.53, + "grad_norm": 0.2772656639598833, + "learning_rate": 1.234642669981946e-06, + "loss": 0.7043, + "step": 388 + }, + { + "epoch": 1.53, + "grad_norm": 0.28874341170670975, + "learning_rate": 1.2136286176194744e-06, + "loss": 0.6839, + "step": 389 + }, + { + "epoch": 1.54, + "grad_norm": 0.29443238526351323, + "learning_rate": 1.1927702081543279e-06, + "loss": 0.6852, + "step": 390 + }, + { + "epoch": 1.54, + "grad_norm": 0.28011985365824127, + "learning_rate": 1.1720682989940264e-06, + "loss": 0.7019, + "step": 391 + }, + { + "epoch": 1.54, + "grad_norm": 0.2987249208096274, + "learning_rate": 1.1515237411129698e-06, + "loss": 0.6625, + "step": 392 + }, + { + "epoch": 1.55, + "grad_norm": 0.30150125882304396, + "learning_rate": 1.1311373790174656e-06, + "loss": 0.7102, + "step": 393 + }, + { + "epoch": 1.55, + "grad_norm": 0.28396138619493894, + "learning_rate": 1.1109100507110133e-06, + "loss": 0.6538, + "step": 394 + }, + { + "epoch": 1.56, + "grad_norm": 0.28445333602874173, + "learning_rate": 1.0908425876598512e-06, + "loss": 0.6719, + "step": 395 + }, + { + "epoch": 1.56, + "grad_norm": 0.2914051878027514, + "learning_rate": 1.0709358147587883e-06, + "loss": 0.6803, + "step": 396 + }, + { + "epoch": 1.56, + "grad_norm": 0.2969873740157493, + "learning_rate": 1.0511905502972885e-06, + "loss": 0.6845, + "step": 
397 + }, + { + "epoch": 1.57, + "grad_norm": 0.27955221055069657, + "learning_rate": 1.031607605925839e-06, + "loss": 0.6819, + "step": 398 + }, + { + "epoch": 1.57, + "grad_norm": 0.2840904640426781, + "learning_rate": 1.0121877866225783e-06, + "loss": 0.6685, + "step": 399 + }, + { + "epoch": 1.58, + "grad_norm": 0.2866769315662431, + "learning_rate": 9.929318906602176e-07, + "loss": 0.7126, + "step": 400 + }, + { + "epoch": 1.58, + "grad_norm": 0.28635331619928306, + "learning_rate": 9.738407095732195e-07, + "loss": 0.6825, + "step": 401 + }, + { + "epoch": 1.58, + "grad_norm": 0.29612030431665504, + "learning_rate": 9.549150281252633e-07, + "loss": 0.6889, + "step": 402 + }, + { + "epoch": 1.59, + "grad_norm": 0.2800350134356635, + "learning_rate": 9.361556242769871e-07, + "loss": 0.6902, + "step": 403 + }, + { + "epoch": 1.59, + "grad_norm": 0.303825841465723, + "learning_rate": 9.175632691540065e-07, + "loss": 0.6949, + "step": 404 + }, + { + "epoch": 1.6, + "grad_norm": 0.2869303466691903, + "learning_rate": 8.991387270152202e-07, + "loss": 0.6953, + "step": 405 + }, + { + "epoch": 1.6, + "grad_norm": 0.2822567891211309, + "learning_rate": 8.808827552213917e-07, + "loss": 0.6733, + "step": 406 + }, + { + "epoch": 1.6, + "grad_norm": 0.29613652418881947, + "learning_rate": 8.627961042040183e-07, + "loss": 0.6721, + "step": 407 + }, + { + "epoch": 1.61, + "grad_norm": 0.2904383828418472, + "learning_rate": 8.448795174344803e-07, + "loss": 0.6849, + "step": 408 + }, + { + "epoch": 1.61, + "grad_norm": 0.28422518396116003, + "learning_rate": 8.271337313934869e-07, + "loss": 0.676, + "step": 409 + }, + { + "epoch": 1.62, + "grad_norm": 0.2998884374161429, + "learning_rate": 8.095594755407971e-07, + "loss": 0.72, + "step": 410 + }, + { + "epoch": 1.62, + "grad_norm": 0.2877123964038153, + "learning_rate": 7.921574722852343e-07, + "loss": 0.686, + "step": 411 + }, + { + "epoch": 1.62, + "grad_norm": 0.2848351263069502, + "learning_rate": 7.749284369549954e-07, + 
"loss": 0.6755, + "step": 412 + }, + { + "epoch": 1.63, + "grad_norm": 3.762805837262192, + "learning_rate": 7.578730777682386e-07, + "loss": 0.9037, + "step": 413 + }, + { + "epoch": 1.63, + "grad_norm": 0.2782769592021476, + "learning_rate": 7.409920958039795e-07, + "loss": 0.6686, + "step": 414 + }, + { + "epoch": 1.64, + "grad_norm": 0.28369386272785363, + "learning_rate": 7.242861849732696e-07, + "loss": 0.6772, + "step": 415 + }, + { + "epoch": 1.64, + "grad_norm": 0.28870985589458403, + "learning_rate": 7.077560319906696e-07, + "loss": 0.6665, + "step": 416 + }, + { + "epoch": 1.64, + "grad_norm": 0.2880267458624612, + "learning_rate": 6.914023163460248e-07, + "loss": 0.6767, + "step": 417 + }, + { + "epoch": 1.65, + "grad_norm": 0.2879073116640725, + "learning_rate": 6.752257102765325e-07, + "loss": 0.6733, + "step": 418 + }, + { + "epoch": 1.65, + "grad_norm": 0.2978223401759706, + "learning_rate": 6.592268787391077e-07, + "loss": 0.707, + "step": 419 + }, + { + "epoch": 1.66, + "grad_norm": 0.2781074725093229, + "learning_rate": 6.43406479383053e-07, + "loss": 0.6962, + "step": 420 + }, + { + "epoch": 1.66, + "grad_norm": 0.29577562012306474, + "learning_rate": 6.277651625230219e-07, + "loss": 0.6772, + "step": 421 + }, + { + "epoch": 1.66, + "grad_norm": 0.2848699679509908, + "learning_rate": 6.12303571112286e-07, + "loss": 0.7008, + "step": 422 + }, + { + "epoch": 1.67, + "grad_norm": 0.2728708533046375, + "learning_rate": 5.9702234071631e-07, + "loss": 0.6994, + "step": 423 + }, + { + "epoch": 1.67, + "grad_norm": 0.2971147397482144, + "learning_rate": 5.819220994866237e-07, + "loss": 0.6784, + "step": 424 + }, + { + "epoch": 1.67, + "grad_norm": 0.2918077307247773, + "learning_rate": 5.670034681349995e-07, + "loss": 0.6798, + "step": 425 + }, + { + "epoch": 1.68, + "grad_norm": 0.2766527969263755, + "learning_rate": 5.522670599079416e-07, + "loss": 0.692, + "step": 426 + }, + { + "epoch": 1.68, + "grad_norm": 0.2896023594106076, + "learning_rate": 
5.377134805614714e-07, + "loss": 0.6885, + "step": 427 + }, + { + "epoch": 1.69, + "grad_norm": 0.29226174571780184, + "learning_rate": 5.233433283362349e-07, + "loss": 0.6609, + "step": 428 + }, + { + "epoch": 1.69, + "grad_norm": 0.30313380575685006, + "learning_rate": 5.091571939329049e-07, + "loss": 0.6559, + "step": 429 + }, + { + "epoch": 1.69, + "grad_norm": 0.2851579977806079, + "learning_rate": 4.951556604879049e-07, + "loss": 0.6862, + "step": 430 + }, + { + "epoch": 1.7, + "grad_norm": 0.28828722620682673, + "learning_rate": 4.813393035494329e-07, + "loss": 0.673, + "step": 431 + }, + { + "epoch": 1.7, + "grad_norm": 0.283083619090625, + "learning_rate": 4.677086910538092e-07, + "loss": 0.6477, + "step": 432 + }, + { + "epoch": 1.71, + "grad_norm": 0.29809618255032516, + "learning_rate": 4.542643833021254e-07, + "loss": 0.7054, + "step": 433 + }, + { + "epoch": 1.71, + "grad_norm": 0.3086409841744957, + "learning_rate": 4.410069329372152e-07, + "loss": 0.6763, + "step": 434 + }, + { + "epoch": 1.71, + "grad_norm": 0.3016981542599131, + "learning_rate": 4.279368849209381e-07, + "loss": 0.6843, + "step": 435 + }, + { + "epoch": 1.72, + "grad_norm": 0.27698126686942587, + "learning_rate": 4.150547765117746e-07, + "loss": 0.6891, + "step": 436 + }, + { + "epoch": 1.72, + "grad_norm": 0.2793586730481018, + "learning_rate": 4.0236113724274716e-07, + "loss": 0.6796, + "step": 437 + }, + { + "epoch": 1.73, + "grad_norm": 0.28666974827878056, + "learning_rate": 3.8985648889964755e-07, + "loss": 0.6648, + "step": 438 + }, + { + "epoch": 1.73, + "grad_norm": 0.28527293041641727, + "learning_rate": 3.77541345499593e-07, + "loss": 0.7071, + "step": 439 + }, + { + "epoch": 1.73, + "grad_norm": 0.29843069047155474, + "learning_rate": 3.6541621326989183e-07, + "loss": 0.6803, + "step": 440 + }, + { + "epoch": 1.74, + "grad_norm": 0.3001228333782996, + "learning_rate": 3.534815906272404e-07, + "loss": 0.7176, + "step": 441 + }, + { + "epoch": 1.74, + "grad_norm": 
0.2928174202718284, + "learning_rate": 3.417379681572297e-07, + "loss": 0.6747, + "step": 442 + }, + { + "epoch": 1.75, + "grad_norm": 0.2951128501223636, + "learning_rate": 3.301858285941845e-07, + "loss": 0.7046, + "step": 443 + }, + { + "epoch": 1.75, + "grad_norm": 0.2931876464433515, + "learning_rate": 3.18825646801314e-07, + "loss": 0.6734, + "step": 444 + }, + { + "epoch": 1.75, + "grad_norm": 0.29349560436630445, + "learning_rate": 3.076578897511978e-07, + "loss": 0.6852, + "step": 445 + }, + { + "epoch": 1.76, + "grad_norm": 0.7457047514934827, + "learning_rate": 2.966830165065876e-07, + "loss": 0.9017, + "step": 446 + }, + { + "epoch": 1.76, + "grad_norm": 0.2813462622367945, + "learning_rate": 2.8590147820153513e-07, + "loss": 0.6969, + "step": 447 + }, + { + "epoch": 1.77, + "grad_norm": 0.30904433658187347, + "learning_rate": 2.7531371802285436e-07, + "loss": 0.6829, + "step": 448 + }, + { + "epoch": 1.77, + "grad_norm": 0.28837856935691286, + "learning_rate": 2.6492017119189415e-07, + "loss": 0.6527, + "step": 449 + }, + { + "epoch": 1.77, + "grad_norm": 0.2940858357017207, + "learning_rate": 2.547212649466568e-07, + "loss": 0.6696, + "step": 450 + }, + { + "epoch": 1.78, + "grad_norm": 0.27956534271888384, + "learning_rate": 2.447174185242324e-07, + "loss": 0.7048, + "step": 451 + }, + { + "epoch": 1.78, + "grad_norm": 0.28259810422391984, + "learning_rate": 2.3490904314356412e-07, + "loss": 0.6772, + "step": 452 + }, + { + "epoch": 1.79, + "grad_norm": 0.2846763408984384, + "learning_rate": 2.2529654198854834e-07, + "loss": 0.7507, + "step": 453 + }, + { + "epoch": 1.79, + "grad_norm": 0.2784656918785677, + "learning_rate": 2.1588031019145638e-07, + "loss": 0.7072, + "step": 454 + }, + { + "epoch": 1.79, + "grad_norm": 0.28593045031546543, + "learning_rate": 2.0666073481669714e-07, + "loss": 0.6944, + "step": 455 + }, + { + "epoch": 1.8, + "grad_norm": 0.296176705173677, + "learning_rate": 1.9763819484490353e-07, + "loss": 0.6691, + "step": 456 + }, 
+ { + "epoch": 1.8, + "grad_norm": 0.2923471158891105, + "learning_rate": 1.8881306115735632e-07, + "loss": 0.705, + "step": 457 + }, + { + "epoch": 1.81, + "grad_norm": 0.2769134760770555, + "learning_rate": 1.801856965207338e-07, + "loss": 0.6845, + "step": 458 + }, + { + "epoch": 1.81, + "grad_norm": 0.30153347516541434, + "learning_rate": 1.7175645557220567e-07, + "loss": 0.6935, + "step": 459 + }, + { + "epoch": 1.81, + "grad_norm": 0.29938078931561396, + "learning_rate": 1.6352568480485277e-07, + "loss": 0.6822, + "step": 460 + }, + { + "epoch": 1.82, + "grad_norm": 0.2909845977727433, + "learning_rate": 1.5549372255342367e-07, + "loss": 0.6959, + "step": 461 + }, + { + "epoch": 1.82, + "grad_norm": 0.2851155920589762, + "learning_rate": 1.4766089898042678e-07, + "loss": 0.6909, + "step": 462 + }, + { + "epoch": 1.83, + "grad_norm": 3.590844633307425, + "learning_rate": 1.4002753606256082e-07, + "loss": 0.9279, + "step": 463 + }, + { + "epoch": 1.83, + "grad_norm": 0.289769942223222, + "learning_rate": 1.3259394757747678e-07, + "loss": 0.6664, + "step": 464 + }, + { + "epoch": 1.83, + "grad_norm": 1.4756345980091514, + "learning_rate": 1.253604390908819e-07, + "loss": 0.9066, + "step": 465 + }, + { + "epoch": 1.84, + "grad_norm": 0.2905542054012534, + "learning_rate": 1.1832730794397951e-07, + "loss": 0.6989, + "step": 466 + }, + { + "epoch": 1.84, + "grad_norm": 0.3056790622208962, + "learning_rate": 1.1149484324124326e-07, + "loss": 0.64, + "step": 467 + }, + { + "epoch": 1.85, + "grad_norm": 0.2915224050071343, + "learning_rate": 1.0486332583853565e-07, + "loss": 0.6411, + "step": 468 + }, + { + "epoch": 1.85, + "grad_norm": 0.2947477867782055, + "learning_rate": 9.843302833156377e-08, + "loss": 0.6901, + "step": 469 + }, + { + "epoch": 1.85, + "grad_norm": 0.6968434274277037, + "learning_rate": 9.22042150446728e-08, + "loss": 0.9234, + "step": 470 + }, + { + "epoch": 1.86, + "grad_norm": 0.28365980605268926, + "learning_rate": 8.617714201998084e-08, + 
"loss": 0.6871, + "step": 471 + }, + { + "epoch": 1.86, + "grad_norm": 0.29456041125148436, + "learning_rate": 8.035205700685167e-08, + "loss": 0.6841, + "step": 472 + }, + { + "epoch": 1.87, + "grad_norm": 0.3143240746562965, + "learning_rate": 7.47291994517163e-08, + "loss": 0.6793, + "step": 473 + }, + { + "epoch": 1.87, + "grad_norm": 0.3002190997085016, + "learning_rate": 6.930880048822531e-08, + "loss": 0.6909, + "step": 474 + }, + { + "epoch": 1.87, + "grad_norm": 0.28198233683666907, + "learning_rate": 6.409108292774912e-08, + "loss": 0.6677, + "step": 475 + }, + { + "epoch": 1.88, + "grad_norm": 0.2974351663902672, + "learning_rate": 5.907626125022159e-08, + "loss": 0.6863, + "step": 476 + }, + { + "epoch": 1.88, + "grad_norm": 0.3167342201027022, + "learning_rate": 5.426454159531913e-08, + "loss": 0.6728, + "step": 477 + }, + { + "epoch": 1.89, + "grad_norm": 0.2838121481556639, + "learning_rate": 4.9656121753990924e-08, + "loss": 0.6765, + "step": 478 + }, + { + "epoch": 1.89, + "grad_norm": 0.28117138518414553, + "learning_rate": 4.52511911603265e-08, + "loss": 0.6827, + "step": 479 + }, + { + "epoch": 1.89, + "grad_norm": 0.29399576903346636, + "learning_rate": 4.104993088376974e-08, + "loss": 0.6933, + "step": 480 + }, + { + "epoch": 1.9, + "grad_norm": 0.29636385882946953, + "learning_rate": 3.705251362167484e-08, + "loss": 0.6641, + "step": 481 + }, + { + "epoch": 1.9, + "grad_norm": 0.2913085312105263, + "learning_rate": 3.325910369220975e-08, + "loss": 0.6973, + "step": 482 + }, + { + "epoch": 1.91, + "grad_norm": 0.29080057862930364, + "learning_rate": 2.966985702759828e-08, + "loss": 0.6678, + "step": 483 + }, + { + "epoch": 1.91, + "grad_norm": 0.2952306166117519, + "learning_rate": 2.6284921167712975e-08, + "loss": 0.7017, + "step": 484 + }, + { + "epoch": 1.91, + "grad_norm": 0.2959396595629797, + "learning_rate": 2.3104435254008852e-08, + "loss": 0.6569, + "step": 485 + }, + { + "epoch": 1.92, + "grad_norm": 0.28791733733513286, + 
"learning_rate": 2.012853002380466e-08, + "loss": 0.6573, + "step": 486 + }, + { + "epoch": 1.92, + "grad_norm": 0.2923526176448832, + "learning_rate": 1.735732780490884e-08, + "loss": 0.6903, + "step": 487 + }, + { + "epoch": 1.93, + "grad_norm": 0.2908268098513957, + "learning_rate": 1.4790942510590767e-08, + "loss": 0.6756, + "step": 488 + }, + { + "epoch": 1.93, + "grad_norm": 0.29069043095745606, + "learning_rate": 1.2429479634897268e-08, + "loss": 0.6722, + "step": 489 + }, + { + "epoch": 1.93, + "grad_norm": 0.28912174671892155, + "learning_rate": 1.0273036248318325e-08, + "loss": 0.6927, + "step": 490 + }, + { + "epoch": 1.94, + "grad_norm": 0.29773909318477504, + "learning_rate": 8.321700993795812e-09, + "loss": 0.6703, + "step": 491 + }, + { + "epoch": 1.94, + "grad_norm": 0.2846360921300275, + "learning_rate": 6.575554083078084e-09, + "loss": 0.6915, + "step": 492 + }, + { + "epoch": 1.95, + "grad_norm": 0.3040183654289367, + "learning_rate": 5.034667293427053e-09, + "loss": 0.6836, + "step": 493 + }, + { + "epoch": 1.95, + "grad_norm": 0.29012455377167307, + "learning_rate": 3.6991039646616657e-09, + "loss": 0.6844, + "step": 494 + }, + { + "epoch": 1.95, + "grad_norm": 0.2778518633390048, + "learning_rate": 2.568918996560532e-09, + "loss": 0.6779, + "step": 495 + }, + { + "epoch": 1.96, + "grad_norm": 0.29541155663074187, + "learning_rate": 1.6441588466009627e-09, + "loss": 0.6979, + "step": 496 + }, + { + "epoch": 1.96, + "grad_norm": 0.28364315270086676, + "learning_rate": 9.248615280499362e-10, + "loss": 0.6792, + "step": 497 + }, + { + "epoch": 1.97, + "grad_norm": 0.2865142006406564, + "learning_rate": 4.1105660840368154e-10, + "loss": 0.7034, + "step": 498 + }, + { + "epoch": 1.97, + "grad_norm": 0.2854424675916699, + "learning_rate": 1.0276520816976388e-10, + "loss": 0.6747, + "step": 499 + }, + { + "epoch": 1.97, + "grad_norm": 0.29666466368391803, + "learning_rate": 0.0, + "loss": 0.6754, + "step": 500 + }, + { + "epoch": 1.97, + "eval_loss": 
0.6983408331871033, + "eval_runtime": 93.907, + "eval_samples_per_second": 18.827, + "eval_steps_per_second": 0.394, + "step": 500 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 250, + "total_flos": 1571976955035648.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..f514ebe --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65ef0124c2666d0d78b18fb6fcae7801286925d17b161abe928ae1b45915ae68 +size 7736 diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..6c49fc6 --- /dev/null +++ b/vocab.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910 +size 2776833