commit 9645be15252529e5801604525ee76be2b2b41d64 Author: ModelHub XC Date: Mon Jun 22 14:18:13 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: llm-jp/llm-jp-13b-instruct-full-dolly-oasst-v1.0 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..ebb8c68 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,52 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text + + +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text + +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text 
+*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +pytorch_model-00002-of-00003.bin filter=lfs diff=lfs merge=lfs -text +pytorch_model-00001-of-00003.bin filter=lfs diff=lfs merge=lfs -text +pytorch_model-00003-of-00003.bin filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..53b37c8 --- /dev/null +++ b/README.md @@ -0,0 +1,156 @@ +--- +license: apache-2.0 +language: + - en + - ja +programming_language: + - C + - C++ + - C# + - Go + - Java + - JavaScript + - Lua + - PHP + - Python + - Ruby + - Rust + - Scala + - TypeScript +library_name: transformers +pipeline_tag: text-generation +inference: false +--- +# llm-jp-13b-instruct-full-dolly-oasst-v1.0 + +This repository provides large language models developed by [LLM-jp](https://llm-jp.nii.ac.jp/), a collaborative project launched in Japan. 
+ +| Model Variant | +| :--- | +|**Instruction models**| +| [llm-jp-13b-instruct-full-jaster-v1.0](https://huggingface.co/llm-jp/llm-jp-13b-instruct-full-jaster-v1.0) | +| [llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0](https://huggingface.co/llm-jp/llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0) | +| [llm-jp-13b-instruct-full-dolly-oasst-v1.0](https://huggingface.co/llm-jp/llm-jp-13b-instruct-full-dolly-oasst-v1.0) | +| [llm-jp-13b-instruct-lora-jaster-v1.0](https://huggingface.co/llm-jp/llm-jp-13b-instruct-lora-jaster-v1.0) | +| [llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0](https://huggingface.co/llm-jp/llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0) | +| [llm-jp-13b-instruct-lora-dolly-oasst-v1.0](https://huggingface.co/llm-jp/llm-jp-13b-instruct-lora-dolly-oasst-v1.0) | + + +| | +| :--- | +|**Pre-trained models**| +| [llm-jp-13b-v1.0](https://huggingface.co/llm-jp/llm-jp-13b-v1.0) | +| [llm-jp-1.3b-v1.0](https://huggingface.co/llm-jp/llm-jp-1.3b-v1.0) | +Checkpoints format: Hugging Face Transformers (Megatron-DeepSpeed format models are available [here](https://huggingface.co/llm-jp/llm-jp-13b-v1.0-mdsfmt)) + + +## Required Libraries and Their Versions + +- torch>=2.0.0 +- transformers>=4.34.0 +- tokenizers>=0.14.0 +- accelerate==0.23.0 + +## Usage + +```python +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM +tokenizer = AutoTokenizer.from_pretrained("llm-jp/llm-jp-13b-instruct-full-dolly-oasst-v1.0") +model = AutoModelForCausalLM.from_pretrained("llm-jp/llm-jp-13b-instruct-full-dolly-oasst-v1.0", device_map="auto", torch_dtype=torch.float16) +text = "自然言語処理とは何か" +text = text + "### 回答:" +tokenized_input = tokenizer.encode(text, add_special_tokens=False, return_tensors="pt").to(model.device) +with torch.no_grad(): + output = model.generate( + tokenized_input, + max_new_tokens=100, + do_sample=True, + top_p=0.95, + temperature=0.7, + )[0] +print(tokenizer.decode(output)) +``` + + +## Model Details + +- **Model type:** 
Transformer-based Language Model +- **Total seen tokens:** 300B + +|Model|Params|Layers|Hidden size|Heads|Context length| +|:---:|:---:|:---:|:---:|:---:|:---:| +|13b model|13b|40|5120|40|2048| +|1.3b model|1.3b|24|2048|16|2048| + + +## Training + +- **Pre-training:** + - **Hardware:** 96 A100 40GB GPUs ([mdx cluster](https://mdx.jp/en/)) + - **Software:** Megatron-DeepSpeed + +- **Instruction tuning:** + - **Hardware:** 8 A100 40GB GPUs ([mdx cluster](https://mdx.jp/en/)) + - **Software:** [TRL](https://github.com/huggingface/trl), [PEFT](https://github.com/huggingface/peft), and [DeepSpeed](https://github.com/microsoft/DeepSpeed) + +## Tokenizer +The tokenizer of this model is based on [huggingface/tokenizers](https://github.com/huggingface/tokenizers) Unigram byte-fallback model. +The vocabulary entries were converted from [`llm-jp-tokenizer v2.1 (50k)`](https://github.com/llm-jp/llm-jp-tokenizer/releases/tag/v2.1). +Please refer to [README.md](https://github.com/llm-jp/llm-jp-tokenizer) of `llm-jp-tokenizer` for details on the vocabulary construction procedure. +- **Model:** Hugging Face Fast Tokenizer using Unigram byte-fallback model which requires `tokenizers>=0.14.0` +- **Training algorithm:** SentencePiece Unigram byte-fallback +- **Training data:** A subset of the datasets for model pre-training +- **Vocabulary size:** 50,570 (mixed vocabulary of Japanese, English, and source code) + + +## Datasets + +### Pre-training + +The models have been pre-trained using a blend of the following datasets. 
+ +| Language | Dataset | Tokens| +|:---:|:---:|:---:| +|Japanese|[Wikipedia](https://huggingface.co/datasets/wikipedia)|1.5B +||[mC4](https://huggingface.co/datasets/mc4)|136B +|English|[Wikipedia](https://huggingface.co/datasets/wikipedia)|5B +||[The Pile](https://huggingface.co/datasets/EleutherAI/pile)|135B +|Code|[The Stack](https://huggingface.co/datasets/bigcode/the-stack)|10B + +The pre-training was conducted continuously over a total of 10 folds of non-overlapping data, each consisting of approximately 27-28B tokens. +We finalized the pre-training with an additional 27B tokens of (potentially) high-quality data obtained from the same source datasets listed above that were used for the 10-fold data. + +### Instruction tuning + +The models have been fine-tuned on the following datasets. + +| Language | Dataset | Description | +|:---|:---:|:---:| +|Japanese|[jaster](https://github.com/llm-jp/llm-jp-eval)| Data automatically transformed from existing Japanese NLP datasets | +||[databricks-dolly-15k](https://huggingface.co/datasets/databricks/databricks-dolly-15k)| Translated with DeepL by LLM-jp | +||[OpenAssistant Conversations Dataset](https://huggingface.co/datasets/OpenAssistant/oasst1)| Translated with DeepL by LLM-jp | + + +## Evaluation +You can view the evaluation results of several LLMs on this [leaderboard](http://wandb.me/llm-jp-leaderboard). We used [llm-jp-eval](https://github.com/llm-jp/llm-jp-eval) for the evaluation. + +## Risks and Limitations + +The models released here are still in the early stages of our research and development and have not been tuned to ensure outputs align with human intent and safety considerations. 
+ + +## Send Questions to + +llm-jp(at)nii.ac.jp + + +## License + +[Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0) + + +## Model Card Authors +*The names are listed in alphabetical order.* + +Hirokazu Kiyomaru, Hiroshi Matsuda, Jun Suzuki, Namgi Han, Saku Sugawara, Shota Sasaki, Shuhei Kurita, Taishi Nakamura, Takumi Okamoto. \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..b75ff0d --- /dev/null +++ b/config.json @@ -0,0 +1,33 @@ +{ + "activation_function": "gelu", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 7, + "embd_pdrop": 0.1, + "eos_token_id": 7, + "gradient_checkpointing": false, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_ctx": 1024, + "n_embd": 5120, + "n_head": 40, + "n_inner": 20480, + "n_layer": 40, + "n_positions": 2048, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float16", + "transformers_version": "4.34.0", + "use_cache": true, + "vocab_size": 50688 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..390f14e --- /dev/null +++ b/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 7, + "eos_token_id": 7, + "transformers_version": "4.34.0" +} diff --git a/pytorch_model-00001-of-00003.bin b/pytorch_model-00001-of-00003.bin new file mode 100644 index 0000000..4fd9a87 --- /dev/null +++ 
b/pytorch_model-00001-of-00003.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5202b69f63fb8e4ed210aa7603197a1c2f6979c4d2fa3a80415c19645488e47b +size 9979281983 diff --git a/pytorch_model-00002-of-00003.bin b/pytorch_model-00002-of-00003.bin new file mode 100644 index 0000000..7ae8c36 --- /dev/null +++ b/pytorch_model-00002-of-00003.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b8429348a90169569bcc42ae6ccff2af0216e4aa2e17b874dcf046b2a6e3a2e +size 9858778951 diff --git a/pytorch_model-00003-of-00003.bin b/pytorch_model-00003-of-00003.bin new file mode 100644 index 0000000..7c7cded --- /dev/null +++ b/pytorch_model-00003-of-00003.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dd70f190cb4dc8221812aaf18720aa110efd45d0497ceb4e68918b81431d93e +size 5873292803 diff --git a/pytorch_model.bin.index.json b/pytorch_model.bin.index.json new file mode 100644 index 0000000..39759fa --- /dev/null +++ b/pytorch_model.bin.index.json @@ -0,0 +1,492 @@ +{ + "metadata": { + "total_size": 25711185920 + }, + "weight_map": { + "lm_head.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.0.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.0.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.0.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.0.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.0.ln_1.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.0.ln_1.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.0.ln_2.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.0.ln_2.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.0.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.0.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.0.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.0.mlp.c_proj.weight": 
"pytorch_model-00001-of-00003.bin", + "transformer.h.1.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.1.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.1.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.1.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.1.ln_1.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.1.ln_1.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.1.ln_2.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.1.ln_2.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.1.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.1.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.1.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.1.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.10.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.10.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.10.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.10.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.10.ln_1.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.10.ln_1.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.10.ln_2.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.10.ln_2.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.10.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.10.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.10.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.10.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.11.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.11.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.11.attn.c_proj.bias": 
"pytorch_model-00001-of-00003.bin", + "transformer.h.11.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.11.ln_1.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.11.ln_1.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.11.ln_2.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.11.ln_2.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.11.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.11.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.11.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.11.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.12.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.12.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.12.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.12.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.12.ln_1.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.12.ln_1.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.12.ln_2.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.12.ln_2.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.12.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.12.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.12.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.12.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.13.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.13.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.13.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.13.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.13.ln_1.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.13.ln_1.weight": 
"pytorch_model-00001-of-00003.bin", + "transformer.h.13.ln_2.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.13.ln_2.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.13.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.13.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.13.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.13.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.14.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.14.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.14.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.14.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.14.ln_1.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.14.ln_1.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.14.ln_2.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.14.ln_2.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.14.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.14.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.14.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.14.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.15.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.15.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.15.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.15.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.15.ln_1.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.15.ln_1.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.15.ln_2.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.15.ln_2.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.15.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin", + 
"transformer.h.15.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.15.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.15.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.16.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.16.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.16.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.16.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.16.ln_1.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.16.ln_1.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.16.ln_2.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.16.ln_2.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.16.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.16.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.16.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.16.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.17.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.17.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.17.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.17.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.17.ln_1.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.17.ln_1.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.17.ln_2.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.17.ln_2.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.17.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.17.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.17.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.17.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin", + 
"transformer.h.18.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.18.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.18.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.18.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.18.ln_1.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.18.ln_1.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.18.ln_2.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.18.ln_2.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.18.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.18.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.18.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.18.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.19.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.19.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.19.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.19.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.19.ln_1.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.19.ln_1.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.19.ln_2.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.19.ln_2.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.19.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.19.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.19.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.19.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.2.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.2.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.2.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin", + 
"transformer.h.2.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.2.ln_1.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.2.ln_1.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.2.ln_2.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.2.ln_2.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.2.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.2.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.2.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.2.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.20.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.20.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.20.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.20.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.20.ln_1.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.20.ln_1.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.20.ln_2.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.20.ln_2.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.20.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.20.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.20.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.20.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.21.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.21.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.21.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.21.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.21.ln_1.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.21.ln_1.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.21.ln_2.bias": 
"pytorch_model-00002-of-00003.bin", + "transformer.h.21.ln_2.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.21.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.21.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.21.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.21.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.22.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.22.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.22.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.22.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.22.ln_1.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.22.ln_1.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.22.ln_2.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.22.ln_2.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.22.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.22.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.22.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.22.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.23.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.23.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.23.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.23.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.23.ln_1.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.23.ln_1.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.23.ln_2.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.23.ln_2.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.23.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.23.mlp.c_fc.weight": 
"pytorch_model-00002-of-00003.bin", + "transformer.h.23.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.23.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.24.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.24.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.24.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.24.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.24.ln_1.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.24.ln_1.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.24.ln_2.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.24.ln_2.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.24.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.24.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.24.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.24.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.25.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.25.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.25.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.25.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.25.ln_1.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.25.ln_1.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.25.ln_2.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.25.ln_2.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.25.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.25.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.25.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.25.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.26.attn.c_attn.bias": 
"pytorch_model-00002-of-00003.bin", + "transformer.h.26.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.26.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.26.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.26.ln_1.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.26.ln_1.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.26.ln_2.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.26.ln_2.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.26.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.26.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.26.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.26.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.27.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.27.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.27.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.27.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.27.ln_1.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.27.ln_1.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.27.ln_2.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.27.ln_2.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.27.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.27.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.27.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.27.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.28.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.28.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.28.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.28.attn.c_proj.weight": 
"pytorch_model-00002-of-00003.bin", + "transformer.h.28.ln_1.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.28.ln_1.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.28.ln_2.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.28.ln_2.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.28.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.28.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.28.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.28.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.29.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.29.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.29.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.29.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.29.ln_1.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.29.ln_1.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.29.ln_2.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.29.ln_2.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.29.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.29.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.29.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.29.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.3.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.3.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.3.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.3.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.3.ln_1.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.3.ln_1.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.3.ln_2.bias": "pytorch_model-00001-of-00003.bin", + 
"transformer.h.3.ln_2.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.3.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.3.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.3.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.3.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.30.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.30.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.30.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.30.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.30.ln_1.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.30.ln_1.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.30.ln_2.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.30.ln_2.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.30.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin", + "transformer.h.30.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin", + "transformer.h.30.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.30.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.31.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.31.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.31.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.31.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.31.ln_1.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.31.ln_1.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.31.ln_2.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.31.ln_2.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.31.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.31.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.31.mlp.c_proj.bias": 
"pytorch_model-00003-of-00003.bin", + "transformer.h.31.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.32.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.32.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.32.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.32.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.32.ln_1.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.32.ln_1.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.32.ln_2.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.32.ln_2.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.32.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.32.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.32.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.32.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.33.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.33.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.33.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.33.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.33.ln_1.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.33.ln_1.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.33.ln_2.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.33.ln_2.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.33.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.33.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.33.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.33.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.34.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.34.attn.c_attn.weight": 
"pytorch_model-00003-of-00003.bin", + "transformer.h.34.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.34.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.34.ln_1.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.34.ln_1.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.34.ln_2.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.34.ln_2.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.34.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.34.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.34.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.34.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.35.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.35.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.35.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.35.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.35.ln_1.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.35.ln_1.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.35.ln_2.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.35.ln_2.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.35.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.35.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.35.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.35.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.36.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.36.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.36.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.36.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.36.ln_1.bias": 
"pytorch_model-00003-of-00003.bin", + "transformer.h.36.ln_1.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.36.ln_2.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.36.ln_2.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.36.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.36.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.36.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.36.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.37.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.37.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.37.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.37.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.37.ln_1.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.37.ln_1.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.37.ln_2.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.37.ln_2.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.37.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.37.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.37.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.37.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.38.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.38.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.38.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.38.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.38.ln_1.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.38.ln_1.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.38.ln_2.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.38.ln_2.weight": "pytorch_model-00003-of-00003.bin", + 
"transformer.h.38.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.38.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.38.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.38.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.39.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.39.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.39.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.39.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.39.ln_1.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.39.ln_1.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.39.ln_2.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.39.ln_2.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.39.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.39.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.39.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin", + "transformer.h.39.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin", + "transformer.h.4.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.4.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.4.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.4.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.4.ln_1.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.4.ln_1.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.4.ln_2.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.4.ln_2.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.4.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.4.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.4.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.4.mlp.c_proj.weight": 
"pytorch_model-00001-of-00003.bin", + "transformer.h.5.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.5.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.5.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.5.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.5.ln_1.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.5.ln_1.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.5.ln_2.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.5.ln_2.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.5.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.5.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.5.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.5.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.6.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.6.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.6.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.6.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.6.ln_1.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.6.ln_1.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.6.ln_2.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.6.ln_2.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.6.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.6.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.6.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.6.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.7.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.7.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.7.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin", + 
"transformer.h.7.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.7.ln_1.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.7.ln_1.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.7.ln_2.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.7.ln_2.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.7.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.7.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.7.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.7.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.8.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.8.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.8.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.8.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.8.ln_1.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.8.ln_1.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.8.ln_2.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.8.ln_2.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.8.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.8.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.8.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.8.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.9.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.9.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.9.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.9.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.9.ln_1.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.9.ln_1.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.9.ln_2.bias": 
"pytorch_model-00001-of-00003.bin", + "transformer.h.9.ln_2.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.9.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.9.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin", + "transformer.h.9.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin", + "transformer.h.9.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin", + "transformer.ln_f.bias": "pytorch_model-00003-of-00003.bin", + "transformer.ln_f.weight": "pytorch_model-00003-of-00003.bin", + "transformer.wpe.weight": "pytorch_model-00001-of-00003.bin", + "transformer.wte.weight": "pytorch_model-00001-of-00003.bin" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..cdbdb09 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,13 @@ +{ + "additional_special_tokens": [ + "" + ], + "bos_token": "", + "cls_token": "", + "eod_token": "", + "eos_token": "", + "mask_token": "", + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..3b8935d --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fefc427dff3323dd8a2fd66f392b90a62896db3b11a031463ad0f4c70fb1de9c +size 3245878 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..3a7a39d --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,19 @@ +{ + "unk_token": "", + "bos_token": "", + "eos_token": "", + "pad_token": "", + "cls_token": "", + "sep_token": "", + "eod_token": "", + "mask_token": "", + "extra_ids": 0, + "additional_special_tokens": [ + "" + ], + "sp_model_kwargs": {}, + "model_max_length": 1000000000000000019884624838656, + "clean_up_tokenization_spaces": false, + "special_tokens_map_file": null, + "tokenizer_class": "PreTrainedTokenizerFast" +}