From 8d16f1501aa756492516e8169e975478cad2a704 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Wed, 13 May 2026 08:38:22 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: llm-jp/llm-jp-3.1-1.8b Source: Original Platform --- .gitattributes | 50 +++++++ README.md | 232 +++++++++++++++++++++++++++++++ config.json | 30 ++++ configuration.json | 1 + model-00001-of-00001.safetensors | 3 + model.safetensors.index.json | 1 + special_tokens_map.json | 10 ++ tokenizer.json | 3 + tokenizer_config.json | 18 +++ 9 files changed, 348 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 config.json create mode 100644 configuration.json create mode 100644 model-00001-of-00001.safetensors create mode 100644 model.safetensors.index.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..85d8fa9 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,50 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs 
merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +model-00001-of-00001.safetensors filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..aa10d52 --- /dev/null +++ b/README.md @@ -0,0 +1,232 @@ +--- +license: apache-2.0 +language: +- en +- ja +programming_language: +- C +- C++ +- C# +- Go +- Java +- JavaScript +- Lua +- PHP +- Python +- Ruby +- Rust +- Scala +- TypeScript +pipeline_tag: text-generation +library_name: transformers +inference: false +--- +# llm-jp-3.1-1.8b + +LLM-jp-3.1 is a series of large language models developed by the [Research and Development 
Center for Large Language Models](https://llmc.nii.ac.jp/) at the [National Institute of Informatics](https://www.nii.ac.jp/en/). + +Building upon the LLM-jp-3 series, the LLM-jp-3.1 models incorporate mid-training ([instruction pre-training](https://aclanthology.org/2024.emnlp-main.148/)), which significantly enhances their instruction-following capabilities compared to the original LLM-jp-3 models. + +This repository provides the **llm-jp-3.1-1.8b** model. +For an overview of the LLM-jp-3.1 models across different parameter sizes, please refer to: + - [LLM-jp-3.1 Pre-trained Models](https://huggingface.co/collections/llm-jp/llm-jp-31-pre-trained-models-68368787c32e462c40a45f7b) + - [LLM-jp-3.1 Fine-tuned Models](https://huggingface.co/collections/llm-jp/llm-jp-31-fine-tuned-models-68368681b9b35de1c4ac8de4). + +For more details on the training procedures and evaluation results, please refer to [this blog post](https://llm-jp.nii.ac.jp/ja/blog/blog-887/) (in Japanese). + +Checkpoints format: Hugging Face Transformers + + +## Required Libraries and Their Versions + +- torch>=2.3.0 +- transformers>=4.40.1 +- tokenizers>=0.19.1 +- accelerate>=0.29.3 +- flash-attn>=2.5.8 + +## Usage + +```python +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM +tokenizer = AutoTokenizer.from_pretrained("llm-jp/llm-jp-3.1-1.8b") +model = AutoModelForCausalLM.from_pretrained("llm-jp/llm-jp-3.1-1.8b", device_map="auto", torch_dtype=torch.bfloat16) +text = "自然言語処理とは何か" +tokenized_input = tokenizer.encode(text, add_special_tokens=False, return_tensors="pt").to(model.device) +with torch.no_grad(): + output = model.generate( + tokenized_input, + max_new_tokens=100, + do_sample=True, + top_p=0.95, + temperature=0.7, + repetition_penalty=1.05, + )[0] +print(tokenizer.decode(output)) +``` + + +## Model Details + +- **Model type:** Transformer-based Language Model +- **Architectures:** + +Dense model: +|Params|Layers|Hidden size|Heads|Context length|Embedding 
parameters|Non-embedding parameters| +|:---:|:---:|:---:|:---:|:---:|:---:|:---:| +|1.8b|24|2048|16|4096|407,498,752|1,459,718,144| +|13b|40|5120|40|4096|1,018,746,880|12,688,184,320| + +MoE model: +|Params|Layers|Hidden size|Heads|Routed Experts|Activated Experts|Context length|Embedding parameters|Non-embedding parameters|Activated parameters|Total parameters| +|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:| +|8x13b|40|5120|40|8|2|4096|1,018,746,880|72,144,081,920|22,200,806,400|73,162,828,800| + + +## Tokenizer + +The tokenizer of this model is based on [huggingface/tokenizers](https://github.com/huggingface/tokenizers) Unigram byte-fallback model. +The vocabulary entries were converted from [`llm-jp-tokenizer v3.0`](https://github.com/llm-jp/llm-jp-tokenizer/releases/tag/v3.0b2). +Please refer to [README.md](https://github.com/llm-jp/llm-jp-tokenizer) of `llm-jp-tokenizer` for details on the vocabulary construction procedure (the pure SentencePiece training does not reproduce our vocabulary). + +## Datasets + +### Pre-training + +The models have been pre-trained using a blend of the following datasets. 
+ +| Language | Dataset | Tokens| +|:---|:---|---:| +|Japanese|[Wikipedia](https://gitlab.llm-jp.nii.ac.jp/datasets/llm-jp-corpus-v3)|2.6B +||[Common Crawl](https://gitlab.llm-jp.nii.ac.jp/datasets/llm-jp-corpus-v3)|762.8B +||[WARP/PDF](https://gitlab.llm-jp.nii.ac.jp/datasets/llm-jp-corpus-v3)|237.3B +||[WARP/HTML](https://gitlab.llm-jp.nii.ac.jp/datasets/llm-jp-corpus-v3)|2.7B +||[Kaken](https://gitlab.llm-jp.nii.ac.jp/datasets/llm-jp-corpus-v3)|1.8B +|English|[Wikipedia](https://gitlab.llm-jp.nii.ac.jp/datasets/llm-jp-corpus-v3)|4.7B +||[Dolma/CC-head](https://gitlab.llm-jp.nii.ac.jp/datasets/llm-jp-corpus-v3)|608.5B +||[Dolma/C4](https://gitlab.llm-jp.nii.ac.jp/datasets/llm-jp-corpus-v3)|181.6B +||[Dolma/Reddit](https://gitlab.llm-jp.nii.ac.jp/datasets/llm-jp-corpus-v3)|83.1B +||[Dolma/PeS2o](https://gitlab.llm-jp.nii.ac.jp/datasets/llm-jp-corpus-v3)|62.9B +||[Dolma/Gutenberg](https://gitlab.llm-jp.nii.ac.jp/datasets/llm-jp-corpus-v3)|5.5B +||[Dolma/Wiki](https://gitlab.llm-jp.nii.ac.jp/datasets/llm-jp-corpus-v3)|3.9B +|Code|[The Stack](https://huggingface.co/datasets/bigcode/the-stack)|114.1B +|Chinese|[Wikipedia](https://gitlab.llm-jp.nii.ac.jp/datasets/llm-jp-corpus-v3)|0.8B +|Korean|[Wikipedia](https://gitlab.llm-jp.nii.ac.jp/datasets/llm-jp-corpus-v3)|0.3B + +### Mid-training + +In the LLM-jp-3.1 series, we performed continuous pre-training based on [Instruction Pre-Training](https://aclanthology.org/2024.emnlp-main.148/). +Instruction Pre-Training enhances a model’s ability to follow instructions by continuing pre-training on a large collection of instruction–response pairs. +We prepared approximately 90B tokens of instruction–response data and mixed it with our pre-training datasets, conducting continuous pre-training on a total of 400B tokens. 
+Each model was initialized from existing checkpoints ([llm-jp/llm-jp-3-1.8b](https://huggingface.co/llm-jp/llm-jp-3-1.8b), [llm-jp/llm-jp-3-13b](https://huggingface.co/llm-jp/llm-jp-3-13b), and [llm-jp/llm-jp-3-8x13b](https://huggingface.co/llm-jp/llm-jp-3-8x13b)) and underwent continuous instruction pre-training. +Since the LLM-jp-3 series was originally pre-trained on 2.1T tokens, the total pre-training token count amounts to 2.5T tokens. + +Details of this training process will be released in a forthcoming paper. The instruction–response dataset used for this training will also be made publicly available. + + +### Post-training + +We have fine-tuned the pre-trained checkpoint with supervised fine-tuning and further aligned it with Direct Preference Optimization. + +#### Supervised Fine-tuning +The datasets used for supervised fine-tuning are as follows: + +| Language | Dataset | Description | +|:---|:---|:---| +|Japanese|[ichikara-instruction-004-002](https://liat-aip.sakura.ne.jp/wp/llm%e3%81%ae%e3%81%9f%e3%82%81%e3%81%ae%e6%97%a5%e6%9c%ac%e8%aa%9e%e3%82%a4%e3%83%b3%e3%82%b9%e3%83%88%e3%83%a9%e3%82%af%e3%82%b7%e3%83%a7%e3%83%b3%e3%83%87%e3%83%bc%e3%82%bf%e4%bd%9c%e6%88%90/llm%e3%81%ae%e3%81%9f%e3%82%81%e3%81%ae%e6%97%a5%e6%9c%ac%e8%aa%9e%e3%82%a4%e3%83%b3%e3%82%b9%e3%83%88%e3%83%a9%e3%82%af%e3%82%b7%e3%83%a7%e3%83%b3%e3%83%87%e3%83%bc%e3%82%bf-%e5%85%ac%e9%96%8b/)| A manually constructed instruction dataset. | +| |[AnswerCarefully (ver2.0)](https://huggingface.co/datasets/llm-jp/AnswerCarefully)| A manually constructed instruction dataset focusing on LLMs' safety. | +| |ichikara-instruction-format| A small subset of the ichikara-instruction dataset, edited with some constraints on the output format. | +| |[AutoMultiTurnByCalm3-22B](https://huggingface.co/datasets/kanhatakeyama/AutoMultiTurnByCalm3-22B)| A synthetic instruction dataset. 
| +| |[ramdom-to-fixed-multiturn-Calm3](https://huggingface.co/datasets/kanhatakeyama/ramdom-to-fixed-multiturn-Calm3)| A synthetic instruction dataset. | +| |[wizardlm8x22b-logical-math-coding-sft-ja](https://huggingface.co/datasets/llm-jp/wizardlm8x22b-logical-math-coding-sft-ja)| A synthetic instruction dataset. | +| |[magpie-sft-v1.0](https://huggingface.co/datasets/llm-jp/magpie-sft-v1.0)| A synthetic instruction dataset we created. | +| |[jaster v1.4.1](https://github.com/llm-jp/llm-jp-eval/tree/v1.4.1)| - | +| |[extraction-wiki-ja](https://huggingface.co/datasets/llm-jp/extraction-wiki-ja)| A synthetic instruction dataset we created. | +|English|[Daring-Anteater](https://huggingface.co/datasets/nvidia/Daring-Anteater)| - | +|Japanese & English|[Synthetic-JP-EN-Coding-Dataset](https://huggingface.co/datasets/llm-jp/Synthetic-JP-EN-Coding-Dataset)| A synthetic instruction dataset. | + + +#### Direct Preference Optimization + +For Direct Preference Optimization (DPO), we adopted rejection sampling. +Prompts were sampled from the datasets used in SFT, and multiple responses were generated for each prompt. +These responses were then scored (by [Qwen/Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct)), and DPO was performed by treating high-scoring responses as positive examples and low-scoring responses as negative examples. + +We conducted DPO in two stages. +In the second stage, we additionally used [ac-self-inst](https://huggingface.co/datasets/llm-jp/ac-self-inst), a Japanese preference dataset focused on safety. + + +## Evaluation + +### MT Bench (Japanese and English) + +We evaluated the models using `gpt-4o-2024-08-06`. +The scores represent the average values obtained from three rounds of inference and evaluation. +For more details, please refer to the [code](https://github.com/llm-jp/llm-jp-judge/tree/v1.0.0). 
+ + +| Model Name | JA | EN | +|:------------------------------------------------------------------------------------------------------------------------------|----------:|-------:| +| gpt-35-turbo-1106 | 6.48 | 7.56 | +| gpt-4-0613 | 7.29 | 7.72 | +| gpt-4o-2024-08-06 | 8.10 | 8.38 | +| [sbintuitions/sarashina2.2-1b-instruct-v0.1](https://huggingface.co/sbintuitions/sarashina2.2-1b-instruct-v0.1) | 5.30 | 5.66 | +| [sbintuitions/sarashina2.2-3b-instruct-v0.1](https://huggingface.co/sbintuitions/sarashina2.2-3b-instruct-v0.1) | 7.07 | 6.96 | +| [Rakuten/RakutenAI-2.0-8x7B-instruct](https://huggingface.co/Rakuten/RakutenAI-2.0-8x7B-instruct) | 6.68 | 6.33 | +| [cyberagent/calm3-22b-chat](https://huggingface.co/cyberagent/calm3-22b-chat) | 6.86 | 6.77 | +| [Qwen/Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) | 7.07 | 7.99 | +| [Qwen/Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) | 7.64 | 8.27 | +| [Qwen/Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B) | 5.46 | 6.95 | +| [Qwen/Qwen3-14B](https://huggingface.co/Qwen/Qwen3-14B) | 8.00 | 8.30 | +| [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) | 8.36 | 8.33 | +| [tokyotech-llm/Llama-3.3-Swallow-70B-Instruct-v0.4](https://huggingface.co/tokyotech-llm/Llama-3.3-Swallow-70B-Instruct-v0.4) | 7.64 | 8.02 | +| [stockmark/Stockmark-2-100B-Instruct-beta](https://huggingface.co/stockmark/Stockmark-2-100B-Instruct-beta) | 7.42 | 7.17 | +| [llm-jp-3-1.8b-instruct3](https://huggingface.co/llm-jp/llm-jp-3-1.8b-instruct3) | 4.64 | 4.09 | +| [llm-jp-3-13b-instruct3](https://huggingface.co/llm-jp/llm-jp-3-13b-instruct3) | 6.21 | 6.13 | +| [llm-jp-3-8x13b-instruct3](https://huggingface.co/llm-jp/llm-jp-3-8x13b-instruct3) | 6.60 | 6.49 | +| [llm-jp-3.1-1.8b-instruct4](https://huggingface.co/llm-jp/llm-jp-3.1-1.8b-instruct4) | 6.30 | 5.70 | +| [llm-jp-3.1-13b-instruct4](https://huggingface.co/llm-jp/llm-jp-3.1-13b-instruct4) | 7.37 | 7.01 | +| 
[llm-jp-3.1-8x13b-instruct4](https://huggingface.co/llm-jp/llm-jp-3.1-8x13b-instruct4) | 7.50 | 7.05 | + + +### AnswerCarefully-Eval + +[AnswerCarefully-Eval](https://www.anlp.jp/proceedings/annual_meeting/2025/pdf_dir/Q4-19.pdf) assesses the safety of Japanese language model outputs using the LLM-as-a-Judge approach, based on the test set from [llm-jp/AnswerCarefully](https://huggingface.co/datasets/llm-jp/AnswerCarefully). +We evaluated the models using `gpt-4o-2024-08-06`. +The scores represent the average values obtained from three rounds of inference and evaluation. +For more details, please refer to the [codes](https://github.com/llm-jp/llm-jp-judge/tree/v1.0.0). + +| Model name | Score | Acceptance rate (%, ↑) | Violation rate (%, ↓) | +| :--- | ---: | ---: | ---: | +| gpt-35-turbo-1106 | 3.98 | 71.7 | 12.6 | +| gpt-4-0613 | 4.06 | 72.3 | 13.2 | +| gpt-4o-2024-08-06 | 4.09 | 72.7 | 12.5 | +| [llm-jp-3-1.8b-instruct3](https://huggingface.co/llm-jp/llm-jp-3-1.8b-instruct3) | 4.03 | 75.9 | 12.2 | +| [llm-jp-3-13b-instruct3](https://huggingface.co/llm-jp/llm-jp-3-13b-instruct3) | 4.37 | 88.4 | 6.5 | +| [llm-jp-3-8x13b-instruct3](https://huggingface.co/llm-jp/llm-jp-3-8x13b-instruct3) | 4.48 | 91.6 | 4.3 | +| [llm-jp-3.1-1.8b-instruct4](https://huggingface.co/llm-jp/llm-jp-3.1-1.8b-instruct4) | 3.66 | 64.7 | 24.3 | +| [llm-jp-3.1-13b-instruct4](https://huggingface.co/llm-jp/llm-jp-3.1-13b-instruct4) | 4.17 | 82.4 | 12.2 | +| [llm-jp-3.1-8x13b-instruct4](https://huggingface.co/llm-jp/llm-jp-3.1-8x13b-instruct4) | 4.26 | 83.1 | 11.6 | + + + +## Risks and Limitations + +The models released here are in the early stages of our research and development and have not been tuned to ensure outputs align with human intent and safety considerations. 
+ + +## Send Questions to + +llm-jp(at)nii.ac.jp + + +## License + +[Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0) + +## Model Card Authors + +*The names are listed in alphabetical order.* + +Hirokazu Kiyomaru and Takashi Kodama. \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..893822b --- /dev/null +++ b/config.json @@ -0,0 +1,30 @@ +{ + "_name_or_path": "llm-jp/llm-jp-3-1.8b", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 7168, + "max_position_embeddings": 4096, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "num_key_value_heads": 16, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 99584 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/model-00001-of-00001.safetensors b/model-00001-of-00001.safetensors new file mode 100644 index 0000000..11b7b71 --- /dev/null +++ b/model-00001-of-00001.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c734639f810581addc943c0c26d0b1ff1991bbe22adf96eaed86fa3f391d185c +size 3735253776 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..fd640c3 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1 @@ +{"metadata": {"mergekit_version": "0.0.6", "total_size": 3735228416}, "weight_map": 
{"lm_head.weight": "model-00001-of-00001.safetensors", "model.embed_tokens.weight": "model-00001-of-00001.safetensors", "model.layers.0.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.0.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.0.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.1.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.1.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.1.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.10.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.10.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.10.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", 
"model.layers.10.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.11.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.11.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.11.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.12.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.12.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.12.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.13.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.13.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.13.mlp.up_proj.weight": "model-00001-of-00001.safetensors", 
"model.layers.13.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.14.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.14.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.14.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.15.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.15.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.15.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.16.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.16.mlp.down_proj.weight": "model-00001-of-00001.safetensors", 
"model.layers.16.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.16.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.17.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.17.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.17.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.18.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.18.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.18.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", 
"model.layers.19.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.19.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.19.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.2.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.2.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.2.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.20.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.20.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.20.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.20.self_attn.q_proj.weight": 
"model-00001-of-00001.safetensors", "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.21.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.21.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.21.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.22.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.22.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.22.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.23.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.23.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.23.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", 
"model.layers.23.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.3.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.3.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.3.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.4.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.4.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.4.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.5.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.5.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.5.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.5.post_attention_layernorm.weight": 
"model-00001-of-00001.safetensors", "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.6.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.6.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.6.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.7.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.7.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.7.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.8.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.8.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", 
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.9.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.9.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.9.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.norm.weight": "model-00001-of-00001.safetensors"}} \ No newline at end of file diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..8644c8f --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,10 @@ +{ + "bos_token": "", + "cls_token": "", + "eod_token": "", + "eos_token": "", + "mask_token": "", + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..fc80107 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955dc1fa623fab38cc92a3f4ee172423ae6d73201c4207569bfdf5626bc733f0 +size 6416433 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..09aa857 --- /dev/null +++ 
b/tokenizer_config.json @@ -0,0 +1,18 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "unk_token": "", + "bos_token": "", + "eos_token": "", + "pad_token": "", + "cls_token": "", + "sep_token": "", + "eod_token": "", + "mask_token": "", + "extra_ids": 0, + "sp_model_kwargs": {}, + "model_max_length": 1000000000000000019884624838656, + "clean_up_tokenization_spaces": false, + "special_tokens_map_file": null, + "tokenizer_class": "PreTrainedTokenizerFast" +}