commit e20fc9e378573c9baa215e1b39ef927421fb3b2e Author: ModelHub XC Date: Tue May 19 07:14:22 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: tokyotech-llm/Swallow-7b-instruct-hf Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..bbf6a94 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,53 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text + +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +model-00001-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text +tokenizer.model filter=lfs diff=lfs merge=lfs -text +logo.png filter=lfs diff=lfs merge=lfs -text +model-00002-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text +model-00003-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..51089e2 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,126 @@ +LLAMA 2 COMMUNITY LICENSE AGREEMENT +Llama 2 Version Release Date: July 18, 2023 + +"Agreement" means the terms and conditions for use, reproduction, distribution and +modification of the Llama Materials set forth herein. + +"Documentation" means the specifications, manuals and documentation +accompanying Llama 2 distributed by Meta at ai.meta.com/resources/models-and- +libraries/llama-downloads/. + +"Licensee" or "you" means you, or your employer or any other person or entity (if +you are entering into this Agreement on such person or entity's behalf), of the age +required under applicable laws, rules or regulations to provide legal consent and that +has legal authority to bind your employer or such other person or entity if you are +entering in this Agreement on their behalf. + +"Llama 2" means the foundational large language models and software and +algorithms, including machine-learning model code, trained model weights, +inference-enabling code, training-enabling code, fine-tuning enabling code and other +elements of the foregoing distributed by Meta at ai.meta.com/resources/models-and- +libraries/llama-downloads/. + +"Llama Materials" means, collectively, Meta's proprietary Llama 2 and +Documentation (and any portion thereof) made available under this Agreement. + +"Meta" or "we" means Meta Platforms Ireland Limited (if you are located in or, if you +are an entity, your principal place of business is in the EEA or Switzerland) and Meta +Platforms, Inc. (if you are located outside of the EEA or Switzerland). + +By clicking "I Accept" below or by using or distributing any portion or element of the +Llama Materials, you agree to be bound by this Agreement. + +1. License Rights and Redistribution. + + a. Grant of Rights. You are granted a non-exclusive, worldwide, non- +transferable and royalty-free limited license under Meta's intellectual property or +other rights owned by Meta embodied in the Llama Materials to use, reproduce, +distribute, copy, create derivative works of, and make modifications to the Llama +Materials. + + b. Redistribution and Use. + + i. If you distribute or make the Llama Materials, or any derivative works +thereof, available to a third party, you shall provide a copy of this Agreement to such +third party. + ii. If you receive Llama Materials, or any derivative works thereof, from +a Licensee as part of an integrated end user product, then Section 2 of this +Agreement will not apply to you. + + iii. You must retain in all copies of the Llama Materials that you +distribute the following attribution notice within a "Notice" text file distributed as a +part of such copies: "Llama 2 is licensed under the LLAMA 2 Community License, +Copyright (c) Meta Platforms, Inc. All Rights Reserved." + + iv. Your use of the Llama Materials must comply with applicable laws +and regulations (including trade compliance laws and regulations) and adhere to the +Acceptable Use Policy for the Llama Materials (available at +https://ai.meta.com/llama/use-policy), which is hereby incorporated by reference into +this Agreement. + + v. You will not use the Llama Materials or any output or results of the +Llama Materials to improve any other large language model (excluding Llama 2 or +derivative works thereof). + +2. Additional Commercial Terms. If, on the Llama 2 version release date, the +monthly active users of the products or services made available by or for Licensee, +or Licensee's affiliates, is greater than 700 million monthly active users in the +preceding calendar month, you must request a license from Meta, which Meta may +grant to you in its sole discretion, and you are not authorized to exercise any of the +rights under this Agreement unless or until Meta otherwise expressly grants you +such rights. + +3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE +LLAMA MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE +PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY +WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR +FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE +FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING +THE LLAMA MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR +USE OF THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS. + +4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE +LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, +NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS +AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, +CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN +IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF +ANY OF THE FOREGOING. + +5. Intellectual Property. + + a. No trademark licenses are granted under this Agreement, and in +connection with the Llama Materials, neither Meta nor Licensee may use any name +or mark owned by or associated with the other or any of its affiliates, except as +required for reasonable and customary use in describing and redistributing the +Llama Materials. + + b. Subject to Meta's ownership of Llama Materials and derivatives made by or +for Meta, with respect to any derivative works and modifications of the Llama +Materials that are made by you, as between you and Meta, you are and will be the +owner of such derivative works and modifications. + + c. If you institute litigation or other proceedings against Meta or any entity +(including a cross-claim or counterclaim in a lawsuit) alleging that the Llama +Materials or Llama 2 outputs or results, or any portion of any of the foregoing, +constitutes infringement of intellectual property or other rights owned or licensable +by you, then any licenses granted to you under this Agreement shall terminate as of +the date such litigation or claim is filed or instituted. You will indemnify and hold +harmless Meta from and against any claim by any third party arising out of or related +to your use or distribution of the Llama Materials. + +6. Term and Termination. The term of this Agreement will commence upon your +acceptance of this Agreement or access to the Llama Materials and will continue in +full force and effect until terminated in accordance with the terms and conditions +herein. Meta may terminate this Agreement if you are in breach of any term or +condition of this Agreement. Upon termination of this Agreement, you shall delete +and cease use of the Llama Materials. Sections 3, 4 and 7 shall survive the +termination of this Agreement. + +7. Governing Law and Jurisdiction. This Agreement will be governed and +construed under the laws of the State of California without regard to choice of law +principles, and the UN Convention on Contracts for the International Sale of Goods +does not apply to this Agreement. The courts of California shall have exclusive +jurisdiction of any dispute arising out of this Agreement. + diff --git a/Notice b/Notice new file mode 100644 index 0000000..a9811e1 --- /dev/null +++ b/Notice @@ -0,0 +1 @@ +Llama 2 is licensed under the LLAMA 2 Community License, Copyright © Meta Platforms, Inc. All Rights Reserved. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..fd18db3 --- /dev/null +++ b/README.md @@ -0,0 +1,300 @@ +--- +language: + - en + - ja +library_name: transformers +pipeline_tag: text-generation +license: llama2 +model_type: llama +--- + +# Swallow + +Our Swallow model has undergone continual pre-training from the [Llama 2 family](https://huggingface.co/meta-llama), primarily with the addition of Japanese language data. The tuned versions use supervised fine-tuning (SFT). +Links to other models can be found in the index. + +# Model Release Updates + +We are excited to share the release schedule for our latest models: +- **April 26, 2024**: Released version 0.1 of our enhanced instruction-tuned models: [Swallow-7b-instruct-v0.1](https://huggingface.co/tokyotech-llm/Swallow-7b-instruct-v0.1), [Swallow-13b-instruct-v0.1](https://huggingface.co/tokyotech-llm/Swallow-13b-instruct-v0.1), and [Swallow-70b-instruct-v0.1](https://huggingface.co/tokyotech-llm/Swallow-70b-instruct-v0.1) as preview versions. +- **March 2, 2024**: Released the [Swallow-7b-plus-hf](https://huggingface.co/tokyotech-llm/Swallow-7b-plus-hf), a model trained with approximately twice as many Japanese tokens as [Swallow-7b-hf](https://huggingface.co/tokyotech-llm/Swallow-7b-hf). +- **February 4, 2024**: Released the [Swallow-13b-NVE-hf](https://huggingface.co/tokyotech-llm/Swallow-13b-NVE-hf). +- **January 26, 2024**: Released the [Swallow-7b-NVE-hf](https://huggingface.co/tokyotech-llm/Swallow-7b-NVE-hf), [Swallow-7b-NVE-instruct-hf](https://huggingface.co/tokyotech-llm/Swallow-7b-NVE-instruct-hf), [Swallow-70b-NVE-hf](https://huggingface.co/tokyotech-llm/Swallow-70b-NVE-hf), and [Swallow-70b-NVE-instruct-hf](https://huggingface.co/tokyotech-llm/Swallow-70b-NVE-instruct-hf) +- **December 19, 2023**: Released the [Swallow-7b-hf](https://huggingface.co/tokyotech-llm/Swallow-7b-hf), [Swallow-7b-instruct-hf](https://huggingface.co/tokyotech-llm/Swallow-7b-instruct-hf), [Swallow-13b-hf](https://huggingface.co/tokyotech-llm/Swallow-13b-hf), [Swallow-13b-instruct-hf](https://huggingface.co/tokyotech-llm/Swallow-13b-instruct-hf), [Swallow-70b-hf](https://huggingface.co/tokyotech-llm/Swallow-70b-hf), and [Swallow-70b-instruct-hf](https://huggingface.co/tokyotech-llm/Swallow-70b-instruct-hf). + +## Swallow Model Index +|Model|Swallow-hf|Swallow-instruct-hf|Swallow-instruct-v0.1| +|---|---|---|---| +|7B| [Link](https://huggingface.co/tokyotech-llm/Swallow-7b-hf) | [Link](https://huggingface.co/tokyotech-llm/Swallow-7b-instruct-hf)|[Link](https://huggingface.co/tokyotech-llm/Swallow-7b-instruct-v1.0)| +|7B-Plus| [Link](https://huggingface.co/tokyotech-llm/Swallow-7b-plus-hf) | N/A | N/A | +|13B| [Link](https://huggingface.co/tokyotech-llm/Swallow-13b-hf) | [Link](https://huggingface.co/tokyotech-llm/Swallow-13b-instruct-hf)| [Link](https://huggingface.co/tokyotech-llm/Swallow-13b-instruct-v1.0)| +|70B| [Link](https://huggingface.co/tokyotech-llm/Swallow-70b-hf) | [Link](https://huggingface.co/tokyotech-llm/Swallow-70b-instruct-hf)| [Link](https://huggingface.co/tokyotech-llm/Swallow-70b-instruct-v1.0)| + +## Swallow Model Index NVE (No Vocabulary Expansion) +|Model|Swallow-NVE-hf|Swallow-NVE-instruct-hf| +|---|---|---| +|7B| [Link](https://huggingface.co/tokyotech-llm/Swallow-7b-NVE-hf) | [Link](https://huggingface.co/tokyotech-llm/Swallow-7b-NVE-instruct-hf)| +|13B| [Link](https://huggingface.co/tokyotech-llm/Swallow-13b-NVE-hf) | N/A | +|70B| [Link](https://huggingface.co/tokyotech-llm/Swallow-70b-NVE-hf) | [Link](https://huggingface.co/tokyotech-llm/Swallow-70b-NVE-instruct-hf)| + +![logo](./logo.png) + +This repository provides large language models developed by [TokyoTech-LLM](https://tokyotech-llm.github.io/). +Read our [blog post](https://zenn.dev/tokyotech_lm/articles/d6cb3a8fdfc907) or our [paper](https://arxiv.org/abs/2404.17790) + +## Model Details + +* **Model type**: Please refer to LLaMA-2 technical report for details on the model architecture. +* **Language(s)**: Japanese English +* **Library**: [Megatron-LM](https://github.com/rioyokotalab/Megatron-Llama2) +* **Tokenizer**: This model employs a tokenizer that features a broadened vocabulary based on Japanese data. This allows for a more efficient representation of text using fewer tokens, leading to a notably faster inference process. +* **Contact**: swallow[at]nlp.c.titech.ac.jp + +## Base Model Performance + +### Japanese tasks + +|Model|Size|JCommonsenseQA|JEMHopQA|NIILC|JSQuAD|XL-Sum|MGSM|WMT20-en-ja|WMT20-ja-en| +|---|---|---|---|---|---|---|---|---|---| +| | |4-shot|4-shot|4-shot|4-shot|1-shot|4-shot|4-shot|4-shot| +| Llama 2 | 7B | 0.3852 | 0.4240 | 0.3410 | 0.7917 | 0.1905 | 0.0760 | 0.1783 | 0.1738 | +| Swallow | 7B | 0.4808 | 0.5078 | 0.5968 | 0.8573 | 0.1830 | 0.1240 | 0.2510 | 0.1511 | +| Swallow-Plus | 7B | 0.5478 | 0.5493 | 0.6030 | 0.8544 | 0.1806 | 0.1360 | 0.2568 | 0.1441 | +| Swallow-NVE | 7B | 0.5433 | 0.5425 | 0.5729 | 0.8684 | 0.2117 | 0.1200 | 0.2405 | 0.1512 | +| Llama 2 | 13B | 0.6997 | 0.4415 | 0.4170 | 0.8533 | 0.2139 | 0.1320 | 0.2146 | 0.1982 | +| Swallow | 13B | 0.7837 | 0.5063 | 0.6398 | 0.9005 | 0.2168 | 0.2040 | 0.2720 | 0.1771 | +| Swallow-NVE | 13B | 0.7712 | 0.5438 | 0.6351 | 0.9030 | 0.2294 | 0.2120 | 0.2735 | 0.1817 | +| Llama 2 | 70B | 0.8686 | 0.4656 | 0.5256 | 0.9080 | 0.2361 | 0.3560 | 0.2643 | **0.2398** | +| Swallow | 70B | 0.9348 | **0.6290** | 0.6960 | 0.9176 | 0.2266 | **0.4840** | **0.3043** | 0.2298 | +| Swallow-NVE | 70B | **0.9410** | 0.5759 | **0.7024** | **0.9254** | **0.2758** | 0.4720 | 0.3042 | 0.2322 | +### English tasks + +|Model|Size|OpenBookQA|TriviaQA|HellaSwag|SQuAD2.0|XWINO|GSM8K| +|---|---|---|---|---|---|---|---| +| | |8-shot|8-shot|8-shot|8-shot|8-shot|8-shot| +| Llama 2 | 7B | 0.3580 | 0.6265 | 0.5860 | 0.3207 | 0.9049 | 0.1410 | +| Swallow | 7B | 0.3180 | 0.4836 | 0.5308 | 0.3125 | 0.8817 | 0.1130 | +| Swallow-Plus | 7B | 0.3280 | 0.4558 | 0.5259 | 0.3134 | 0.8929 | 0.1061 | +| Swallow-NVE | 7B | 0.3180 | 0.5079 | 0.5329 | 0.2919 | 0.8817 | 0.0986 | +| Llama 2 | 13B | 0.3760 | 0.7255 | 0.6148 | 0.3681 | 0.9140 | 0.2403 | +| Swallow | 13B | 0.3500 | 0.5852 | 0.5660 | 0.3406 | 0.9075 | 0.2039 | +| Swallow-NVE | 13B | 0.3460 | 0.6025 | 0.5700 | 0.3478 | 0.9006 | 0.1751 | +| Llama 2 | 70B | **0.4280** | **0.8239** | **0.6742** | **0.3770** | **0.9290** | **0.5284** | +| Swallow | 70B | 0.4220 | 0.7756 | 0.6458 | 0.3745 | 0.9204 | 0.4867 | +| Swallow-NVE | 70B | 0.4240 | 0.7817 | 0.6439 | 0.3451 | 0.9256 | 0.4943 | + +## Evaluation Benchmarks + +### Japanese evaluation benchmarks + +We used llm-jp-eval(v1.0.0) and JP Language Model Evaluation Harness(commit #9b42d41). The details are as follows: + +- Multiple-choice question answering (JCommonsenseQA [Kurihara+, 2022]) +- Open-ended question answering (JEMHopQA [Ishii+, 2023]) +- Open-ended question answering (NIILC [Sekine, 2003]) +- Machine reading comprehension (JSQuAD [Kurihara+, 2022]) +- Automatic summarization (XL-Sum [Hasan+, 2021]) +- Machine translation (WMT2020 ja-en [Barrault+, 2020]) +- Machine translation (WMT2020 en-ja [Barrault+, 2020]) +- Mathematical reasoning (MGSM [Shi+, 2023]) + +### English evaluation benchmarks + +We used the Language Model Evaluation Harness(v.0.3.0). The details are as follows: + +- Multiple-choice question answering (OpenBookQA [Mihaylov+, 2018]) +- Open-ended question answering (TriviaQA [Joshi+, 2017]) +- Machine reading comprehension (SQuAD 2.0 [Rajpurkar+, 2018]) +- Commonsense reasoning (XWINO [Tikhonov & Ryabinin, 2021]) +- Natural language inference (HellaSwag [Zellers+, 2019]) +- Mathematical reasoning (GSM8k [Cobbe+, 2021]) + + +## Usage + +First install additional dependencies in [requirements.txt](./requirements.txt): + +```sh +pip install -r requirements.txt +``` + +### Use the instruct model + +```python +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM + +model_name = "tokyotech-llm/Swallow-7b-instruct-hf" + +tokenizer = AutoTokenizer.from_pretrained(model_name) +model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, device_map="auto") + + +PROMPT_DICT = { + "prompt_input": ( + "以下に、あるタスクを説明する指示があり、それに付随する入力が更なる文脈を提供しています。" + "リクエストを適切に完了するための回答を記述してください。\n\n" + "### 指示:\n{instruction}\n\n### 入力:\n{input}\n\n### 応答:" + + ), + "prompt_no_input": ( + "以下に、あるタスクを説明する指示があります。" + "リクエストを適切に完了するための回答を記述してください。\n\n" + "### 指示:\n{instruction}\n\n### 応答:" + ), +} + +def create_prompt(instruction, input=None): + """ + Generates a prompt based on the given instruction and an optional input. + If input is provided, it uses the 'prompt_input' template from PROMPT_DICT. + If no input is provided, it uses the 'prompt_no_input' template. + + Args: + instruction (str): The instruction describing the task. + input (str, optional): Additional input providing context for the task. Default is None. + + Returns: + str: The generated prompt. + """ + if input: + # Use the 'prompt_input' template when additional input is provided + return PROMPT_DICT["prompt_input"].format(instruction=instruction, input=input) + else: + # Use the 'prompt_no_input' template when no additional input is provided + return PROMPT_DICT["prompt_no_input"].format(instruction=instruction) + +# Example usage +instruction_example = "以下のトピックに関する詳細な情報を提供してください。" +input_example = "東京工業大学の主なキャンパスについて教えてください" +prompt = create_prompt(instruction_example, input_example) + +input_ids = tokenizer.encode( + prompt, + add_special_tokens=False, + return_tensors="pt" +) + +tokens = model.generate( + input_ids.to(device=model.device), + max_new_tokens=128, + temperature=0.99, + top_p=0.95, + do_sample=True, +) + +out = tokenizer.decode(tokens[0], skip_special_tokens=True) +print(out) + +``` + +### Use the base model + +```python +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM + +model_name = "tokyotech-llm/Swallow-7b-hf" + +tokenizer = AutoTokenizer.from_pretrained(model_name) +model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto") + +prompt = "東京工業大学の主なキャンパスは、" +input_ids = tokenizer.encode( + prompt, + add_special_tokens=False, + return_tensors="pt" +) +tokens = model.generate( + input_ids.to(device=model.device), + max_new_tokens=128, + temperature=0.99, + top_p=0.95, + do_sample=True, +) + +out = tokenizer.decode(tokens[0], skip_special_tokens=True) +print(out) +``` + +## Training Datasets + +### Continual Pre-Training +The following datasets were used for continual pre-training. + +- [Japanese Wikipedia](https://dumps.wikimedia.org/other/cirrussearch) +- [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) +- [Swallow Corpus](https://arxiv.org/abs/2404.17733) +- [The Pile](https://huggingface.co/datasets/EleutherAI/pile) + + +### Instruction Tuning + +The following datasets were used for the instruction tuning. + +- [Anthropic HH-RLHF](https://huggingface.co/datasets/kunishou/hh-rlhf-49k-ja) +- [Databricks Dolly 15-k](https://huggingface.co/datasets/kunishou/databricks-dolly-15k-ja) +- [OpenAssistant Conversations Dataset](https://huggingface.co/datasets/kunishou/oasst1-89k-ja) + +## Risks and Limitations + +The models released here are still in the early stages of our research and development and have not been tuned to ensure outputs align with human intent and safety considerations. + +## Acknowledgements + +We thank Meta Research for releasing Llama 2 under an open license for others to build on. + +Our project is supported by the [ABCI Large-scale Language Model Building Support Program](https://abci.ai/en/link/llm_support_program.html) of the National Institute of Advanced Industrial Science and Technology. + +## License + +Llama 2 is licensed under the LLAMA 2 Community License, Copyright © Meta Platforms, Inc. All Rights Reserved. + +## Authors + +Here are the team members: +- From [Okazaki Laboratory](https://www.nlp.c.titech.ac.jp/index.en.html), the following members: + - [Naoaki Okazaki](https://www.chokkan.org/index.ja.html) + - [Sakae Mizuki](https://s-mizuki-nlp.github.io/) + - [Hiroki Iida](https://meshidenn.github.io/) + - [Mengsay Loem](https://loem-ms.github.io/) + - [Shota Hirai](https://huggingface.co/Kotemo428) + - [Kakeru Hattori](https://aya-se.vercel.app/) + - [Masanari Ohi](https://twitter.com/stjohn2007) +- From [YOKOTA Laboratory](https://www.rio.gsic.titech.ac.jp/en/index.html), the following members: + - [Rio Yokota](https://twitter.com/rioyokota) + - [Kazuki Fujii](https://twitter.com/okoge_kaz) + - [Taishi Nakamura](https://twitter.com/Setuna7777_2) + +## How to cite + +If you find our work helpful, please feel free to cite us. + +``` +@inproceedings{Fujii:COLM2024, + title={Continual Pre-Training for Cross-Lingual LLM Adaptation: +Enhancing Japanese Language Capabilities}, + author={Kazuki Fujii and Taishi Nakamura and Mengsay Loem and Hiroki +Iida and Masanari Ohi and Kakeru Hattori and Hirai Shota and Sakae +Mizuki and Rio Yokota and Naoaki Okazaki}, + booktitle="Proceedings of the First Conference on Language Modeling", + series={COLM}, + pages="(to appear)", + year="2024", + month=oct, + address={University of Pennsylvania, USA}, +} + +@inproceedings{Okazaki:COLM2024, + title={Building a Large Japanese Web Corpus for Large Language Models}, + author={Naoaki Okazaki and Kakeru Hattori and Hirai Shota and Hiroki +Iida and Masanari Ohi and Kazuki Fujii and Taishi Nakamura and Mengsay +Loem and Rio Yokota and Sakae Mizuki}, + booktitle="Proceedings of the First Conference on Language Modeling", + series={COLM}, + pages="(to appear)", + year="2024", + month=oct, + address={University of Pennsylvania, USA}, +} +``` \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..a687588 --- /dev/null +++ b/config.json @@ -0,0 +1,29 @@ +{ + "_name_or_path": "tokyotech-llm/Swallow-7b-instruct-hf", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 4096, + "max_sequence_length": 4096, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "pad_token_id": 0, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.33.2", + "use_cache": true, + "vocab_size": 43176 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..82cab69 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,11 @@ +{ + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": 2, + "pad_token_id": 0, + "temperature": 0.6, + "max_length": 4096, + "top_p": 0.9, + "transformers_version": "4.33.2" + } + \ No newline at end of file diff --git a/logo.png b/logo.png new file mode 100644 index 0000000..eb72cf6 --- /dev/null +++ b/logo.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a6948289c43c398d1c7a190767e3c68e30215015f44bdaa497836192fc9bbfa +size 1913917 diff --git a/model-00001-of-00003.safetensors b/model-00001-of-00003.safetensors new file mode 100644 index 0000000..80a23da --- /dev/null +++ b/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:352bf5b4db884f0f255d3a80f6be342d6e2759095a7ffa4f92d5ad012c85922b +size 4940361496 diff --git a/model-00002-of-00003.safetensors b/model-00002-of-00003.safetensors new file mode 100644 index 0000000..b17bf4a --- /dev/null +++ b/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cae769ca718982e43c839b42b946613f5cfc3677fa35da9dee62164f7b5a8446 +size 4947390888 diff --git a/model-00003-of-00003.safetensors b/model-00003-of-00003.safetensors new file mode 100644 index 0000000..5c548d3 --- /dev/null +++ b/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e97ed64ba3cde4dad1cbc43e810c242b1d726a58f720065f2b1697f860a481bb +size 3772220264 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..c246d45 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 13659938816 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.norm.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..661cd2f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +torch +transformers +sentencepiece +accelerate +protobuf diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..d85ba6c --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000..48d502c --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c877c5ca885bad5c19d1b1706a2703f8b30de90f03c1f834f8bdb9faf79821e8 +size 914000 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..a32398f --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,35 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": null, + "padding_side": "right", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} \ No newline at end of file