初始化项目,由ModelHub XC社区提供模型

Model: HinGwenWoong/ancient-chat-7b
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-05-21 19:12:12 +08:00
commit 07044757ad
22 changed files with 2516 additions and 0 deletions

34
.gitattributes vendored Normal file
View File

@@ -0,0 +1,34 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
*.tfevents* filter=lfs diff=lfs merge=lfs -text
*.db* filter=lfs diff=lfs merge=lfs -text
*.ark* filter=lfs diff=lfs merge=lfs -text
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text

201
LICENSE Normal file
View File

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

298
README.md Normal file
View File

@@ -0,0 +1,298 @@
---
language:
- zh
tags:
- ancient-chat-llm
- internlm2
frameworks:
- pytorch
tasks:
- text-generation
license: Apache License 2.0
---
# ancient-chat-llm 古语说 —— 一个精通中国文化的大模型
<!-- PROJECT SHIELDS -->
<!--
[![Contributors][contributors-shield]][contributors-url]
[![Forks][forks-shield]][forks-url]
[![Issues][issues-shield]][issues-url]
[![MIT License][license-shield]][license-url]
[![Stargazers][stars-shield]][stars-url]
-->
<br />
<!-- PROJECT LOGO -->
<p align="center">
<a href="https://github.com/PeterH0323/ancient-chat-llm/">
<img src="assets/logo.png" alt="Logo" width="30%">
</a>
<h3 align="center">ancient-chat-llm</h3>
<p align="center">
<br />
<a href="https://github.com/PeterH0323/ancient-chat-llm">Github repo</a>
.
<a href="https://github.com/PeterH0323/ancient-chat-llm/tree/main/demo">查看Demo</a>
·
<a href="https://github.com/PeterH0323/ancient-chat-llm/issues">报告Bug & 提出新特性</a>
</p>
</p>
## 简介
**ancient-chat-llm 古语说** 是一个能够在用户输入现代汉语后输出文言文,同时能够解答用户 **关于中国文化的问题** 的大模型,包括但不限于**唐诗、宋词、论语**等古籍,还可以让其**将文言文翻译成白话文**等,模型用 [xtuner](https://github.com/InternLM/xtuner) 在 [InternLM2](https://github.com/InternLM/InternLM) 的基础上指令微调而来。
**开源不易,如果本项目帮到大家,可以右上角帮我点个 star~ ⭐⭐ , 您的 star ⭐是我们最大的鼓励,谢谢各位!**
## NEWS
- [2024.1] 新增诗词、古籍等知识微调模型
- [2024.1] 成语数据集微调模型
## 介绍
中国文化,博大精深,源远流长。从古老的诗词歌赋到现代的文艺创作,都展现了中华民族的智慧和创造力。
- **中国古籍**,中华文明的重要组成部分,承载着丰富的历史和文化信息,反映了古代社会的风貌和人民的智慧。这些古籍不仅具有极高的历史价值,也是我们了解古代文化、传承中华文明的重要窗口。其中,《诗经》是中国最早的诗歌总集,收录了西周初年至春秋中叶的诗歌,展现了古代人民的生活和情感。其优美的语言和深邃的思想,至今仍为人们所传颂和学习。另一部重要的古籍是 **《论语》**,其是儒家学派的经典之作,记录了孔子及其弟子的言行和思想。它强调仁爱、礼义等儒家核心价值观,对中国乃至东亚地区的文化和社会产生了深远的影响。此外,**《道德经》、《易经》** 等道家经典,以及 **《孙子兵法》、《战国策》** 等兵家著作,也都是中国古代文化古籍中的重要代表。
- **中国古诗**,蕴含着深厚的文化底蕴,闪耀着诗人的智慧与才情。以李白的 **《将进酒》** 为例,诗中“人生得意须尽欢,莫使金樽空对月”传达出豁达乐观的人生态度,激励着代代读者。这样的诗句,既是中国古诗的瑰宝,也是中华文化的骄傲。让我们共同欣赏、传承这些珍贵的文化遗产,感受中国古诗的无穷魅力。
- **中国成语**,其有固定的结构形式和固定的说法,表示一定的意义,在语句中是作为一个整体来应用的。成语有很大一部分是从古代相承沿用下来的,它代表了一个故事或者典故,有些成语本就是一个微型的句子。有些成语来自于历史事件,如“完璧归赵”、“负荆请罪”等,它们通过简短的形式,概括了整个故事的内容,使得人们可以更加方便地理解和记忆。有些成语则来自于文学作品,如“柳暗花明”、“刻舟求剑”等,这些成语通过形象的比喻,表达了深刻的道理。
**这就是我们做这个模型的初衷,我们想将中华文化教给大模型,让其能够尽可能掌握中华文化,做到文化输出。**
**ancient-chat-llm 古语说** 是一个能够在用户输入现代汉语后输出文言文,同时能够解答用户 **关于中国文化的问题** 的大模型,包括但不限于**唐诗、宋词、论语**等古籍,还可以让其**将文言文翻译成白话文**等,模型用 [xtuner](https://github.com/InternLM/xtuner) 在 [InternLM2](https://github.com/InternLM/InternLM) 的基础上指令微调而来。
**开源不易,如果本项目帮到大家,可以右上角帮我点个 star~ ⭐⭐ , 您的 star ⭐是我们最大的鼓励,谢谢各位!**
Demo 访问地址:
<!-- 演示 Start -->
<!-- 演示 END -->
模型对比:
## 模型
### 从 ModelScope 导入
[HinGwenWoong/ancient-chat-7b](https://modelscope.cn/models/HinGwenWoong/ancient-chat-7b)
```python
import torch
from modelscope import snapshot_download, AutoTokenizer, AutoModelForCausalLM
model_dir = snapshot_download('HinGwenWoong/ancient-chat-7b')
tokenizer = AutoTokenizer.from_pretrained(model_dir, device_map="auto", trust_remote_code=True)
# Set `torch_dtype=torch.float16` to load model in float16, otherwise it will be loaded as float32 and might cause OOM Error.
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, torch_dtype=torch.float16)
model = model.eval()
response, history = model.chat(tokenizer, "你好", history=[])
print(response)
response, history = model.chat(tokenizer, "李白简介", history=history)
print(response)
```
## 知识库
- [x] 文言文翻译
- [x] 成语
- [x] 论语
- [x] 唐诗
- [x] 宋词
- [x] 楚辞
- [x] 四书五经
- [x] 百家姓
- [x] 弟子规
- [ ] 史记
- [ ] 宫廷制度
- [ ] 二十四节气
- [ ] ...
## 环境搭建
本项目使用 [xtuner](https://github.com/InternLM/xtuner) 训练,在 [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) 上进行微调
1. clone 本项目
```bash
git clone https://github.com/PeterH0323/ancient-chat-llm.git
cd ancient-chat-llm
```
2. 创建虚拟环境
```bash
conda env create -f environment.yml
conda activate ancient-chat-llm
pip install -r requirements.txt
```
## 数据集准备
目前使用到的开源数据集有以下几个,我们还使用爬虫等技术进行爬取了其余知识库的数据集:
- 文言文https://huggingface.co/datasets/RUCAIBox/Erya-dataset/tree/main
- 古诗https://github.com/chinese-poetry/chinese-poetry
数据集结构(省略了用不到的文件):
```bash
dataset/
├── Erya-dataset
│ ├── dataset # 解压自 finetune.tgz
│ └── stage_2 # 解压自 trans.tgz
├── chinese-poetry
│ ├── 五代诗词
│ ├── 元曲
│ ├── 全唐诗
│ ├── 四书五经
│ ├── 宋词
│ ├── 幽梦影
│ ├── 御定全唐詩
│ ├── 曹操诗集
│ ├── 楚辞
│ ├── 水墨唐诗
│ ├── 纳兰性德
│ ├── 蒙学
│ ├── 论语
│ └── 诗经
├── gen_dataset.py
└── whoami.jsonl
```
使用脚本可以进行生成:
```bash
cd dataset
python gen_dataset.py --data-root=./dataset --output=data.jsonl
```
生成之后可以在 ${output} 设置的路径看到 jsonl 文件,样例如下:
```json
{
"conversation": [
{
"system": "你是一位专业的中文教师。你总能解答用户关于中文的相关知识。",
"input": "帮我翻译成文言文:“往西走一里,绕着新寺西边的峰嘴往北走。”",
"output": "西一里,转新寺西峰之嘴而北。"
}
]
},
{
"conversation": [
{
"system": "你是一位专业的中文教师。你总能解答用户关于中文的相关知识。",
"input": "背诵《将进酒》",
"output": "君不见黄河之水天上来,奔流到海不复回。\n君不見高堂明鏡悲白髮朝如青絲暮成雪。\n人生得意須盡歡莫使金樽空對月。\n天生我材必有用千金散盡還復來。\n烹羊宰牛且爲樂會須一飲三百盃。\n岑夫子丹丘生將進酒君莫停。\n與君歌一曲請君爲我側耳聽。\n鐘鼓饌玉不足貴但願長醉不願醒。\n古來聖賢皆寂寞惟有飲者留其名。\n陳王昔時宴平樂斗酒十千恣讙謔。\n主人何爲言少錢徑須沽取對君酌。\n五花馬千金裘呼兒將出換美酒與爾同銷萬古愁。"
}
]
},
...
```
## 训练
1. 训练之前,需要在 `xtuner` 代码中 `xtuner/xtuner/utils/templates.py` 添加 `SYSTEM_TEMPLATE.ancient_chat`
```diff
SYSTEM_TEMPLATE = ConfigDict(
moss_sft=('You are an AI assistant whose name is {bot_name}.\n'
'Capabilities and tools that {bot_name} can possess.\n'
'- Inner thoughts: enabled.\n'
'- Web search: enabled. API: Search(query)\n'
'- Calculator: enabled. API: Calculate(expression)\n'
'- Equation solver: enabled. API: Solve(equation)\n'
'- Text-to-image: disabled.\n'
'- Image edition: disabled.\n'
'- Text-to-speech: disabled.\n'),
alpaca=('Below is an instruction that describes a task. '
'Write a response that appropriately completes the request.\n'),
arxiv_gentile=('If you are an expert in writing papers, please generate '
"a good paper title for this paper based on other authors' "
'descriptions of their abstracts.\n'),
colorist=('You are a professional color designer. Please provide the '
'corresponding colors based on the description of Human.\n'),
coder=('You are a professional programer. Please provide the '
'corresponding code based on the description of Human.\n'),
lawyer='你现在是一名专业的中国律师,请根据用户的问题给出准确、有理有据的回复。\n',
medical='如果你是一名医生,请根据患者的描述回答医学问题。\n',
sql=('If you are an expert in SQL, please generate a good SQL Query '
'for Question based on the CREATE TABLE statement.\n'),
+ ancient_chat="你是一位专业的中文教师。你总能解答用户关于中文的相关知识。\n",
)
```
2.`./finetune_configs/internlm2_chat_7b/internlm2_chat_7b_qlora_custom_data_e3_finetune.py` 中 数据集路径 和 模型路径 改为您的本地路径
```diff
# Model
- pretrained_model_name_or_path = 'internlm/internlm2-7b'
+ pretrained_model_name_or_path = '/path/to/internlm/internlm2-7b' # 这步可选,如果事先下载好了模型可以直接使用绝对路径
# Data
- data_path = 'timdettmers/openassistant-guanaco'
+ data_path = '/path/to/data.jsonl' # 数据集步骤生成的 json 文件绝对路径
prompt_template = PROMPT_TEMPLATE.default
max_length = 2048
pack_to_max_length = True
```
3. 使用命令进行训练:
```bash
xtuner train finetune_configs/internlm2_chat_7b/internlm2_chat_7b_qlora_custom_data_e3_finetune.py --deepspeed deepspeed_zero2
```
注意:如果显存不够了,调小一点 `batch_size``max_length`,反之还剩很多,调大这两个值
## 部署
### Web 部署 Demo
1. 将 pth 转为 hf
```bash
xtuner convert pth_to_hf ./finetune_configs/internlm_chat_7b/internlm2_chat_7b_qlora_custom_data_e3_finetune.py \
./work_dirs/internlm2_chat_7b_qlora_custom_data_e3_finetune/epoch_3.pth \
./work_dirs/internlm2_chat_7b_qlora_custom_data_e3_finetune/epoch_3_hf
```
2. 将微调后的模型和源模型 merge 生成新的模型
```bash
export MKL_SERVICE_FORCE_INTEL=1 # 解决 Error: mkl-service + Intel(R) MKL: MKL_THREADING_LAYER=INTEL is incompatible with libgomp.so.1 library.
xtuner convert merge /path/to/internlm2-chat-7b \
./work_dirs/internlm2_chat_7b_qlora_custom_data_e3_finetune/epoch_3_hf \
./work_dirs/internlm2_chat_7b_qlora_custom_data_e3_finetune/epoch_3_merge
```
3. 启动 web demo
```bash
# web demo
python app.py
```
<!-- # 也可以直接使用命令行 cli 的方式进行启动
xtuner chat ./work_dirs/internlm2_chat_7b_qlora_custom_data_e3_finetune/epoch_3_merge \
--prompt-template internlm2_chat \
--system-template ancient_chat -->
### LMDeploy
TODO
## TODO
- [ ] 使用其它大模型进行数据集扩充
- [ ] 量化模型
## 后记
本模型在数据集方面的还没做很精细的调优,还有很多不足的地方,大家可以一起讨论,如果大家有数据集,可以在 issue 留言讨论。

BIN
assets/logo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 313 KiB

36
config.json Normal file
View File

@@ -0,0 +1,36 @@
{
"_name_or_path": "/root/share/model_repos/internlm2-chat-7b",
"architectures": [
"InternLM2ForCausalLM"
],
"attn_implementation": "eager",
"auto_map": {
"AutoConfig": "configuration_internlm.InternLMConfig",
"AutoModel": "modeling_internlm2.InternLM2ForCausalLM",
"AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM"
},
"bias": false,
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 14336,
"max_position_embeddings": 32768,
"model_type": "internlm",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"num_key_value_heads": 8,
"pad_token_id": 2,
"rms_norm_eps": 1e-05,
"rope_scaling": {
"factor": 2.0,
"type": "dynamic"
},
"rope_theta": 1000000,
"tie_word_embeddings": false,
"torch_dtype": "float16",
"transformers_version": "4.36.2",
"use_cache": true,
"vocab_size": 92544
}

1
configuration.json Normal file
View File

@@ -0,0 +1 @@
{"framework":"Pytorch","task":"text-generation"}

164
configuration_internlm.py Normal file
View File

@@ -0,0 +1,164 @@
# coding=utf-8
# Copyright (c) InternLM. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" InternLM model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
class InternLMConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate
an InternLM model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the InternLM-7B.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`InternLMModel`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer encoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details checkout [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
tie_word_embeddings(`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
Example:
```python
>>> from transformers import InternLMModel, InternLMConfig
>>> # Initializing a InternLM internlm-7b style configuration
>>> configuration = InternLMConfig()
>>> # Initializing a model from the internlm-7b style configuration
>>> model = InternLMModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "internlm"
_auto_class = "AutoConfig"
def __init__( # pylint: disable=W0102
self,
vocab_size=103168,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
tie_word_embeddings=False,
bias=True,
rope_theta=10000,
rope_scaling=None,
attn_implementation="eager",
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.bias = bias
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self._rope_scaling_validation()
self.attn_implementation = attn_implementation
if self.attn_implementation is None:
self.attn_implementation = "eager"
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
def _rope_scaling_validation(self):
"""
Validate the `rope_scaling` configuration.
"""
if self.rope_scaling is None:
return
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
raise ValueError(
"`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
f"got {self.rope_scaling}"
)
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
raise ValueError(
f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
)
if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")

7
generation_config.json Normal file
View File

@@ -0,0 +1,7 @@
{
"_from_model_config": true,
"bos_token_id": 1,
"eos_token_id": 2,
"pad_token_id": 2,
"transformers_version": "4.36.2"
}

1385
modeling_internlm2.py Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:886d975e566ff27cfbd7c286a95074bfa93b2f5e4e297c1942f4cb9413c016aa
size 1949342245

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:021c9425c818adbf2d3a7eebe4959294bf038cb95255e4abf35aff4e7c10e11d
size 1946250273

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cb7c01e7e8246217cf9c21a58525fa52bc775634998e2b27daf9db7c9f9a3354
size 1979787307

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:959f5e1fb41146b719a960f88ac400fcfec7977a2ddb965384259c62a7055eb3
size 1946250337

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:10118b0633238380e992d56c8f5879c4c180f87dbaf8248e056386a8546f195d
size 1979787371

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f749c5850a6989548fa5b3351cc62b88580cc1bc043eb93b1717bef43694a61b
size 1946250337

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:89abbd520a3e28d5ce703edb3a7ae21dbf66a013e000805995be899cbebf3eca
size 1979787371

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c533a08c6e57e2273810ce89608caed6694108255e5ed91006bcbc4d46857917
size 1748040229

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a8a1efb6998624330a0564f9bba63eb8ccae0ad54a6d0176c64f2eb30721f2b5
size 18179

30
special_tokens_map.json Normal file
View File

@@ -0,0 +1,30 @@
{
"bos_token": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"unk_token": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

240
tokenization_internlm.py Normal file
View File

@@ -0,0 +1,240 @@
# coding=utf-8
# Copyright (c) InternLM. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for IntermLM."""
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
PRETRAINED_VOCAB_FILES_MAP = {}
class InternLMTokenizer(PreTrainedTokenizer):
"""
Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding.
Args:
vocab_file (`str`):
Path to the vocabulary file.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
_auto_class = "AutoTokenizer"
def __init__(
self,
vocab_file,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
pad_token="</s>",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
add_bos_token=True,
add_eos_token=False,
decode_with_prefix_space=False,
clean_up_tokenization_spaces=False,
**kwargs,
):
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.vocab_file = vocab_file
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
self.decode_with_prefix_space = decode_with_prefix_space
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)
self._no_prefix_space_tokens = None
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)
""" Initialization"""
@property
def no_prefix_space_tokens(self):
if self._no_prefix_space_tokens is None:
vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("")}
return self._no_prefix_space_tokens
@property
def vocab_size(self):
"""Returns vocab size"""
return self.sp_model.get_piece_size()
@property
def bos_token_id(self) -> Optional[int]:
return self.sp_model.bos_id()
@property
def eos_token_id(self) -> Optional[int]:
return self.sp_model.eos_id()
def get_vocab(self):
"""Returns vocab as a dict"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text):
"""Returns a tokenized string."""
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
token = self.sp_model.IdToPiece(index)
return token
def _maybe_add_prefix_space(self, tokens, decoded):
if tokens and tokens[0] not in self.no_prefix_space_tokens:
return " " + decoded
else:
return decoded
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
prev_is_special = False
for token in tokens:
# make sure that special tokens are not decoded using sentencepiece model
if token in self.all_special_tokens:
if not prev_is_special:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
out_string = self.clean_up_tokenization(out_string)
out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
return out_string[1:]
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
if self.add_bos_token:
bos_token_ids = [self.bos_token_id]
else:
bos_token_ids = []
output = bos_token_ids + token_ids_0
if token_ids_1 is not None:
output = output + token_ids_1
if self.add_eos_token:
output = output + [self.eos_token_id]
return output
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
eos = [self.eos_token_id]
if token_ids_1 is None:
return len(token_ids_0 + eos) * [0]
return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

3
tokenizer.model Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b
size 1477754

90
tokenizer_config.json Normal file
View File

@@ -0,0 +1,90 @@
{
"added_tokens_decoder": {
"0": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"1": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"2": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"92538": {
"content": "<|plugin|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"92539": {
"content": "<|interpreter|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"92540": {
"content": "<|action_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"92541": {
"content": "<|action_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"92542": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"92543": {
"content": "<|im_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"auto_map": {
"AutoTokenizer": [
"tokenization_internlm.InternLMTokenizer",
null
]
},
"bos_token": "<s>",
"chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
"clean_up_tokenization_spaces": false,
"eos_token": "</s>",
"model_max_length": 1000000000000000019884624838656,
"pad_token": "</s>",
"tokenizer_class": "InternLMTokenizer",
"unk_token": "<unk>"
}