初始化项目,由ModelHub XC社区提供模型
Model: ystemsrx/Qwen2-Boundless Source: Original Platform
This commit is contained in:
35
.gitattributes
vendored
Normal file
35
.gitattributes
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||
*.model filter=lfs diff=lfs merge=lfs -text
|
||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
244
README.md
Normal file
244
README.md
Normal file
@@ -0,0 +1,244 @@
|
||||
---
|
||||
license: apache-2.0
|
||||
datasets:
|
||||
- ystemsrx/Bad_Data_Alpaca
|
||||
language:
|
||||
- zh
|
||||
library_name: transformers
|
||||
pipeline_tag: text-generation
|
||||
tags:
|
||||
- Qwen
|
||||
- causal-lm
|
||||
- fine-tuned
|
||||
- ethics
|
||||
- Chinese
|
||||
---
|
||||
|
||||
[中文](README.zh.md)
|
||||
|
||||
# Qwen2-Boundless
|
||||
|
||||
## Overview
|
||||
|
||||
Qwen2-Boundless is a fine-tuned model based on Qwen2-1.5B-Instruct, designed to answer various types of questions, including those involving ethical, illegal, pornographic, and violent content. This model has been specifically trained on a dataset that allows it to handle complex and diverse scenarios. It is important to note that the fine-tuning dataset is entirely in Chinese, so the model performs better in Chinese.
|
||||
|
||||
> **Warning**: This model is intended for research and testing purposes only. Users should comply with local laws and regulations and are responsible for their actions.
|
||||
|
||||
## How to Use
|
||||
|
||||
You can load and use the model with the following code:
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
import os
|
||||
|
||||
# Load the checkpoint that sits in the same directory as this script.
current_directory = os.path.dirname(os.path.abspath(__file__))

model = AutoModelForCausalLM.from_pretrained(
    current_directory,
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(current_directory)

prompt = "Hello?"
messages = [
    {"role": "system", "content": ""},
    {"role": "user", "content": prompt},
]
# Render the conversation to a prompt string using the tokenizer's chat template.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
# device_map="auto" decides where the weights are placed, so move the inputs
# to model.device rather than to a hard-coded "cuda" (which breaks on hosts
# without a GPU).
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Unpack the whole encoding so the attention mask is passed along with the ids.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512,
)
# Strip the first len(input_ids) tokens (the prompt) from each output sequence.
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
|
||||
```
|
||||
|
||||
### Continuous Conversation
|
||||
|
||||
To enable continuous conversation, use the following code:
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
import torch
|
||||
import os
|
||||
|
||||
# Get the current script's directory
current_directory = os.path.dirname(os.path.abspath(__file__))

model = AutoModelForCausalLM.from_pretrained(
    current_directory,
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(current_directory)

# The conversation starts with an empty system prompt and grows every turn.
messages = [
    {"role": "system", "content": ""},
]

while True:
    # Get user input; Ctrl-C or end-of-input ends the chat without a traceback.
    try:
        user_input = input("User: ")
    except (EOFError, KeyboardInterrupt):
        print()
        break

    # Add user input to the conversation
    messages.append({"role": "user", "content": user_input})

    # Prepare the input text
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # device_map="auto" decides where the weights are placed, so move the
    # inputs to model.device rather than to a hard-coded "cuda".
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Generate a response; unpacking the encoding also passes the attention mask.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512,
    )
    # Strip the first len(input_ids) tokens (the prompt) from each output sequence.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # Decode and print the response
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(f"Assistant: {response}")

    # Add the generated response to the conversation
    messages.append({"role": "assistant", "content": response})
|
||||
```
|
||||
|
||||
### Streaming Response
|
||||
|
||||
For applications requiring streaming responses, use the following code:
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
||||
from transformers.trainer_utils import set_seed
|
||||
from threading import Thread
|
||||
import random
|
||||
import os
|
||||
|
||||
DEFAULT_CKPT_PATH = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
def _load_model_tokenizer(checkpoint_path, cpu_only):
    """Load the tokenizer and causal LM found at ``checkpoint_path``.

    Args:
        checkpoint_path: Location passed to both ``from_pretrained`` calls.
        cpu_only: When true the model is loaded with ``device_map="cpu"``;
            otherwise ``device_map="auto"`` is used.

    Returns:
        A ``(model, tokenizer)`` tuple. ``eval()`` has been called on the model
        and its generation config is capped at 512 new tokens.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        checkpoint_path,
        resume_download=True,
    )

    load_kwargs = {
        "torch_dtype": "auto",
        "device_map": "cpu" if cpu_only else "auto",
        "resume_download": True,
    }
    model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **load_kwargs)
    model = model.eval()

    # Upper bound on reply length for chat.
    model.generation_config.max_new_tokens = 512

    return model, tokenizer
|
||||
|
||||
def _get_input() -> str:
    """Prompt on stdin until the user enters a non-empty message.

    Returns:
        The entered line with surrounding whitespace stripped.

    Raises:
        SystemExit: With status 1 when interrupted by Ctrl-C, or status 0
            when stdin reaches end-of-file (Ctrl-D or exhausted piped input).
    """
    # Local import keeps this function independent of the file's import list.
    import sys

    while True:
        try:
            message = input('User: ').strip()
        except UnicodeDecodeError:
            print('[ERROR] Encoding error in input')
            continue
        except KeyboardInterrupt:
            # sys.exit is always available, whereas the bare exit() helper is
            # injected by the site module and may be absent.
            sys.exit(1)
        except EOFError:
            # No further input can arrive, so re-prompting would fail forever.
            sys.exit(0)
        if message:
            return message
        print('[ERROR] Query is empty')
|
||||
|
||||
def _chat_stream(model, tokenizer, query, history):
    """Yield the assistant's reply to ``query`` piece by piece.

    Args:
        model: Model whose ``generate`` method is run on a background thread.
        tokenizer: Tokenizer that applies the chat template and backs the streamer.
        query: The new user message.
        history: Iterable of ``(user_text, assistant_text)`` pairs from
            earlier turns, replayed before ``query``.

    Yields:
        Text chunks as the streamer produces them.
    """
    # Rebuild the dialogue: empty system prompt, past turns, then the new query.
    turns = [{'role': 'system', 'content': ''}]
    for past_query, past_reply in history:
        turns.extend((
            {'role': 'user', 'content': past_query},
            {'role': 'assistant', 'content': past_reply},
        ))
    turns.append({'role': 'user', 'content': query})

    prompt_ids = tokenizer.apply_chat_template(
        turns,
        add_generation_prompt=True,
        return_tensors='pt',
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer=tokenizer,
        skip_prompt=True,
        timeout=60.0,
        skip_special_tokens=True,
    )

    # Generation runs on a worker thread while this generator drains the streamer.
    worker = Thread(
        target=model.generate,
        kwargs={'input_ids': prompt_ids, 'streamer': streamer},
    )
    worker.start()

    yield from streamer
|
||||
|
||||
def main():
    """Run an interactive, streaming chat loop against the default checkpoint.

    Each completed exchange is stored as a ``(query, reply)`` pair and replayed
    on the next turn. A turn interrupted with Ctrl-C is not stored.
    """
    # Seed with a fresh random value on every run.
    set_seed(random.randint(0, 2**32 - 1))

    # Second argument is cpu_only.
    model, tokenizer = _load_model_tokenizer(DEFAULT_CKPT_PATH, False)
    history = []

    while True:
        query = _get_input()

        print(f"\nUser: {query}")
        print("\nAssistant: ", end="")
        try:
            pieces = []
            for piece in _chat_stream(model, tokenizer, query, history):
                # Flush so each chunk appears as soon as it arrives.
                print(piece, end='', flush=True)
                pieces.append(piece)
            print()
            history.append((query, ''.join(pieces)))
        except KeyboardInterrupt:
            print('Generation interrupted')


if __name__ == "__main__":
    main()
|
||||
```
|
||||
|
||||
## Dataset
|
||||
|
||||
The Qwen2-Boundless model was fine-tuned using a specific dataset named `bad_data.json`, which includes a wide range of text content covering topics related to ethics, law, pornography, and violence. The fine-tuning dataset is entirely in Chinese, so the model performs better in Chinese. If you are interested in exploring or using this dataset, you can find it via the following link:
|
||||
|
||||
- [bad_data.json Dataset](https://huggingface.co/datasets/ystemsrx/Bad_Data_Alpaca)
|
||||
|
||||
We also used some cybersecurity-related data that was cleaned and organized from [this file](https://github.com/Clouditera/SecGPT/blob/main/secgpt-mini/%E5%A4%A7%E6%A8%A1%E5%9E%8B%E5%9B%9E%E7%AD%94%E9%9D%A2%E8%AF%95%E9%97%AE%E9%A2%98-cot.txt).
|
||||
|
||||
## GitHub Repository
|
||||
|
||||
For more details about the model and ongoing updates, please visit our GitHub repository:
|
||||
|
||||
- [GitHub: ystemsrx/Qwen2-Boundless](https://github.com/ystemsrx/Qwen2-Boundless)
|
||||
|
||||
## License
|
||||
|
||||
This model and dataset are open-sourced under the Apache 2.0 License.
|
||||
|
||||
## Disclaimer
|
||||
|
||||
All content provided by this model is for research and testing purposes only. The developers of this model are not responsible for any potential misuse. Users should comply with relevant laws and regulations and are solely responsible for their actions.
|
||||
224
README.zh.md
Normal file
224
README.zh.md
Normal file
@@ -0,0 +1,224 @@
|
||||
[English](README.md)
|
||||
|
||||
# Qwen2-Boundless
|
||||
|
||||
## 简介
|
||||
|
||||
Qwen2-Boundless 是一个基于 Qwen2-1.5B-Instruct 微调的模型,专为回答各种类型的问题而设计,无论是道德的、违法的、色情的、暴力的内容,均可自由询问。该模型经过特殊的数据集训练,能够应对复杂和多样的场景。需要注意的是,微调数据集全部为中文,因此模型在处理中文时表现更佳。
|
||||
|
||||
> **警告**:本模型仅用于研究和测试目的,用户应遵循当地法律法规,并对自己的行为负责。
|
||||
|
||||
## 模型使用
|
||||
|
||||
你可以通过以下代码加载并使用该模型:
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
import os
|
||||
|
||||
# Load the checkpoint that sits in the same directory as this script.
current_directory = os.path.dirname(os.path.abspath(__file__))

model = AutoModelForCausalLM.from_pretrained(
    current_directory,
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(current_directory)

prompt = "Hello?"
messages = [
    {"role": "system", "content": ""},
    {"role": "user", "content": prompt},
]
# Render the conversation to a prompt string using the tokenizer's chat template.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
# device_map="auto" decides where the weights are placed, so move the inputs
# to model.device rather than to a hard-coded "cuda" (which breaks on hosts
# without a GPU).
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Unpack the whole encoding so the attention mask is passed along with the ids.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512,
)
# Strip the first len(input_ids) tokens (the prompt) from each output sequence.
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
|
||||
```
|
||||
|
||||
### 连续对话
|
||||
|
||||
要实现连续对话,可以使用以下代码:
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
import torch
|
||||
import os
|
||||
|
||||
# Get the directory that contains the current script
current_directory = os.path.dirname(os.path.abspath(__file__))

model = AutoModelForCausalLM.from_pretrained(
    current_directory,
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(current_directory)

# The conversation starts with an empty system prompt and grows every turn.
messages = [
    {"role": "system", "content": ""},
]

while True:
    # Get user input; Ctrl-C or end-of-input ends the chat without a traceback.
    try:
        user_input = input("User: ")
    except (EOFError, KeyboardInterrupt):
        print()
        break

    # Add the user input to the conversation
    messages.append({"role": "user", "content": user_input})

    # Prepare the input text
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # device_map="auto" decides where the weights are placed, so move the
    # inputs to model.device rather than to a hard-coded "cuda".
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Generate a response; unpacking the encoding also passes the attention mask.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512,
    )
    # Strip the first len(input_ids) tokens (the prompt) from each output sequence.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # Decode and print the response
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(f"Assistant: {response}")

    # Add the generated response to the conversation
    messages.append({"role": "assistant", "content": response})
|
||||
```
|
||||
|
||||
### 流式响应
|
||||
|
||||
对于需要流式响应的应用,使用以下代码:
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
||||
from transformers.trainer_utils import set_seed
|
||||
from threading import Thread
|
||||
import random
|
||||
import os
|
||||
|
||||
DEFAULT_CKPT_PATH = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
def _load_model_tokenizer(checkpoint_path, cpu_only):
    """Load the tokenizer and causal LM found at ``checkpoint_path``.

    Args:
        checkpoint_path: Location passed to both ``from_pretrained`` calls.
        cpu_only: When true the model is loaded with ``device_map="cpu"``;
            otherwise ``device_map="auto"`` is used.

    Returns:
        A ``(model, tokenizer)`` tuple. ``eval()`` has been called on the model
        and its generation config is capped at 512 new tokens.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        checkpoint_path,
        resume_download=True,
    )

    load_kwargs = {
        "torch_dtype": "auto",
        "device_map": "cpu" if cpu_only else "auto",
        "resume_download": True,
    }
    model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **load_kwargs)
    model = model.eval()

    # Upper bound on reply length for chat.
    model.generation_config.max_new_tokens = 512

    return model, tokenizer
|
||||
|
||||
def _get_input() -> str:
    """Prompt on stdin until the user enters a non-empty message.

    Returns:
        The entered line with surrounding whitespace stripped.

    Raises:
        SystemExit: With status 1 when interrupted by Ctrl-C, or status 0
            when stdin reaches end-of-file (Ctrl-D or exhausted piped input).
    """
    # Local import keeps this function independent of the file's import list.
    import sys

    while True:
        try:
            message = input('User: ').strip()
        except UnicodeDecodeError:
            print('[ERROR] Encoding error in input')
            continue
        except KeyboardInterrupt:
            # sys.exit is always available, whereas the bare exit() helper is
            # injected by the site module and may be absent.
            sys.exit(1)
        except EOFError:
            # No further input can arrive, so re-prompting would fail forever.
            sys.exit(0)
        if message:
            return message
        print('[ERROR] Query is empty')
|
||||
|
||||
def _chat_stream(model, tokenizer, query, history):
    """Yield the assistant's reply to ``query`` piece by piece.

    Args:
        model: Model whose ``generate`` method is run on a background thread.
        tokenizer: Tokenizer that applies the chat template and backs the streamer.
        query: The new user message.
        history: Iterable of ``(user_text, assistant_text)`` pairs from
            earlier turns, replayed before ``query``.

    Yields:
        Text chunks as the streamer produces them.
    """
    # Rebuild the dialogue: empty system prompt, past turns, then the new query.
    turns = [{'role': 'system', 'content': ''}]
    for past_query, past_reply in history:
        turns.extend((
            {'role': 'user', 'content': past_query},
            {'role': 'assistant', 'content': past_reply},
        ))
    turns.append({'role': 'user', 'content': query})

    prompt_ids = tokenizer.apply_chat_template(
        turns,
        add_generation_prompt=True,
        return_tensors='pt',
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer=tokenizer,
        skip_prompt=True,
        timeout=60.0,
        skip_special_tokens=True,
    )

    # Generation runs on a worker thread while this generator drains the streamer.
    worker = Thread(
        target=model.generate,
        kwargs={'input_ids': prompt_ids, 'streamer': streamer},
    )
    worker.start()

    yield from streamer
|
||||
|
||||
def main():
    """Run an interactive, streaming chat loop against the default checkpoint.

    Each completed exchange is stored as a ``(query, reply)`` pair and replayed
    on the next turn. A turn interrupted with Ctrl-C is not stored.
    """
    # Seed with a fresh random value on every run.
    set_seed(random.randint(0, 2**32 - 1))

    # Second argument is cpu_only.
    model, tokenizer = _load_model_tokenizer(DEFAULT_CKPT_PATH, False)
    history = []

    while True:
        query = _get_input()

        print(f"\nUser: {query}")
        print("\nAssistant: ", end="")
        try:
            pieces = []
            for piece in _chat_stream(model, tokenizer, query, history):
                # Flush so each chunk appears as soon as it arrives.
                print(piece, end='', flush=True)
                pieces.append(piece)
            print()
            history.append((query, ''.join(pieces)))
        except KeyboardInterrupt:
            print('Generation interrupted')


if __name__ == "__main__":
    main()
|
||||
```
|
||||
|
||||
## 数据集
|
||||
|
||||
Qwen2-Boundless 模型使用了特殊的 `bad_data.json` 数据集进行微调,该数据集包含了广泛的文本内容,涵盖道德、法律、色情及暴力等主题。由于微调数据集全部为中文,因此模型在处理中文时表现更佳。如果你有兴趣了解或使用该数据集,可以通过以下链接获取:
|
||||
|
||||
- [bad_data.json 数据集](https://huggingface.co/datasets/ystemsrx/Bad_Data_Alpaca)
|
||||
|
||||
同时我们也从 [这个文件](https://github.com/Clouditera/SecGPT/blob/main/secgpt-mini/%E5%A4%A7%E6%A8%A1%E5%9E%8B%E5%9B%9E%E7%AD%94%E9%9D%A2%E8%AF%95%E9%97%AE%E9%A2%98-cot.txt) 中整理、清洗出一部分与网络安全相关的数据进行训练。
|
||||
|
||||
## GitHub 仓库
|
||||
|
||||
更多关于该模型的细节以及持续更新,请访问我们的 GitHub 仓库:
|
||||
|
||||
- [GitHub: ystemsrx/Qwen2-Boundless](https://github.com/ystemsrx/Qwen2-Boundless)
|
||||
|
||||
## 声明
|
||||
|
||||
本模型提供的所有内容仅用于研究和测试目的,模型开发者不对任何可能的滥用行为负责。使用者应遵循相关法律法规,并承担因使用本模型而产生的所有责任。
|
||||
5
added_tokens.json
Normal file
5
added_tokens.json
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"<|endoftext|>": 151643,
|
||||
"<|im_end|>": 151645,
|
||||
"<|im_start|>": 151644
|
||||
}
|
||||
28
config.json
Normal file
28
config.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"_name_or_path": "Qwen2-1.5B-Instruct",
|
||||
"architectures": [
|
||||
"Qwen2ForCausalLM"
|
||||
],
|
||||
"attention_dropout": 0.0,
|
||||
"bos_token_id": 151643,
|
||||
"eos_token_id": 151645,
|
||||
"hidden_act": "silu",
|
||||
"hidden_size": 1536,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 8960,
|
||||
"max_position_embeddings": 32768,
|
||||
"max_window_layers": 28,
|
||||
"model_type": "qwen2",
|
||||
"num_attention_heads": 12,
|
||||
"num_hidden_layers": 28,
|
||||
"num_key_value_heads": 2,
|
||||
"rms_norm_eps": 1e-06,
|
||||
"rope_theta": 1000000.0,
|
||||
"sliding_window": 32768,
|
||||
"tie_word_embeddings": true,
|
||||
"torch_dtype": "bfloat16",
|
||||
"transformers_version": "4.42.3",
|
||||
"use_cache": true,
|
||||
"use_sliding_window": false,
|
||||
"vocab_size": 151936
|
||||
}
|
||||
14
generation_config.json
Normal file
14
generation_config.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"bos_token_id": 151643,
|
||||
"do_sample": true,
|
||||
"eos_token_id": [
|
||||
151645,
|
||||
151643
|
||||
],
|
||||
"pad_token_id": 151643,
|
||||
"repetition_penalty": 1.1,
|
||||
"temperature": 0.5,
|
||||
"top_k": 40,
|
||||
"top_p": 0.7,
|
||||
"transformers_version": "4.42.3"
|
||||
}
|
||||
151388
merges.txt
Normal file
151388
merges.txt
Normal file
File diff suppressed because it is too large
Load Diff
3
model.safetensors
Normal file
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:2cabec32206161cab6f703992dd1ad322a8b6526db7ee37a4b629e57f8ff5128
|
||||
size 3087467144
|
||||
20
special_tokens_map.json
Normal file
20
special_tokens_map.json
Normal file
@@ -0,0 +1,20 @@
|
||||
{
|
||||
"additional_special_tokens": [
|
||||
"<|im_start|>",
|
||||
"<|im_end|>"
|
||||
],
|
||||
"eos_token": {
|
||||
"content": "<|im_end|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"pad_token": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
}
|
||||
}
|
||||
303112
tokenizer.json
Normal file
303112
tokenizer.json
Normal file
File diff suppressed because it is too large
Load Diff
44
tokenizer_config.json
Normal file
44
tokenizer_config.json
Normal file
@@ -0,0 +1,44 @@
|
||||
{
|
||||
"add_prefix_space": false,
|
||||
"added_tokens_decoder": {
|
||||
"151643": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151644": {
|
||||
"content": "<|im_start|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151645": {
|
||||
"content": "<|im_end|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
}
|
||||
},
|
||||
"additional_special_tokens": [
|
||||
"<|im_start|>",
|
||||
"<|im_end|>"
|
||||
],
|
||||
"bos_token": null,
|
||||
"chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
|
||||
"clean_up_tokenization_spaces": false,
|
||||
"eos_token": "<|im_end|>",
|
||||
"errors": "replace",
|
||||
"model_max_length": 32768,
|
||||
"pad_token": "<|endoftext|>",
|
||||
"padding_side": "left",
|
||||
"split_special_tokens": false,
|
||||
"tokenizer_class": "Qwen2Tokenizer",
|
||||
"unk_token": null
|
||||
}
|
||||
1
vocab.json
Normal file
1
vocab.json
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user