初始化项目,由ModelHub XC社区提供模型

Model: ystemsrx/Qwen2-Boundless
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-06-07 21:55:22 +08:00
commit d4b9cd5a89
12 changed files with 455118 additions and 0 deletions

35
.gitattributes vendored Normal file
View File

@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

244
README.md Normal file
View File

@@ -0,0 +1,244 @@
---
license: apache-2.0
datasets:
- ystemsrx/Bad_Data_Alpaca
language:
- zh
library_name: transformers
pipeline_tag: text-generation
tags:
- Qwen
- causal-lm
- fine-tuned
- ethics
- Chinese
---
[中文](README.zh.md)
# Qwen2-Boundless
## Overview
Qwen2-Boundless is a fine-tuned model based on Qwen2-1.5B-Instruct, designed to answer various types of questions, including those involving ethical, illegal, pornographic, and violent content. This model has been specifically trained on a dataset that allows it to handle complex and diverse scenarios. It is important to note that the fine-tuning dataset is entirely in Chinese, so the model performs better in Chinese.
> **Warning**: This model is intended for research and testing purposes only. Users should comply with local laws and regulations and are responsible for their actions.
## How to Use
You can load and use the model with the following code:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

# Load the weights and tokenizer from the directory that contains this script.
current_directory = os.path.dirname(os.path.abspath(__file__))
model = AutoModelForCausalLM.from_pretrained(
    current_directory,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(current_directory)

prompt = "Hello?"
messages = [
    {"role": "system", "content": ""},
    {"role": "user", "content": prompt}
]
# Render the conversation into the chat format as plain text.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
# The model is placed by device_map="auto", so follow model.device instead of
# assuming that CUDA is available.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Unpacking the whole encoding forwards the attention mask with input_ids.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
# Drop the prompt tokens from each output so only the completion is decoded.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
```
### Continuous Conversation
To enable continuous conversation, use the following code:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os

# Get the current script's directory
current_directory = os.path.dirname(os.path.abspath(__file__))
model = AutoModelForCausalLM.from_pretrained(
    current_directory,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(current_directory)

# The conversation starts with an empty system prompt and grows every turn.
messages = [
    {"role": "system", "content": ""}
]

while True:
    # Get user input
    user_input = input("User: ")

    # Add user input to the conversation
    messages.append({"role": "user", "content": user_input})

    # Prepare the input text
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # The model is placed by device_map="auto", so follow model.device
    # instead of assuming that CUDA is available.
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Generate a response; unpacking the encoding forwards the attention mask
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512
    )
    # Drop the prompt tokens so only the new completion is decoded
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # Decode and print the response
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(f"Assistant: {response}")

    # Add the generated response to the conversation
    messages.append({"role": "assistant", "content": response})
```
### Streaming Response
For applications requiring streaming responses, use the following code:
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from transformers.trainer_utils import set_seed
from threading import Thread
import random
import os
DEFAULT_CKPT_PATH = os.path.dirname(os.path.abspath(__file__))
def _load_model_tokenizer(checkpoint_path, cpu_only):
    """Load the tokenizer and the model from ``checkpoint_path``.

    Args:
        checkpoint_path: Location passed to both ``from_pretrained`` calls.
        cpu_only: When true the model is loaded with ``device_map="cpu"``;
            otherwise ``device_map="auto"`` is used.

    Returns:
        A ``(model, tokenizer)`` tuple. ``eval()`` has been called on the
        model and its generation config is set to 512 max new tokens.
    """
    # `resume_download` is deliberately not passed: the flag is deprecated
    # and interrupted downloads are resumed by default.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
    device_map = "cpu" if cpu_only else "auto"
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint_path,
        torch_dtype="auto",
        device_map=device_map,
    ).eval()
    model.generation_config.max_new_tokens = 512  # For chat.
    return model, tokenizer
def _get_input() -> str:
    """Prompt on stdin until a non-empty message is entered.

    Returns:
        The entered line with surrounding whitespace stripped.

    Raises:
        SystemExit: With status 1 when the user presses Ctrl-C or when
            stdin reaches end-of-file.
    """
    while True:
        try:
            message = input('User: ').strip()
        except UnicodeDecodeError:
            print('[ERROR] Encoding error in input')
            continue
        except (KeyboardInterrupt, EOFError):
            # `exit` is injected by the `site` module and may be absent, so
            # raise SystemExit directly. EOF is handled as well so a closed
            # or piped stdin ends the program without a traceback.
            raise SystemExit(1)
        if message:
            return message
        print('[ERROR] Query is empty')
def _chat_stream(model, tokenizer, query, history):
    """Yield the assistant's reply to ``query`` fragment by fragment.

    Args:
        model: Model whose ``generate`` method runs in a background thread.
        tokenizer: Tokenizer used for the chat template and by the streamer.
        query: The new user message.
        history: Iterable of ``(user_message, assistant_reply)`` pairs from
            earlier turns.

    Yields:
        Text fragments in the order the streamer produces them.
    """
    conversation = [
        {'role': 'system', 'content': ''},
    ]
    for query_h, response_h in history:
        conversation.append({'role': 'user', 'content': query_h})
        conversation.append({'role': 'assistant', 'content': response_h})
    conversation.append({'role': 'user', 'content': query})
    # Render to text first and tokenize separately so the encoding carries an
    # attention mask that can be forwarded to generate().
    text = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=False,
    )
    inputs = tokenizer([text], return_tensors='pt').to(model.device)
    streamer = TextIteratorStreamer(tokenizer=tokenizer, skip_prompt=True, timeout=60.0, skip_special_tokens=True)
    generation_kwargs = dict(**inputs, streamer=streamer)
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    for new_text in streamer:
        yield new_text
    # The streamer is exhausted at this point, so reap the worker thread.
    thread.join()
def main():
    """Run an interactive streaming chat loop in the terminal."""
    # Seed with a fresh random value before the model is loaded.
    set_seed(random.randint(0, 2**32 - 1))

    model, tokenizer = _load_model_tokenizer(DEFAULT_CKPT_PATH, False)
    history = []

    while True:
        query = _get_input()
        print(f"\nUser: {query}")
        print("\nAssistant: ", end="")
        try:
            pieces = []
            for piece in _chat_stream(model, tokenizer, query, history):
                print(piece, end='', flush=True)
                pieces.append(piece)
            print()
            # Only completed turns are recorded in the history.
            history.append((query, ''.join(pieces)))
        except KeyboardInterrupt:
            print('Generation interrupted')
            continue


if __name__ == "__main__":
    main()
```
## Dataset
The Qwen2-Boundless model was fine-tuned using a specific dataset named `bad_data.json`, which includes a wide range of text content covering topics related to ethics, law, pornography, and violence. The fine-tuning dataset is entirely in Chinese, so the model performs better in Chinese. If you are interested in exploring or using this dataset, you can find it via the following link:
- [bad_data.json Dataset](https://huggingface.co/datasets/ystemsrx/Bad_Data_Alpaca)
We also used some cybersecurity-related data that was cleaned and organized from [this file](https://github.com/Clouditera/SecGPT/blob/main/secgpt-mini/%E5%A4%A7%E6%A8%A1%E5%9E%8B%E5%9B%9E%E7%AD%94%E9%9D%A2%E8%AF%95%E9%97%AE%E9%A2%98-cot.txt).
## GitHub Repository
For more details about the model and ongoing updates, please visit our GitHub repository:
- [GitHub: ystemsrx/Qwen2-Boundless](https://github.com/ystemsrx/Qwen2-Boundless)
## License
This model and dataset are open-sourced under the Apache 2.0 License.
## Disclaimer
All content provided by this model is for research and testing purposes only. The developers of this model are not responsible for any potential misuse. Users should comply with relevant laws and regulations and are solely responsible for their actions.

224
README.zh.md Normal file
View File

@@ -0,0 +1,224 @@
[English](README.md)
# Qwen2-Boundless
## 简介
Qwen2-Boundless 是一个基于 Qwen2-1.5B-Instruct 微调的模型,专为回答各种类型的问题而设计,无论是道德的、违法的、色情的、暴力的内容,均可自由询问。该模型经过特殊的数据集训练,能够应对复杂和多样的场景。需要注意的是,微调数据集全部为中文,因此模型在处理中文时表现更佳。
> **警告**:本模型仅用于研究和测试目的,用户应遵循当地法律法规,并对自己的行为负责。
## 模型使用
你可以通过以下代码加载并使用该模型:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

# Load the weights and tokenizer from the directory that contains this script.
current_directory = os.path.dirname(os.path.abspath(__file__))
model = AutoModelForCausalLM.from_pretrained(
    current_directory,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(current_directory)

prompt = "Hello?"
messages = [
    {"role": "system", "content": ""},
    {"role": "user", "content": prompt}
]
# Render the conversation into the chat format as plain text.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
# The model is placed by device_map="auto", so follow model.device instead of
# assuming that CUDA is available.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Unpacking the whole encoding forwards the attention mask with input_ids.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
# Drop the prompt tokens from each output so only the completion is decoded.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
```
### 连续对话
要实现连续对话,可以使用以下代码:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os

# Get the current script's directory
current_directory = os.path.dirname(os.path.abspath(__file__))
model = AutoModelForCausalLM.from_pretrained(
    current_directory,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(current_directory)

# The conversation starts with an empty system prompt and grows every turn.
messages = [
    {"role": "system", "content": ""}
]

while True:
    # Get user input
    user_input = input("User: ")

    # Add user input to the conversation
    messages.append({"role": "user", "content": user_input})

    # Prepare the input text
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # The model is placed by device_map="auto", so follow model.device
    # instead of assuming that CUDA is available.
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Generate a response; unpacking the encoding forwards the attention mask
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512
    )
    # Drop the prompt tokens so only the new completion is decoded
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # Decode and print the response
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(f"Assistant: {response}")

    # Add the generated response to the conversation
    messages.append({"role": "assistant", "content": response})
```
### 流式响应
对于需要流式响应的应用,使用以下代码:
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from transformers.trainer_utils import set_seed
from threading import Thread
import random
import os
DEFAULT_CKPT_PATH = os.path.dirname(os.path.abspath(__file__))
def _load_model_tokenizer(checkpoint_path, cpu_only):
    """Load the tokenizer and the model from ``checkpoint_path``.

    Args:
        checkpoint_path: Location passed to both ``from_pretrained`` calls.
        cpu_only: When true the model is loaded with ``device_map="cpu"``;
            otherwise ``device_map="auto"`` is used.

    Returns:
        A ``(model, tokenizer)`` tuple. ``eval()`` has been called on the
        model and its generation config is set to 512 max new tokens.
    """
    # `resume_download` is deliberately not passed: the flag is deprecated
    # and interrupted downloads are resumed by default.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
    device_map = "cpu" if cpu_only else "auto"
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint_path,
        torch_dtype="auto",
        device_map=device_map,
    ).eval()
    model.generation_config.max_new_tokens = 512  # For chat.
    return model, tokenizer
def _get_input() -> str:
    """Prompt on stdin until a non-empty message is entered.

    Returns:
        The entered line with surrounding whitespace stripped.

    Raises:
        SystemExit: With status 1 when the user presses Ctrl-C or when
            stdin reaches end-of-file.
    """
    while True:
        try:
            message = input('User: ').strip()
        except UnicodeDecodeError:
            print('[ERROR] Encoding error in input')
            continue
        except (KeyboardInterrupt, EOFError):
            # `exit` is injected by the `site` module and may be absent, so
            # raise SystemExit directly. EOF is handled as well so a closed
            # or piped stdin ends the program without a traceback.
            raise SystemExit(1)
        if message:
            return message
        print('[ERROR] Query is empty')
def _chat_stream(model, tokenizer, query, history):
    """Yield the assistant's reply to ``query`` fragment by fragment.

    Args:
        model: Model whose ``generate`` method runs in a background thread.
        tokenizer: Tokenizer used for the chat template and by the streamer.
        query: The new user message.
        history: Iterable of ``(user_message, assistant_reply)`` pairs from
            earlier turns.

    Yields:
        Text fragments in the order the streamer produces them.
    """
    conversation = [
        {'role': 'system', 'content': ''},
    ]
    for query_h, response_h in history:
        conversation.append({'role': 'user', 'content': query_h})
        conversation.append({'role': 'assistant', 'content': response_h})
    conversation.append({'role': 'user', 'content': query})
    # Render to text first and tokenize separately so the encoding carries an
    # attention mask that can be forwarded to generate().
    text = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=False,
    )
    inputs = tokenizer([text], return_tensors='pt').to(model.device)
    streamer = TextIteratorStreamer(tokenizer=tokenizer, skip_prompt=True, timeout=60.0, skip_special_tokens=True)
    generation_kwargs = dict(**inputs, streamer=streamer)
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    for new_text in streamer:
        yield new_text
    # The streamer is exhausted at this point, so reap the worker thread.
    thread.join()
def main():
    """Run an interactive streaming chat loop in the terminal."""
    # Seed with a fresh random value before the model is loaded.
    set_seed(random.randint(0, 2**32 - 1))

    model, tokenizer = _load_model_tokenizer(DEFAULT_CKPT_PATH, False)
    history = []

    while True:
        query = _get_input()
        print(f"\nUser: {query}")
        print("\nAssistant: ", end="")
        try:
            pieces = []
            for piece in _chat_stream(model, tokenizer, query, history):
                print(piece, end='', flush=True)
                pieces.append(piece)
            print()
            # Only completed turns are recorded in the history.
            history.append((query, ''.join(pieces)))
        except KeyboardInterrupt:
            print('Generation interrupted')
            continue


if __name__ == "__main__":
    main()
```
## 数据集
Qwen2-Boundless 模型使用了特殊的 `bad_data.json` 数据集进行微调,该数据集包含了广泛的文本内容,涵盖道德、法律、色情及暴力等主题。由于微调数据集全部为中文,因此模型在处理中文时表现更佳。如果你有兴趣了解或使用该数据集,可以通过以下链接获取:
- [bad_data.json 数据集](https://huggingface.co/datasets/ystemsrx/Bad_Data_Alpaca)
同时我们也从 [这个文件](https://github.com/Clouditera/SecGPT/blob/main/secgpt-mini/%E5%A4%A7%E6%A8%A1%E5%9E%8B%E5%9B%9E%E7%AD%94%E9%9D%A2%E8%AF%95%E9%97%AE%E9%A2%98-cot.txt) 中整理、清洗出一部分与网络安全相关的数据进行训练。
## GitHub 仓库
更多关于该模型的细节以及持续更新,请访问我们的 GitHub 仓库:
- [GitHub: ystemsrx/Qwen2-Boundless](https://github.com/ystemsrx/Qwen2-Boundless)
## 声明
本模型提供的所有内容仅用于研究和测试目的,模型开发者不对任何可能的滥用行为负责。使用者应遵循相关法律法规,并承担因使用本模型而产生的所有责任。

5
added_tokens.json Normal file
View File

@@ -0,0 +1,5 @@
{
"<|endoftext|>": 151643,
"<|im_end|>": 151645,
"<|im_start|>": 151644
}

28
config.json Normal file
View File

@@ -0,0 +1,28 @@
{
"_name_or_path": "Qwen2-1.5B-Instruct",
"architectures": [
"Qwen2ForCausalLM"
],
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151645,
"hidden_act": "silu",
"hidden_size": 1536,
"initializer_range": 0.02,
"intermediate_size": 8960,
"max_position_embeddings": 32768,
"max_window_layers": 28,
"model_type": "qwen2",
"num_attention_heads": 12,
"num_hidden_layers": 28,
"num_key_value_heads": 2,
"rms_norm_eps": 1e-06,
"rope_theta": 1000000.0,
"sliding_window": 32768,
"tie_word_embeddings": true,
"torch_dtype": "bfloat16",
"transformers_version": "4.42.3",
"use_cache": true,
"use_sliding_window": false,
"vocab_size": 151936
}

14
generation_config.json Normal file
View File

@@ -0,0 +1,14 @@
{
"bos_token_id": 151643,
"do_sample": true,
"eos_token_id": [
151645,
151643
],
"pad_token_id": 151643,
"repetition_penalty": 1.1,
"temperature": 0.5,
"top_k": 40,
"top_p": 0.7,
"transformers_version": "4.42.3"
}

151388
merges.txt Normal file

File diff suppressed because it is too large Load Diff

3
model.safetensors Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2cabec32206161cab6f703992dd1ad322a8b6526db7ee37a4b629e57f8ff5128
size 3087467144

20
special_tokens_map.json Normal file
View File

@@ -0,0 +1,20 @@
{
"additional_special_tokens": [
"<|im_start|>",
"<|im_end|>"
],
"eos_token": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

303112
tokenizer.json Normal file

File diff suppressed because it is too large Load Diff

44
tokenizer_config.json Normal file
View File

@@ -0,0 +1,44 @@
{
"add_prefix_space": false,
"added_tokens_decoder": {
"151643": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151644": {
"content": "<|im_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151645": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"additional_special_tokens": [
"<|im_start|>",
"<|im_end|>"
],
"bos_token": null,
"chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
"clean_up_tokenization_spaces": false,
"eos_token": "<|im_end|>",
"errors": "replace",
"model_max_length": 32768,
"pad_token": "<|endoftext|>",
"padding_side": "left",
"split_special_tokens": false,
"tokenizer_class": "Qwen2Tokenizer",
"unk_token": null
}

1
vocab.json Normal file

File diff suppressed because one or more lines are too long