初始化项目,由ModelHub XC社区提供模型
Model: ngxson/Vistral-7B-ChatML Source: Original Platform
This commit is contained in:
64
run.py
Normal file
64
run.py
Normal file
@@ -0,0 +1,64 @@
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging, TextStreamer
|
||||
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
|
||||
import os, torch, wandb, platform, warnings
|
||||
from datasets import load_dataset
|
||||
from trl import SFTTrainer
|
||||
|
||||
# Hugging Face access token used when downloading the base model. It is read
# from the HF_TOKEN environment variable so no credential lives in the source
# file; when the variable is unset, None is passed and the library falls back
# to its default credentials.
hf_token = os.environ.get("HF_TOKEN")

# Location of the PEFT adapter checkpoint that is applied on top of the base
# model. It is validated before any model is loaded so that a missing setting
# fails fast with a clear message instead of a NameError after a long load.
CHECKPOINT_PATH = os.environ.get("CHECKPOINT_PATH")
if not CHECKPOINT_PATH:
    raise SystemExit(
        "CHECKPOINT_PATH is not set: export it to the PEFT adapter checkpoint to load."
    )

# Tokenizer is loaded from a local directory next to this script.
tokenizer = AutoTokenizer.from_pretrained('./vistral-tokenizer')

# Load the base model in 4-bit NF4 with double quantization; matmuls are
# computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    'Viet-Mistral/Vistral-7B-Chat',
    device_map="auto",
    token=hf_token,
    quantization_config=bnb_config,
)

# Wrap the quantized base model with the adapter weights from the checkpoint.
ft_model = PeftModel.from_pretrained(model, CHECKPOINT_PATH)

# Left disabled by the original author; kept for reference.
#torch.backends.cuda.enable_mem_efficient_sdp(False)
#torch.backends.cuda.enable_flash_sdp(False)

# System message placed at the start of every conversation.
system_prompt = "Bạn là một trợ lí Tiếng Việt nhiệt tình và trung thực. Hãy luôn trả lời một cách hữu ích nhất có thể, đồng thời giữ an toàn."

# Generation stops on the tokenizer's EOS id or on the last id produced when
# tokenizing the literal '<|im_end|>' marker.
stop_tokens = [tokenizer.eos_token_id, tokenizer('<|im_end|>')['input_ids'][-1]]
|
||||
|
||||
def _new_conversation():
    """Return a fresh chat history that holds only the system prompt."""
    return [{"role": "system", "content": system_prompt}]


def chat_test(max_new_tokens=50):
    """Run an interactive console chat against ``ft_model``.

    Each user line is appended to the running conversation, the whole history
    is rendered with the tokenizer's chat template, and one sampled reply is
    printed and added to the history.

    Console commands (case-insensitive, surrounding whitespace ignored):
        reset -- clear the history back to just the system prompt.
        exit  -- leave the loop.

    End-of-input or Ctrl-C at the prompt also leaves the loop; blank lines
    are ignored.

    Args:
        max_new_tokens: Upper bound on the number of tokens generated per
            reply. Defaults to 50.
    """
    conversation = _new_conversation()
    while True:
        try:
            human = input("Human: ")
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the user interrupted: finish quietly
            # instead of crashing with a traceback.
            print()
            break

        command = human.strip().lower()
        if command == "reset":
            conversation = _new_conversation()
            print("The chat history has been cleared!")
            continue

        if command == "exit":
            break

        if not command:
            # Nothing was typed; do not send an empty user turn to the model.
            continue

        conversation.append({"role": "user", "content": human})

        # Render the history as text and open the assistant turn by hand so
        # the model continues from there.
        formatted = tokenizer.apply_chat_template(conversation, tokenize=False) + "<|im_start|>assistant"
        tok = tokenizer(formatted, return_tensors="pt").to(ft_model.device)
        input_ids = tok['input_ids']

        out_ids = ft_model.generate(
            input_ids=input_ids,
            attention_mask=tok['attention_mask'],
            eos_token_id=stop_tokens,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.95,
            top_k=40,
            temperature=0.1,
            repetition_penalty=1.05,
        )

        # Drop the first input_ids.size(1) positions (the prompt length) so
        # that only the tokens after the prompt are decoded.
        new_tokens = out_ids[:, input_ids.size(1):]
        assistant = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
        print("Assistant: ", assistant)
        conversation.append({"role": "assistant", "content": assistant})
|
||||
|
||||
# Start the interactive chat loop as soon as the script is executed.
chat_test()
|
||||
Reference in New Issue
Block a user