"""Load a causal language model from a local directory and stream one reply.

The script loads the tokenizer and model found at ``model_path``, moves the
model to the GPU when CUDA is available (CPU otherwise), and generates a
continuation for a fixed prompt, emitting the text through a ``TextStreamer``
as it is produced.
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

# Directory that holds the model weights and tokenizer files.
model_path = "./"

# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load the model and move it to whichever device is available (GPU/CPU).
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

# Alternative loading call kept from the original for reference (disabled):
# model = AutoModelForCausalLM.from_pretrained(
#     model_path,
#     trust_remote_code=True,
#     torch_dtype=torch.float16,
#     device_map="auto"
# )

# Streamer that emits generated text incrementally; the prompt itself and
# special tokens are left out of the streamed output.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Inference example: tokenize the prompt and place the tensors on the same
# device as the model.
prompt = "喜欢主人吗"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate with the streamer attached. The "输出: " label is printed without
# a trailing newline and flushed so the streamed text follows it directly.
print("输入:", prompt)
print("输出: ", end="", flush=True)

outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    # Optional sampling settings kept from the original (disabled):
    # temperature=0.7,
    # top_p=0.9,
    # repetition_penalty=1.0,
    # do_sample=True,
    streamer=streamer,
)