import os
import inspect

import numpy as np
import torch
from datasets import load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import accuracy_score, f1_score

# ✅ Hugging Face token: read from the environment rather than hardcoding a secret
hf_token = os.environ.get("HF_TOKEN")

# ✅ Confirm which TrainingArguments implementation is in use (debugging aid)
print("🧠 TrainingArguments loaded from:", inspect.getfile(TrainingArguments))

# ✅ Model and LoRA configuration
base_model = "Qwen/Qwen2-0.5B-Instruct"
output_dir = "./qwen_lora_checkpoint"
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_proj", "v_proj"],  # adapt the attention query/value projections
)

# ✅ Load the tokenizer and set a pad token (Qwen2 ships without one, so reuse EOS)
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    token=hf_token,
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token
pad_token_id = tokenizer.pad_token_id

# ✅ Load the model (only once) and propagate pad_token_id so batched padding works
base = AutoModelForSequenceClassification.from_pretrained(
    base_model,
    token=hf_token,
    trust_remote_code=True,
    num_labels=2,
)
base.config.pad_token_id = pad_token_id

# ✅ Apply LoRA
model = get_peft_model(base, lora_config)
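# Optional sanity check: report the trainable-parameter count; with LoRA it
# should be a small fraction of the 0.5B base model.
model.print_trainable_parameters()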

# ✅ Load the dataset (expects a DatasetDict saved earlier with save_to_disk)
dataset = load_from_disk("./qwen_classification_dataset")

def preprocess(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(preprocess, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
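# Assumption: the saved dataset provides "train" and "validation" splits with
# "text"/"label" columns; print the structure to verify before training.
print(tokenized_dataset)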

# ✅ Training arguments (fp16 is enabled automatically when a GPU is available)
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir=f"{output_dir}/logs",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    remove_unused_columns=False,
    report_to="none",
    fp16=torch.cuda.is_available(),  # enable mixed precision only on CUDA
    gradient_accumulation_steps=2,
    dataloader_pin_memory=True,
)

# ✅ Evaluation metrics: accuracy and binary F1
def compute_metrics(eval_pred):
    logits, labels = eval_pred  # the Trainer passes these as NumPy arrays
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

# ✅ Build the Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)

# ✅ Start training
trainer.train()

# ✅ Save the LoRA adapter weights and the tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"✅ Fine-tuning complete; model saved to {output_dir}")