初始化项目,由ModelHub XC社区提供模型
Model: Karlzhy/Content_Review_Model Source: Original Platform
This commit is contained in:
24
1.py
Normal file
24
1.py
Normal file
@@ -0,0 +1,24 @@
|
||||
import json
|
||||
import random
|
||||
from datasets import Dataset, DatasetDict
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Blank and whitespace-only lines (for example a trailing newline at the
    end of the file) are skipped instead of being handed to the JSON parser.

    Args:
        path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line is not valid JSON. The message names
            the file and the 1-based line number; the original
            ``json.JSONDecodeError`` is chained as the cause.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            text = raw_line.strip()
            if not text:
                # An empty line is not a record; json.loads("") would raise.
                continue
            try:
                records.append(json.loads(text))
            except json.JSONDecodeError as err:
                raise ValueError(
                    f"{path}:{line_no}: invalid JSON ({err.msg})"
                ) from err
    return records


def main():
    """Split train.jsonl into train/validation sets and save them to disk."""
    # Load train.jsonl
    samples = load_jsonl("train.jsonl")

    print(f"总样本数: {len(samples)}")

    # Randomly split into training / validation sets (fixed seed, 10% held out)
    train_data, val_data = train_test_split(
        samples, test_size=0.1, random_state=42
    )

    # Convert to a Hugging Face DatasetDict
    dataset = DatasetDict({
        "train": Dataset.from_list(train_data),
        "validation": Dataset.from_list(val_data),
    })

    # Save to disk (optional)
    dataset.save_to_disk("qwen_classification_dataset")

    print("✅ 已保存 HuggingFace Dataset 到 ./qwen_classification_dataset")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user