import json
import random

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Build a train/validation Hugging Face DatasetDict from a JSON Lines file.
#
# Reads ./train.jsonl (one JSON value per line), splits the records 90/10,
# and writes the result to ./qwen_classification_dataset.

# Load train.jsonl.
with open("train.jsonl", "r", encoding="utf-8") as f:
    # Strip each line and skip the ones that end up empty (e.g. a trailing
    # blank line at the end of the file); json.loads("") would otherwise
    # raise JSONDecodeError and abort the whole load.
    samples = [json.loads(text) for text in map(str.strip, f) if text]

print(f"总样本数: {len(samples)}")

# Randomly split into training / validation sets: 10% is held out for
# validation, and the fixed random_state keeps the split reproducible.
train_data, val_data = train_test_split(samples, test_size=0.1, random_state=42)

# Convert both splits to Hugging Face Dataset objects.
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "validation": Dataset.from_list(val_data),
})

# Save to disk (optional).
dataset.save_to_disk("qwen_classification_dataset")
print("✅ 已保存 HuggingFace Dataset 到 ./qwen_classification_dataset")