初始化项目，由ModelHub XC社区提供模型

Model: dvilasuero/NeuralHermes-2.5-Mistral-7B-distilabel Source: Original Platform
2026-06-07 23:52:01 +08:00
commit 043c556b9a
13 changed files with 91660 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -0,0 +1,55 @@
+---
+license: mit
+---
+
+An experiment with distilabel. The preference dataset was filtered and converted to ChatML prompts as follows:
+
+```python
+
+dataset = load_dataset("argilla/distilabel-intel-orca-dpo-pairs", split="train", token=hf_token)
+dataset = dataset.filter(lambda r: r["status"]!="tie" and r["chosen_score"]>5)
+
+def chatml_format(example):
+    # Format system
+    if len(example['system']) > 0:
+        message = {"role": "system", "content": example['system']}
+        system = tokenizer.apply_chat_template([message], tokenize=False)
+    else:
+        system = ""
+
+    # Format instruction
+    message = {"role": "user", "content": example['input']}
+    prompt = tokenizer.apply_chat_template([message], tokenize=False, add_generation_prompt=True)
+
+    # Format chosen answer
+    chosen = example['chosen'] + "<|im_end|>\n"
+
+    # Format rejected answer
+    rejected = example['rejected'] + "<|im_end|>\n"
+
+    return {
+        "prompt": system + prompt,
+        "chosen": chosen,
+        "rejected": rejected,
+    }
+
+# Alternative source, left disabled: the Intel/orca_dpo_pairs train split (`dataset` is already loaded above)
+#dataset = load_dataset("Intel/orca_dpo_pairs")['train']
+
+# Remember the raw column names so map() can drop them after formatting
+original_columns = dataset.column_names
+
+# Tokenizer: reuse the EOS token for padding and pad on the left
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer.pad_token = tokenizer.eos_token
+tokenizer.padding_side = "left"
+
+# Apply chatml_format to every record, dropping the original columns
+dataset = dataset.map(
+    chatml_format,
+    remove_columns=original_columns
+)
+
+# Inspect one formatted record (a bare expression only displays in a notebook/REPL)
+dataset[1]
+```