初始化项目,由ModelHub XC社区提供模型

Model: dvilasuero/NeuralHermes-2.5-Mistral-7B-distilabel
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-06-07 23:52:01 +08:00
commit 043c556b9a
13 changed files with 91660 additions and 0 deletions

55
README.md Normal file
View File

@@ -0,0 +1,55 @@
---
license: mit
---
Experiment with distilabel. The preference dataset was loaded, filtered, and formatted to ChatML as follows:
```python
# Load the training split of argilla/distilabel-intel-orca-dpo-pairs.
dataset = load_dataset(
    "argilla/distilabel-intel-orca-dpo-pairs",
    split="train",
    token=hf_token,
)


def _is_clear_preference(row):
    """Return True for rows not marked as a tie whose chosen score exceeds 5."""
    return row["status"] != "tie" and row["chosen_score"] > 5


# Keep only the rows that pass the predicate above.
dataset = dataset.filter(_is_clear_preference)
def chatml_format(example):
    """Turn one preference-pair row into prompt/chosen/rejected strings.

    Uses the module-level ``tokenizer`` to render the chat template, so the
    tokenizer must be created before this function is called.

    Args:
        example: Row mapping with ``system``, ``input``, ``chosen`` and
            ``rejected`` entries.

    Returns:
        A dict with three string entries:
        ``prompt`` is the rendered system message (empty when there is no
        system text) followed by the rendered user turn with the generation
        prompt appended; ``chosen`` and ``rejected`` are the respective
        answers, each terminated by ``<|im_end|>`` and a newline.
    """
    # Format system. A truthiness test (instead of len(...) > 0) treats both
    # an empty string and a None value as "no system prompt" rather than
    # raising TypeError on None.
    system_text = example["system"]
    if system_text:
        message = {"role": "system", "content": system_text}
        system = tokenizer.apply_chat_template([message], tokenize=False)
    else:
        system = ""

    # Format instruction: a single user turn, ending with the generation
    # prompt so the answer text can follow directly.
    message = {"role": "user", "content": example["input"]}
    prompt = tokenizer.apply_chat_template(
        [message], tokenize=False, add_generation_prompt=True
    )

    # Format chosen answer
    chosen = example["chosen"] + "<|im_end|>\n"

    # Format rejected answer
    rejected = example["rejected"] + "<|im_end|>\n"

    return {
        "prompt": system + prompt,
        "chosen": chosen,
        "rejected": rejected,
    }
# Tokenizer: pad on the left and reuse the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Alternative dataset load, left disabled:
# dataset = load_dataset("Intel/orca_dpo_pairs")['train']

# Remember the current column names so they can be dropped after mapping,
# leaving only the keys returned by chatml_format.
original_columns = dataset.column_names

# Format dataset
dataset = dataset.map(chatml_format, remove_columns=original_columns)

# Look at one formatted record (a bare expression: it is only displayed in
# an interactive / notebook session, not when run as a script).
dataset[1]
```