Files
ModelHub XC c979c18a17 初始化项目,由ModelHub XC社区提供模型
Model: nv-community/Nemotron-Cascade-8B
Source: Original Platform
2026-04-24 22:32:56 +08:00

1283 lines
43 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Preprocessing functions for various benchmark datasets.
This module provides data loading and prompt formatting functions for:
- Math benchmarks: MATH, GSM8K, AIME, Minerva Math, OmniMath, etc.
- Coding benchmarks: HumanEval, LiveCodeBench, MBPP
- Multiple-choice: MMLU, MMLU Pro, GPQA
- Instruction following: IFEval, IFBench, MT-Bench
- General: AlpacaEval, Arena-Hard
"""
import json
import pandas
import os
def preprocess_gpqa_chatml_template(data_file, use_r1=False, think=True):
    """Build GPQA multiple-choice prompts wrapped in a chat template.

    Args:
        data_file: Path to the GPQA JSON file (a list of question records).
        use_r1: If True, emit DeepSeek R1-style prompts instead of ChatML.
        think: If True (and not use_r1), enable thinking mode via " /think".

    Returns:
        list[str]: One fully formatted prompt per question.
    """
    if use_r1:
        template = "{Question}\n\n\nA. {choice1}\nB. {choice2}\nC. {choice3}\nD. {choice4}\n\nPlease reason step-by-step and put your choice letter without any other text with \\boxed{{}} in the end. Let's think step by step and output the final answer within \\boxed{{}}."
    else:
        template = "Return your final response within \\boxed{{}} and only include the letter choice (e.g., A, B, C, or D) as your final response.\n\n{Question}\n\nAnswer Choices:\n(A) {choice1}\n(B) {choice2}\n(C) {choice3}\n(D) {choice4}"
    system_msg = "<|im_start|>system\nYou are a helpful and harmless assistant.<|im_end|>\n"
    with open(data_file, "r") as fh:
        records = json.load(fh)
    prompts = []
    for record in records:
        question = template.format(
            Question=record['question'].strip(),
            choice1=record['choice_A'].strip(),
            choice2=record['choice_B'].strip(),
            choice3=record['choice_C'].strip(),
            choice4=record['choice_D'].strip(),
        )
        if use_r1:
            prompt = """<begin▁of▁sentence><User>{question}.<Assistant><think>\n""".format(question=question)
        elif think:
            prompt = system_msg + "<|im_start|>user\n" + question + " /think<|im_end|>\n<|im_start|>assistant\n<think>\n"
        else:
            prompt = system_msg + "<|im_start|>user\n" + question + " /no_think<|im_end|>\n<|im_start|>assistant\n"
        prompts.append(prompt)
    return prompts
def preprocess_gpqa_raw_template(data_file, use_r1=False, think=True):
    """Build GPQA multiple-choice prompts with no chat template applied.

    Args:
        data_file: Path to the GPQA JSON file (a list of question records).
        use_r1: If True, use the DeepSeek R1-style question wording.
        think: Unused here; kept for signature parity with the ChatML variant.

    Returns:
        list[str]: One plain formatted question per record.
    """
    if use_r1:
        template = "{Question}\n\n\nA. {choice1}\nB. {choice2}\nC. {choice3}\nD. {choice4}\n\nPlease reason step-by-step and put your choice letter without any other text with \\boxed{{}} in the end. Let's think step by step and output the final answer within \\boxed{{}}."
    else:
        template = "Return your final response within \\boxed{{}} and only include the letter choice (e.g., A, B, C, or D) as your final response.\n\n{Question}\n\nAnswer Choices:\n(A) {choice1}\n(B) {choice2}\n(C) {choice3}\n(D) {choice4}"
    with open(data_file, "r") as fh:
        records = json.load(fh)
    return [
        template.format(
            Question=record['question'].strip(),
            choice1=record['choice_A'].strip(),
            choice2=record['choice_B'].strip(),
            choice3=record['choice_C'].strip(),
            choice4=record['choice_D'].strip(),
        )
        for record in records
    ]
def preprocess_gsm8k_zeroshot_chatml_template(data_file):
    """Build zero-shot ChatML prompts (thinking enabled) for GSM8K.

    Args:
        data_file: Path to the GSM8K JSON file (a list of question records).

    Returns:
        list[str]: One ChatML-formatted prompt per question.
    """
    system_msg = ("<|im_start|>system\nYou are a helpful and harmless assistant."
                  " You should think step-by-step.<|im_end|>\n")
    with open(data_file, "r") as fh:
        records = json.load(fh)
    return [
        system_msg + "<|im_start|>user\n" + record['question'].strip()
        + "<|im_end|>\n<|im_start|>assistant\n<think>\n"
        for record in records
    ]
def preprocess_gsm8k_zeroshot_raw(data_file):
    """Return the stripped GSM8K questions with no chat template applied.

    Args:
        data_file: Path to the GSM8K JSON file (a list of question records).

    Returns:
        list[str]: The bare question text, one per record.
    """
    with open(data_file, "r") as fh:
        records = json.load(fh)
    return [record['question'].strip() for record in records]
def preprocess_humaneval_raw(data_file):
    """Build HumanEval code-completion prompts.

    Args:
        data_file: Path to the HumanEval JSON file (a dict keyed by task id,
            each value carrying a 'prompt' field).

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: Instruction-prefixed code-completion prompts.
            - qid_list: Task ids, in the same order as prompt_list.
    """
    instruction = ("Read the following function signature and docstring, and fully "
                   "implement the function described. Your response should only "
                   "contain the code for this function.\n")
    with open(data_file, "r") as fh:
        tasks = json.load(fh)
    # keys() and values() iterate in the same (insertion) order.
    qid_list = list(tasks.keys())
    prompt_list = [instruction + task['prompt'] for task in tasks.values()]
    return prompt_list, qid_list
def preprocess_math_zeroshot_chatml_template(data_file):
    """Build zero-shot ChatML prompts (thinking enabled) for MATH.

    Args:
        data_file: Path to the MATH CSV file with a 'Question' column.

    Returns:
        list[str]: One ChatML-formatted prompt per row.
    """
    system_msg = ("<|im_start|>system\nYou are a helpful and harmless assistant."
                  " You should think step-by-step.<|im_end|>\n")
    frame = pandas.read_csv(data_file)
    return [
        system_msg + "<|im_start|>user\n" + row['Question'].strip()
        + "<|im_end|>\n<|im_start|>assistant\n<think>\n"
        for _, row in frame.iterrows()
    ]
def preprocess_math500_zeroshot_chatml_template(data_file, use_r1=False):
    """Build zero-shot prompts for MATH-500.

    Args:
        data_file: Path to the MATH-500 JSONL file (one JSON object per line,
            each with a 'problem' field).
        use_r1: If True, emit DeepSeek R1-style prompts instead of ChatML.

    Returns:
        list[str]: One formatted prompt (with boxed-answer instruction) per problem.
    """
    instruction = "<|im_start|>system\nYou are a helpful and harmless assistant. You should think step-by-step.<|im_end|>\n"
    prompt_list = []
    with open(data_file, "r") as f:
        for line in f:
            data_dict = json.loads(line)
            final_question = data_dict['problem'].strip()
            if use_r1:
                # BUGFIX: the template previously said "\boxed", where "\b" is a
                # backspace escape; it must be written "\\boxed" to render literally.
                final_prompt = """<begin▁of▁sentence><User>{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.<Assistant><think>\n""".format(question=final_question)
            else:
                final_prompt = instruction + "<|im_start|>user\n" + final_question + "\n\nPlease place your final answer inside \\boxed{}." + "<|im_end|>\n<|im_start|>assistant\n<think>\n"
            prompt_list.append(final_prompt)
    return prompt_list
def preprocess_minerva_math_chatml_template(data_file):
    """Build ChatML prompts (boxed answer, thinking enabled) for Minerva Math.

    Args:
        data_file: Path to the Minerva Math JSONL file ('problem' field per line).

    Returns:
        list[str]: One ChatML-formatted prompt per problem.
    """
    system_msg = ("<|im_start|>system\nYou are a helpful and harmless assistant."
                  " You should think step-by-step.<|im_end|>\n")
    suffix = ("\n\nPlease place your final answer inside \\boxed{}."
              "<|im_end|>\n<|im_start|>assistant\n<think>\n")
    with open(data_file, "r") as fh:
        return [
            system_msg + "<|im_start|>user\n" + json.loads(line)['problem'].strip() + suffix
            for line in fh
        ]
def preprocess_gaokao2023en_chatml_template(data_file):
    """Build ChatML prompts (boxed answer, thinking enabled) for Gaokao 2023 EN.

    Args:
        data_file: Path to the Gaokao 2023 English JSONL file ('question' field).

    Returns:
        list[str]: One ChatML-formatted prompt per question.
    """
    system_msg = ("<|im_start|>system\nYou are a helpful and harmless assistant."
                  " You should think step-by-step.<|im_end|>\n")
    suffix = ("\n\nPlease place your final answer inside \\boxed{}."
              "<|im_end|>\n<|im_start|>assistant\n<think>\n")
    with open(data_file, "r", encoding="utf-8") as fh:
        return [
            system_msg + "<|im_start|>user\n" + json.loads(line)['question'].strip() + suffix
            for line in fh
        ]
def preprocess_olympiadbench_chatml_template(data_file):
    """Build ChatML prompts (boxed answer, thinking enabled) for OlympiadBench.

    Args:
        data_file: Path to the OlympiadBench JSONL file ('question' field).

    Returns:
        list[str]: One ChatML-formatted prompt per question.
    """
    system_msg = ("<|im_start|>system\nYou are a helpful and harmless assistant."
                  " You should think step-by-step.<|im_end|>\n")
    suffix = ("\n\nPlease place your final answer inside \\boxed{}."
              "<|im_end|>\n<|im_start|>assistant\n<think>\n")
    with open(data_file, "r", encoding="utf-8") as fh:
        return [
            system_msg + "<|im_start|>user\n" + json.loads(line)['question'].strip() + suffix
            for line in fh
        ]
def preprocess_collegemath_chatml_template(data_file):
    """Build ChatML prompts (boxed answer, thinking enabled) for College Math.

    Args:
        data_file: Path to the College Math JSONL file ('question' field).

    Returns:
        list[str]: One ChatML-formatted prompt per question.
    """
    system_msg = ("<|im_start|>system\nYou are a helpful and harmless assistant."
                  " You should think step-by-step.<|im_end|>\n")
    suffix = ("\n\nPlease place your final answer inside \\boxed{}."
              "<|im_end|>\n<|im_start|>assistant\n<think>\n")
    with open(data_file, "r") as fh:
        return [
            system_msg + "<|im_start|>user\n" + json.loads(line)['question'].strip() + suffix
            for line in fh
        ]
def preprocess_aime24_chatml_template(data_file):
    """Build ChatML prompts (thinking enabled) for AIME 2024.

    Args:
        data_file: Path to the AIME 2024 JSONL file ('question' field per line).

    Returns:
        list[str]: One ChatML-formatted prompt per question.
    """
    system_msg = ("<|im_start|>system\nYou are a helpful and harmless assistant."
                  " You should think step-by-step.<|im_end|>\n")
    with open(data_file, "r") as fh:
        return [
            system_msg + "<|im_start|>user\n" + json.loads(line)['question'].strip()
            + "<|im_end|>\n<|im_start|>assistant\n<think>\n"
            for line in fh
        ]
def preprocess_aime25_chatml_template(data_file):
    """Build ChatML prompts (thinking enabled) for AIME 2025.

    Args:
        data_file: Path to the AIME 2025 JSONL file ('problem' field per line).

    Returns:
        list[str]: One ChatML-formatted prompt per problem.
    """
    system_msg = ("<|im_start|>system\nYou are a helpful and harmless assistant."
                  " You should think step-by-step.<|im_end|>\n")
    with open(data_file, "r") as fh:
        return [
            system_msg + "<|im_start|>user\n" + json.loads(line)['problem'].strip()
            + "<|im_end|>\n<|im_start|>assistant\n<think>\n"
            for line in fh
        ]
def preprocess_aime24_raw(data_file):
    """Build raw AIME 2024 prompts with a boxed-answer instruction.

    Args:
        data_file: Path to the AIME 2024 JSONL file ('question' field per line).

    Returns:
        list[str]: One plain prompt per question.
    """
    with open(data_file, "r") as fh:
        return [
            json.loads(line)['question'].strip()
            + "\nPlease reason step by step, and put your final answer within \\boxed{}."
            for line in fh
        ]
def preprocess_aime25_raw(data_file):
    """Build raw AIME 2025 prompts with a boxed-answer instruction.

    Args:
        data_file: Path to the AIME 2025 JSONL file ('problem' field per line).

    Returns:
        list[str]: One plain prompt per problem.
    """
    with open(data_file, "r") as fh:
        return [
            json.loads(line)['problem'].strip()
            + "\nPlease reason step by step, and put your final answer within \\boxed{}."
            for line in fh
        ]
def preprocess_amc23_chatml_template(data_file):
    """Build ChatML prompts (thinking enabled) for AMC 2023.

    Args:
        data_file: Path to the AMC 2023 JSONL file ('question' field per line).

    Returns:
        list[str]: One ChatML-formatted prompt per question.
    """
    system_msg = ("<|im_start|>system\nYou are a helpful and harmless assistant."
                  " You should think step-by-step.<|im_end|>\n")
    with open(data_file, "r") as fh:
        return [
            system_msg + "<|im_start|>user\n" + json.loads(line)['question'].strip()
            + "<|im_end|>\n<|im_start|>assistant\n<think>\n"
            for line in fh
        ]
def preprocess_omnimath_chatml_template(data_file):
    """Build ChatML prompts (thinking enabled) for OmniMath.

    Args:
        data_file: Path to the OmniMath JSONL file ('problem' field per line).

    Returns:
        list[str]: One ChatML-formatted prompt per problem.
    """
    system_msg = ("<|im_start|>system\nYou are a helpful and harmless assistant."
                  " You should think step-by-step.<|im_end|>\n")
    with open(data_file, "r") as fh:
        return [
            system_msg + "<|im_start|>user\n" + json.loads(line)['problem'].strip()
            + "<|im_end|>\n<|im_start|>assistant\n<think>\n"
            for line in fh
        ]
def preprocess_ifeval_chatml_template(data_file):
    """Build ChatML prompts for the IFEval instruction-following benchmark.

    Args:
        data_file: Path to the IFEval JSONL file ('key' and 'prompt' fields).

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: ChatML-wrapped instruction prompts.
            - qid_list: Task keys, aligned with prompt_list.
    """
    qid_list, prompt_list = [], []
    with open(data_file, "r") as fh:
        for line in fh:
            record = json.loads(line)
            qid_list.append(record['key'])
            prompt_list.append(
                "<|im_start|>user\n" + record['prompt']
                + "<|im_end|>\n<|im_start|>assistant\n"
            )
    return prompt_list, qid_list
def preprocess_ifeval_raw(data_file):
    """Collect raw IFEval instruction prompts and their keys.

    Args:
        data_file: Path to the IFEval JSONL file ('key' and 'prompt' fields).

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: Raw instruction prompts.
            - qid_list: Task keys, aligned with prompt_list.
    """
    qid_list, prompt_list = [], []
    with open(data_file, "r") as fh:
        for line in fh:
            record = json.loads(line)
            qid_list.append(record['key'])
            prompt_list.append(record['prompt'])
    return prompt_list, qid_list
def preprocess_ifbench_raw(data_file):
    """Collect raw IFBench instruction prompts and their keys.

    Args:
        data_file: Path to the IFBench JSONL file ('key' and 'prompt' fields).

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: Raw instruction prompts.
            - qid_list: Task keys, aligned with prompt_list.
    """
    qid_list, prompt_list = [], []
    with open(data_file, "r") as fh:
        for line in fh:
            record = json.loads(line)
            qid_list.append(record['key'])
            prompt_list.append(record['prompt'])
    return prompt_list, qid_list
def preprocess_arena_hard_chatml_template(data_file):
    """Build ChatML prompts for Arena-Hard (first turn only).

    Args:
        data_file: Path to the Arena-Hard JSONL file ('question_id' and
            'turns' fields, each turn a dict with 'content').

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: ChatML-wrapped first-turn prompts.
            - qid_list: Question ids, aligned with prompt_list.
    """
    qid_list, prompt_list = [], []
    with open(data_file, "r") as fh:
        for line in fh:
            record = json.loads(line)
            qid_list.append(record['question_id'])
            prompt_list.append(
                "<|im_start|>user\n" + record['turns'][0]['content']
                + "<|im_end|>\n<|im_start|>assistant\n"
            )
    return prompt_list, qid_list
def preprocess_arena_hard_raw(data_file):
    """Collect raw first-turn Arena-Hard prompts and their question ids.

    Args:
        data_file: Path to the Arena-Hard JSONL file ('question_id' and
            'turns' fields, each turn a dict with 'content').

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: Raw first-turn question texts.
            - qid_list: Question ids, aligned with prompt_list.
    """
    qid_list, prompt_list = [], []
    with open(data_file, "r") as fh:
        for line in fh:
            record = json.loads(line)
            qid_list.append(record['question_id'])
            prompt_list.append(record['turns'][0]['content'])
    return prompt_list, qid_list
def preprocess_arena_hard_v2_raw(data_file):
    """Collect raw Arena-Hard v2.0 prompts and their unique ids.

    Args:
        data_file: Path to the Arena-Hard v2.0 JSONL file ('uid' and
            'prompt' fields).

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: Raw question prompts.
            - qid_list: Unique ids, aligned with prompt_list.
    """
    qid_list, prompt_list = [], []
    with open(data_file, "r") as fh:
        for line in fh:
            record = json.loads(line)
            qid_list.append(record['uid'])
            prompt_list.append(record['prompt'])
    return prompt_list, qid_list
def preprocess_alpaca_eval_raw(data_file):
    """Collect raw AlpacaEval instructions with sequential ids.

    Args:
        data_file: Path to the AlpacaEval JSON file (a list of records with
            an 'instruction' field).

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: Raw instruction prompts.
            - qid_list: 0-based sequential indices, aligned with prompt_list.
    """
    with open(data_file, "r") as fh:
        records = json.load(fh)
    prompt_list = [record['instruction'] for record in records]
    qid_list = list(range(len(records)))
    return prompt_list, qid_list
def preprocess_alpaca_eval_chatml_template(data_file):
    """Build ChatML prompts for AlpacaEval with sequential ids.

    Args:
        data_file: Path to the AlpacaEval JSON file (a list of records with
            an 'instruction' field).

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: ChatML-wrapped instruction prompts.
            - qid_list: 0-based sequential indices, aligned with prompt_list.
    """
    with open(data_file, "r") as fh:
        records = json.load(fh)
    prompt_list = [
        "<|im_start|>user\n" + record['instruction'] + "<|im_end|>\n<|im_start|>assistant\n"
        for record in records
    ]
    qid_list = list(range(len(records)))
    return prompt_list, qid_list
def preprocess_mtbench_firstturn(data_file):
    """Build ChatML prompts for the first turn of MT-Bench.

    Args:
        data_file: Path to the MT-Bench JSONL file ('question_id' and
            'turns' fields, each turn a plain string).

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: ChatML-wrapped first-turn prompts.
            - qid_list: Question ids, aligned with prompt_list.
    """
    qid_list, prompt_list = [], []
    with open(data_file, "r") as fh:
        for line in fh:
            record = json.loads(line)
            qid_list.append(record['question_id'])
            prompt_list.append(
                "<|im_start|>user\n" + record['turns'][0]
                + "<|im_end|>\n<|im_start|>assistant\n"
            )
    return prompt_list, qid_list
def preprocess_mtbench_firstturn_raw(data_file):
    """Collect raw first-turn MT-Bench prompts and their question ids.

    Args:
        data_file: Path to the MT-Bench JSONL file ('question_id' and
            'turns' fields, each turn a plain string).

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: Raw first-turn question texts.
            - qid_list: Question ids, aligned with prompt_list.
    """
    qid_list, prompt_list = [], []
    with open(data_file, "r") as fh:
        for line in fh:
            record = json.loads(line)
            qid_list.append(record['question_id'])
            prompt_list.append(record['turns'][0])
    return prompt_list, qid_list
def preprocess_mtbench_secondturn(data_file, output_file):
    """Build second-turn ChatML prompts for MT-Bench, replaying first-turn outputs.

    Args:
        data_file: Path to the MT-Bench questions JSONL file.
        output_file: JSONL file with the model's first-turn outputs
            (records with 'task_id' and 'output').

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: Two-turn ChatML prompts including the model's
              first-turn answer as conversation history.
            - qid_list: Question ids, aligned with prompt_list.
    """
    # Map first-turn task id -> model output so it can be replayed as history.
    first_turn_outputs = {}
    with open(output_file, "r") as fh:
        for line in fh:
            record = json.loads(line)
            first_turn_outputs[record['task_id']] = record['output']
    qid_list, prompt_list = [], []
    with open(data_file, "r") as fh:
        for line in fh:
            question = json.loads(line)
            qid = question['question_id']
            qid_list.append(qid)
            turn1, turn2 = question['turns'][0], question['turns'][1]
            prompt_list.append(
                "<|im_start|>user\n" + turn1 + "<|im_end|>\n<|im_start|>assistant\n"
                + first_turn_outputs[qid] + "<|im_end|>\n"
                + "<|im_start|>user\n" + turn2 + "<|im_end|>\n<|im_start|>assistant\n"
            )
    return prompt_list, qid_list
def preprocess_mtbench_secondturn_raw(data_file, output_file):
    """Build second-turn MT-Bench prompts as chat-message lists (no template).

    Args:
        data_file: Path to the MT-Bench questions JSONL file.
        output_file: JSONL file with the model's first-turn outputs
            (records with 'task_id', 'output', and 'reason_text').

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: Per-question lists of role/content message dicts.
            - qid_list: Question ids, aligned with prompt_list.
    """
    first_turn = {}
    with open(output_file, "r") as fh:
        for line in fh:
            record = json.loads(line)
            # 'reason_text' is read for record-format parity but not used below.
            first_turn[record['task_id']] = (record['output'], record['reason_text'])
    qid_list, prompt_list = [], []
    with open(data_file, "r") as fh:
        for line in fh:
            question = json.loads(line)
            qid = question['question_id']
            qid_list.append(qid)
            answer, _reason = first_turn[qid]
            prompt_list.append([
                {'role': 'user', 'content': question['turns'][0]},
                {'role': 'assistant', 'content': answer},
                {'role': 'user', 'content': question['turns'][1]},
            ])
    return prompt_list, qid_list
def preprocess_mmlu_chatml_template(data_file):
    """Build few-shot chain-of-thought ChatML prompts for MMLU.

    Args:
        data_file: Path to the MMLU CSV file with columns
            'Subject', 'Question', 'A', 'B', 'C', 'D'.

    Returns:
        list[str]: One ChatML prompt per row, prefixed with the
        subject-specific few-shot examples.
    """
    def _load_mmlu_cot_fewshot_examples():
        # Few-shot YAML configs live in flan_cot_fewshot/ next to this module.
        import yaml
        fewshot_folder = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "flan_cot_fewshot")
        fewshot_dict = {}
        for filename in os.listdir(fewshot_folder):
            with open(os.path.join(fewshot_folder, filename)) as fh:
                config = yaml.safe_load(fh)
            shots = config['description'].strip()
            for sample in config['fewshot_config']["samples"]:
                shots += "\n\n" + "Q: " + sample['question'].strip() + "\n" + "A: " + sample['target'].strip()
            fewshot_dict[config['dataset_name'].strip()] = shots
        return fewshot_dict

    fewshot_dict = _load_mmlu_cot_fewshot_examples()
    prompt_list = []
    for _, row in pandas.read_csv(data_file).iterrows():
        item = row.to_dict()
        body = (fewshot_dict[item['Subject']] + "\n\n" + "Q: " + item['Question'] + "\n"
                + "(A) " + str(item['A']).strip() + " (B) " + str(item['B']).strip()
                + " (C) " + str(item['C']).strip() + " (D) " + str(item['D']).strip() + "\n"
                + "A: ")
        prompt_list.append("<|im_start|>user\n" + body + "<|im_end|>\n<|im_start|>assistant\n")
    return prompt_list
def preprocess_mmlu_raw_template(data_file):
    """Build few-shot chain-of-thought MMLU prompts with no chat template.

    Args:
        data_file: Path to the MMLU CSV file with columns
            'Subject', 'Question', 'A', 'B', 'C', 'D'.

    Returns:
        list[str]: One plain prompt per row, prefixed with the
        subject-specific few-shot examples.
    """
    def _load_mmlu_cot_fewshot_examples():
        # Few-shot YAML configs live in flan_cot_fewshot/ next to this module.
        import yaml
        fewshot_folder = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "flan_cot_fewshot")
        fewshot_dict = {}
        for filename in os.listdir(fewshot_folder):
            with open(os.path.join(fewshot_folder, filename)) as fh:
                config = yaml.safe_load(fh)
            shots = config['description'].strip()
            for sample in config['fewshot_config']["samples"]:
                shots += "\n\n" + "Q: " + sample['question'].strip() + "\n" + "A: " + sample['target'].strip()
            fewshot_dict[config['dataset_name'].strip()] = shots
        return fewshot_dict

    fewshot_dict = _load_mmlu_cot_fewshot_examples()
    prompt_list = []
    for _, row in pandas.read_csv(data_file).iterrows():
        item = row.to_dict()
        body = (fewshot_dict[item['Subject']] + "\n\n" + "Q: " + item['Question'] + "\n"
                + "(A) " + str(item['A']).strip() + " (B) " + str(item['B']).strip()
                + " (C) " + str(item['C']).strip() + " (D) " + str(item['D']).strip() + "\n"
                + "A: ")
        prompt_list.append(body)
    return prompt_list
# Zero-shot multiple-choice template used by preprocess_mmlu_r1_raw_template.
# Placeholders {Question}/{A}-{D} are filled via str.format; the model is asked
# to end its response with a literal "Answer: $LETTER" line.
QUERY_TEMPLATE_MULTICHOICE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
{Question}
A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
def preprocess_mmlu_r1_raw_template(data_file):
    """Build zero-shot MMLU prompts for R1-style reasoning models.

    Args:
        data_file: Path to the MMLU CSV file with columns
            'Question', 'A', 'B', 'C', 'D'.

    Returns:
        list[str]: One QUERY_TEMPLATE_MULTICHOICE-formatted prompt per row.
    """
    # Cleanup: removed dead commented-out few-shot code and the unused
    # 'subject' local from the original implementation.
    df = pandas.read_csv(data_file)
    prompt_list = []
    for _, row in df.iterrows():
        item = row.to_dict()
        prompt_list.append(QUERY_TEMPLATE_MULTICHOICE.format(
            Question=item['Question'],
            A=str(item['A']).strip(),
            B=str(item['B']).strip(),
            C=str(item['C']).strip(),
            D=str(item['D']).strip(),
        ))
    return prompt_list
def preprocess_mmlu_r1_raw_template_wdai(data_file):
    """Build zero-shot MMLU prompts that request a boxed letter answer.

    Args:
        data_file: Path to the MMLU CSV file with columns
            'Question', 'A', 'B', 'C', 'D'.

    Returns:
        list[str]: One boxed-answer prompt per row.
    """
    template = ("Answer the following multiple-choice question. At the end of your "
                "response, conclude with the sentence `The answer is \\boxed{{X}}.`, "
                "replacing X with the correct capital letter of your choice."
                "\n\n{Question}\n\nAnswer Choices:\n(A) {choice1}\n(B) {choice2}\n(C) {choice3}\n(D) {choice4}")
    frame = pandas.read_csv(data_file)
    return [
        template.format(
            Question=row['Question'].strip(),
            choice1=str(row['A']).strip(),
            choice2=str(row['B']).strip(),
            choice3=str(row['C']).strip(),
            choice4=str(row['D']).strip(),
        )
        for _, row in frame.iterrows()
    ]
def preprocess_mmlu_pro_chatml_template(data_file, fewshot_file):
    """Build 5-shot ChatML prompts for MMLU-Pro.

    Args:
        data_file: Path to the MMLU-Pro test JSON file.
        fewshot_file: Path to the MMLU-Pro validation JSON file whose
            records supply the per-category 5-shot examples.

    Returns:
        list[str]: One ChatML prompt per test record.
    """
    option_letters = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P"]

    def _drop_na_options(records):
        # Remove placeholder "N/A" options in place and return the records.
        for record in records:
            record["options"] = [opt for opt in record["options"] if opt != "N/A"]
        return records

    def _group_by_category(records):
        grouped = {}
        for record in records:
            grouped.setdefault(record['category'], []).append(record)
        return grouped

    def _render_sample(sample, is_test):
        rendered = "Q: " + sample['question'] + "\n"
        for letter, opt in zip(option_letters, sample['options']):
            rendered += "(%s) %s " % (letter, opt)
        rendered = rendered.strip() + "\n"
        # Test samples end with an open "A: "; few-shot samples carry their CoT answer.
        rendered += "A: " if is_test else sample['cot_content'].strip()
        return rendered

    with open(fewshot_file, "r") as fh:
        fewshot_by_subject = _group_by_category(_drop_na_options(json.load(fh)))
    with open(data_file, "r") as fh:
        test_records = _drop_na_options(json.load(fh))

    prompt_list = []
    for test_record in test_records:
        subject = test_record['category']
        shots = fewshot_by_subject[subject]
        ## 5-shot examples
        assert len(shots) == 5
        body = "The following are multiple choice questions (with answers) about %s." % subject + "\n\n"
        for shot in shots:
            body += _render_sample(shot, is_test=False) + "\n\n"
        body += _render_sample(test_record, is_test=True)
        prompt_list.append("<|im_start|>user\n" + body + "<|im_end|>\n<|im_start|>assistant\n")
    return prompt_list
def preprocess_mmlu_pro_zero_shot_chatml_template(data_file, think=True):
    """Build zero-shot ChatML prompts (boxed letter answer) for MMLU-Pro.

    Args:
        data_file: Path to the MMLU-Pro test JSON file.
        think: If True, enable thinking mode (" /think" marker plus a
            "<think>" assistant prefix); otherwise use " /no_think".

    Returns:
        list[str]: One ChatML prompt per test record.
    """
    def _drop_na_options(records):
        # Remove placeholder "N/A" options used to pad the option list.
        for record in records:
            record["options"] = [opt for opt in record["options"] if opt != "N/A"]
        return records

    def _format_each_sample(sample):
        choices = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P"]
        sample_prompt = "Question:\n" + sample['question'] + "\n\nAnswer Choices:"
        for letter, opt in zip(choices, sample['options']):
            sample_prompt += "\n(%s) %s" % (letter, opt)
        # BUGFIX: this string is never passed through str.format(), so the
        # doubled braces "{{X}}" previously appeared verbatim in the prompt;
        # single braces render the intended `\boxed{X}`.
        sample_prompt += "\n\nConclude your response with the sentence `The answer is \\boxed{X}.`, in which X is the correct capital letter of your choice."
        sample_prompt = sample_prompt.strip() + "\n"
        return sample_prompt

    instruction = "<|im_start|>system\nYou are a helpful and harmless assistant.<|im_end|>\n"
    with open(data_file, "r") as f:
        test_list = _drop_na_options(json.load(f))
    prompt_list = []
    for test_sample in test_list:
        test_prompt = _format_each_sample(test_sample)
        if think:
            final_prompt = instruction + "<|im_start|>user\n" + test_prompt + "\n /think<|im_end|>\n<|im_start|>assistant\n<think>\n"
        else:
            final_prompt = instruction + "<|im_start|>user\n" + test_prompt + "\n /no_think<|im_end|>\n<|im_start|>assistant\n"
        prompt_list.append(final_prompt)
    return prompt_list
def preprocess_mmlu_pro_zero_shot_raw_template(data_file, think=True):
    """Build zero-shot raw prompts (boxed letter answer) for MMLU-Pro.

    Args:
        data_file: Path to the MMLU-Pro test JSON file.
        think: Unused; kept for signature parity with the ChatML variant.

    Returns:
        list[str]: One plain prompt per test record.
    """
    def _drop_na_options(records):
        # Remove placeholder "N/A" options used to pad the option list.
        for record in records:
            record["options"] = [opt for opt in record["options"] if opt != "N/A"]
        return records

    def _format_each_sample(sample):
        choices = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P"]
        sample_prompt = "Question:\n" + sample['question'] + "\n\nAnswer Choices:"
        for letter, opt in zip(choices, sample['options']):
            sample_prompt += "\n(%s) %s" % (letter, opt)
        # BUGFIX: this string is never passed through str.format(), so the
        # doubled braces "{{X}}" previously appeared verbatim in the prompt;
        # single braces render the intended `\boxed{X}`.
        sample_prompt += "\n\nConclude your response with the sentence `The answer is \\boxed{X}.`, in which X is the correct capital letter of your choice."
        sample_prompt = sample_prompt.strip() + "\n"
        return sample_prompt

    with open(data_file, "r") as f:
        test_list = _drop_na_options(json.load(f))
    return [_format_each_sample(test_sample) for test_sample in test_list]
def preprocess_livecodebench_chatml_template(data_file):
    """Preprocess LiveCodeBench dataset with ChatML template.

    Args:
        data_file: Path to LiveCodeBench JSON file.

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: Formatted coding prompts with ChatML template
            - qid_list: Question IDs
    """
    sys_header = "<|im_start|>system\nYou are a helpful and harmless assistant. You should think step-by-step.<|im_end|>\n"
    # Formatting instruction appended when the problem has no starter code.
    fmt_no_starter = "Write Python code to solve the problem. Please place the solution code in the following format:\n```python\n# Your solution code here\n```"
    # Formatting instruction appended when a starter function header is shown.
    fmt_with_starter = "Please place the solution code in the following format:\n```python\n# Your solution code here\n```"

    with open(data_file, "r") as f:
        samples = json.load(f)

    prompts = []
    qids = []
    for sample in samples:
        body = sample['question_content'].strip()
        starter = sample['starter_code']
        if starter:
            # Show the provided function header and ask the model to complete it.
            body += ("\n\nSolve the problem starting with the provided function header."
                     "\n\nFunction header:\n```\n" + starter + "\n```")
            body += "\n\n" + fmt_with_starter
        else:
            body += "\n\n" + fmt_no_starter
        prompts.append(sys_header + "<|im_start|>user\n" + body + "<|im_end|>\n<|im_start|>assistant\n<think>\n")
        qids.append(sample['question_id'])
    return prompts, qids
def preprocess_livecodebench_raw(data_file):
    """Preprocess LiveCodeBench dataset with raw formatting.

    Args:
        data_file: Path to LiveCodeBench JSON file.

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: Raw coding prompts
            - qid_list: Question IDs
    """
    # Formatting instruction appended when the problem has no starter code.
    fmt_no_starter = "Write Python code to solve the problem. Please place the solution code in the following format:\n```python\n# Your solution code here\n```"
    # Formatting instruction appended when a starter function header is shown.
    fmt_with_starter = "Please place the solution code in the following format:\n```python\n# Your solution code here\n```"

    with open(data_file, "r") as f:
        samples = json.load(f)

    prompts = []
    qids = []
    for sample in samples:
        body = sample['question_content'].strip()
        starter = sample['starter_code']
        if starter:
            # Show the provided function header and ask the model to complete it.
            body += ("\n\nSolve the problem starting with the provided function header."
                     "\n\nFunction header:\n```\n" + starter + "\n```")
            body += "\n\n" + fmt_with_starter
        else:
            body += "\n\n" + fmt_no_starter
        prompts.append(body)
        qids.append(sample['question_id'])
    return prompts, qids
def preprocess_mbpp_chatml_template(data_file):
    """Preprocess MBPP (Mostly Basic Python Problems) dataset with ChatML template.

    Args:
        data_file: Path to MBPP JSON file (a dict keyed by task ID).

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: Formatted code generation prompts with ChatML template
            - qid_list: Task IDs
    """
    sys_header = "<|im_start|>system\nYou are a helpful and harmless assistant. You should think step-by-step.<|im_end|>\n"
    with open(data_file, "r") as f:
        tasks = json.load(f)

    qids = list(tasks.keys())
    # Each task stores its problem statement under 'text' (falling back to 'prompt').
    prompts = [
        sys_header
        + "<|im_start|>user\n"
        + spec.get('text', spec.get('prompt', ''))
        + "<|im_end|>\n<|im_start|>assistant\n<think>\n"
        for spec in tasks.values()
    ]
    return prompts, qids
def preprocess_mmlu_stem_chatml_template(data_file):
    """Preprocess MMLU STEM subset with ChatML template.

    Args:
        data_file: Path to MMLU STEM JSON file (list of dicts with a
            'question' key and an optional 'choices' list).

    Returns:
        list: Formatted prompts with ChatML template
    """
    sys_header = "<|im_start|>system\nYou are a helpful and harmless assistant. You should think step-by-step.<|im_end|>\n"
    with open(data_file, "r") as f:
        samples = json.load(f)

    prompts = []
    for sample in samples:
        body = sample['question'].strip() + "\n"
        # Label choices (A), (B), ... ; an absent/empty 'choices' list adds nothing.
        for idx, option in enumerate(sample.get('choices', [])):
            body += f"({chr(65 + idx)}) {option}\n"
        prompts.append(sys_header + "<|im_start|>user\n" + body + "<|im_end|>\n<|im_start|>assistant\n<think>\n")
    return prompts