Files
sglang/benchmark/mmmu/data_utils.py

222 lines
6.6 KiB
Python
Raw Normal View History

"""Utils for data load, save, and process (e.g., prompt construction)"""
import json
import os
import re
import yaml
from datasets import concatenate_datasets, load_dataset
DOMAIN_CAT2SUB_CAT = {
"Art and Design": ["Art", "Art_Theory", "Design", "Music"],
"Business": ["Accounting", "Economics", "Finance", "Manage", "Marketing"],
"Science": [
"Biology",
"Chemistry",
"Geography",
"Math",
"Physics",
],
"Health and Medicine": [
"Basic_Medical_Science",
"Clinical_Medicine",
"Diagnostics_and_Laboratory_Medicine",
"Pharmacy",
"Public_Health",
],
"Humanities and Social Science": [
"History",
"Literature",
"Sociology",
"Psychology",
],
"Tech and Engineering": [
"Agriculture",
"Architecture_and_Engineering",
"Computer_Science",
"Electronics",
"Energy_and_Power",
"Materials",
"Mechanical_Engineering",
],
}
CAT_SHORT2LONG = {
"acc": "Accounting",
"agri": "Agriculture",
"arch": "Architecture_and_Engineering",
"art": "Art",
"art_theory": "Art_Theory",
"bas_med": "Basic_Medical_Science",
"bio": "Biology",
"chem": "Chemistry",
"cli_med": "Clinical_Medicine",
"cs": "Computer_Science",
"design": "Design",
"diag_med": "Diagnostics_and_Laboratory_Medicine",
"econ": "Economics",
"elec": "Electronics",
"ep": "Energy_and_Power",
"fin": "Finance",
"geo": "Geography",
"his": "History",
"liter": "Literature",
"manage": "Manage",
"mark": "Marketing",
"mate": "Materials",
"math": "Math",
"mech": "Mechanical_Engineering",
"music": "Music",
"phar": "Pharmacy",
"phys": "Physics",
"psy": "Psychology",
"pub_health": "Public_Health",
"socio": "Sociology",
}
# DATA SAVING
def save_json(filename, ds):
with open(filename, "w") as f:
json.dump(ds, f, indent=4)
def get_multi_choice_info(options):
"""
Given the list of options for multiple choice question
Return the index2ans and all_choices
"""
start_chr = "A"
all_choices = []
index2ans = {}
for i, option in enumerate(options):
index2ans[chr(ord(start_chr) + i)] = option
all_choices.append(chr(ord(start_chr) + i))
return index2ans, all_choices
def load_yaml(file_path):
with open(file_path, "r") as stream:
try:
yaml_dict = yaml.safe_load(stream)
except yaml.YAMLError as exc:
print(exc)
return yaml_dict
def parse_img_path(text):
matches = re.findall("<img='(.*?)'>", text)
return matches
def process_single_sample(data):
question = data["question"]
o_imgs_paths = []
for option in data["options"]:
current_o_imgs_paths = parse_img_path(option)
for img_path in current_o_imgs_paths:
o_imgs_paths.append(img_path)
if len(o_imgs_paths) > 1: # multiple images in options, used for random selection
return {
"id": data["id"],
"question": question,
"options": data["options"],
"answer": data["answer"],
"image": None,
"question_type": data["question_type"],
}
else:
return {
"id": data["id"],
"question": question,
"options": data["options"],
"answer": data["answer"],
"image": data["image_1"],
"question_type": data["question_type"],
}
# DATA SAVING
def save_json(filename, ds):
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, "w") as f:
json.dump(ds, f, indent=4)
def save_jsonl(filename, data):
"""
Save a dictionary of data to a JSON Lines file with the filename as key and caption as value.
Args:
filename (str): The path to the file where the data should be saved.
data (dict): The dictionary containing the data to save where key is the image path and value is the caption.
"""
with open(filename, "w", encoding="utf-8") as f:
for img_path, caption in data.items():
# Extract the base filename without the extension
base_filename = os.path.basename(img_path)
# Create a JSON object with the filename as the key and caption as the value
json_record = json.dumps({base_filename: caption}, ensure_ascii=False)
# Write the JSON object to the file, one per line
f.write(json_record + "\n")
def save_args(args, path_dir):
argsDict = args.__dict__
with open(path_dir + "setting.txt", "w") as f:
f.writelines("------------------ start ------------------" + "\n")
for eachArg, value in argsDict.items():
f.writelines(eachArg + " : " + str(value) + "\n")
f.writelines("------------------- end -------------------")
# DATA PROCESSING
def construct_prompt(sample, config):
question = sample["question"]
options = eval(sample["options"])
example = ""
if sample["question_type"] == "multiple-choice":
start_chr = "A"
prediction_range = []
index2ans = {}
for option in options:
prediction_range.append(start_chr)
example += f"({start_chr}) {option}\n"
index2ans[start_chr] = option
start_chr = chr(ord(start_chr) + 1)
empty_prompt_sample_structure = config["multi_choice_example_format"]
empty_prompt = empty_prompt_sample_structure.format(question, example)
res_dict = {}
res_dict["index2ans"] = index2ans
res_dict["correct_choice"] = sample["answer"]
res_dict["all_choices"] = prediction_range
res_dict["empty_prompt"] = empty_prompt
if config["task_instructions"]:
res_dict["final_input_prompt"] = (
config["task_instructions"].strip() + "\n\n" + empty_prompt
)
else:
res_dict["final_input_prompt"] = empty_prompt
res_dict["gt_content"] = options[ord(sample["answer"].upper()) - ord("A")]
else:
empty_prompt_sample_structure = config["short_ans_example_format"]
empty_prompt = empty_prompt_sample_structure.format(question)
res_dict = {}
res_dict["empty_prompt"] = empty_prompt
if config["task_instructions"]:
res_dict["final_input_prompt"] = (
config["task_instructions"].strip() + "\n\n" + empty_prompt
)
else:
res_dict["final_input_prompt"] = empty_prompt
res_dict["gt_content"] = sample["answer"]
res_dict.update(sample)
return res_dict