refactor: move image processors to separate files (#4229)

This commit is contained in:
Mick
2025-03-12 03:35:35 +08:00
committed by GitHub
parent 0f2a2e3c19
commit ff2ce0b86f
22 changed files with 1085 additions and 955 deletions

View File

@@ -8,9 +8,9 @@
"""
import argparse
import base64
import dataclasses
import random
import re
from io import BytesIO
from data_utils import save_json
@@ -18,13 +18,14 @@ from eval_utils import (
EvalArgs,
eval_result,
get_sampling_params,
parse_multi_choice_response,
prepare_samples,
process_result,
)
from tqdm import tqdm
from sglang import Engine
from sglang.srt.conversation import chat_templates
from sglang.srt.conversation import generate_chat_conv
from sglang.srt.openai_api.protocol import ChatCompletionRequest
from sglang.srt.server_args import ServerArgs
@@ -35,61 +36,76 @@ def eval_mmmu(args):
if server_args.chat_template is None:
raise ValueError("Chat template must be provided for this benchmark")
samples = prepare_samples(eval_args)
backend = Engine(**dataclasses.asdict(server_args))
out_samples = dict()
sampling_params = get_sampling_params(eval_args)
conv = chat_templates[server_args.chat_template].copy()
image_token = conv.image_token
samples = prepare_samples(eval_args)
answer_dict = {}
for sample in tqdm(samples):
prompt = sample["final_input_prompt"]
image = sample["image"]
bytes_io = BytesIO()
image.save(bytes_io, format="PNG")
png_bytes = bytes_io.getvalue()
prompt = re.sub(r"<[^>]*>", image_token, prompt)
buff = BytesIO()
image.save(buff, format="PNG")
base64_str = base64.b64encode(buff.getvalue()).decode("utf-8")
prefix = prompt.split("<")[0]
suffix = prompt.split(">")[1]
request_dict = {
"model": "",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": prefix,
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_str}"
},
},
{
"type": "text",
"text": suffix,
},
],
}
],
}
conv = generate_chat_conv(
ChatCompletionRequest(**request_dict),
template_name=server_args.chat_template,
)
prompt = conv.get_prompt()
if image is not None:
gen_out = backend.generate(
prompt=prompt, image_data=[png_bytes], sampling_params=sampling_params
prompt=prompt,
image_data=conv.image_data,
sampling_params=sampling_params,
)["text"]
response = gen_out
else: # multiple images actually
if sample["question_type"] == "multiple-choice":
all_choices = sample["all_choices"]
response = random.choice(all_choices)
else:
response = "INVALID GENERATION FOR MULTIPLE IMAGE INPUTS"
if sample["question_type"] == "multiple-choice":
pred_ans = parse_multi_choice_response(
response, sample["all_choices"], sample["index2ans"]
)
else: # open question
pred_ans = response
out_samples[sample["id"]] = pred_ans
# set ground truth answer
answer_dict[sample["id"]] = {
"question_type": sample["question_type"],
"ground_truth": (
sample["correct_choice"]
if "correct_choice" in samples
else sample["answer"]
),
}
process_result(response, sample, answer_dict, out_samples)
args.output_path = f"{args.model_path}_val_sglang.json"
save_json(args.output_path, out_samples)
eval_result(output_path=args.output_path, answer_dict=answer_dict)
eval_result(model_answer_path=args.output_path, answer_dict=answer_dict)
backend.shutdown()
if __name__ == "__main__":