adapt to sglang v0.5.2rc1 on dcu

2025-09-04 15:56:33 +08:00
commit 909abb58f5
2320 changed files with 489411 additions and 0 deletions
--- a/benchmark/mmmu/README.md
+++ b/benchmark/mmmu/README.md
@@ -0,0 +1,46 @@
+## Run evaluation
+
+### Evaluate sglang
+
+Host the VLM:
+
+```
+python -m sglang.launch_server --model-path Qwen/Qwen2-VL-7B-Instruct --port 30000
+```
+
+It's recommended to reduce the memory usage by appending something like `--mem-fraction-static 0.6` to the command above.
+
+Benchmark:
+
+```
+python benchmark/mmmu/bench_sglang.py --port 30000 --concurrency 16
+```
+
+You can adjust the `--concurrency` to control the number of concurrent OpenAI calls.
+
+You can use `--lora-path` to specify the LoRA adapter to apply during benchmarking. E.g.,
+```
+# Launch server with LoRA enabled
+python -m sglang.launch_server --model-path microsoft/Phi-4-multimodal-instruct --port 30000 --trust-remote-code --disable-radix-cache --lora-paths vision=<LoRA path>
+
+# Apply LoRA adapter during inferencing
+python -m benchmark/mmmu/bench_sglang.py --concurrency 8 --lora-path vision
+```
+
+You can use `--response-answer-regex` to specify how to extract the answer from the response string. E.g.,
+```
+python3 -m sglang.launch_server --model-path zai-org/GLM-4.1V-9B-Thinking --reasoning-parser glm45
+
+python3 bench_sglang.py --response-answer-regex "<\|begin_of_box\|>(.*)<\|end_of_box\|>" --concurrency 64
+```
+
+You can use `--extra-request-body` to specify additional OpenAI request parameters. E.g.,
+```
+python3 bench_sglang.py --extra-request-body '{"max_new_tokens": 128, "temperature": 0.01}'
+```
+
+### Evaluate hf
+
+```
+python benchmark/mmmu/bench_hf.py --model-path Qwen/Qwen2-VL-7B-Instruct
+```
--- a/benchmark/mmmu/bench_hf.py
+++ b/benchmark/mmmu/bench_hf.py
@@ -0,0 +1,164 @@
+import argparse
+
+import PIL
+import torch
+from data_utils import save_json
+from eval_utils import (
+    EvalArgs,
+    eval_result,
+    get_sampling_params,
+    prepare_samples,
+    process_result,
+)
+from tqdm import tqdm
+from transformers import AutoModel, AutoProcessor, GenerationConfig
+
+
+@torch.no_grad()
+def eval_mmmu(args):
+    eval_args = EvalArgs.from_cli_args(args)
+
+    sampling_params = get_sampling_params(eval_args)
+    generation_config = GenerationConfig(
+        max_new_tokens=sampling_params["max_new_tokens"],
+        do_sample=False,
+    )
+
+    try:
+        from transformers import AutoModelForImageTextToText
+
+        model = AutoModelForImageTextToText.from_pretrained(
+            args.model_path,
+            torch_dtype="auto",
+            trust_remote_code=True,
+        )
+    except Exception as first_exception:
+        try:
+            # check if the model is belongs to internvl
+            if "InternVL" in args.model_path:
+                from internvl_utils import load_image
+                from transformers import AutoTokenizer
+
+                tokenizer = AutoTokenizer.from_pretrained(args.model_path)
+                model = AutoModel.from_pretrained(
+                    args.model_path,
+                    torch_dtype="auto",
+                    trust_remote_code=True,
+                )
+                generation_config_internvl = dict(
+                    max_new_tokens=sampling_params["max_new_tokens"], do_sample=False
+                )
+
+            else:
+                model = AutoModel.from_pretrained(
+                    args.model_path,
+                    torch_dtype="auto",
+                    trust_remote_code=True,
+                    init_tts=False,
+                )
+        except Exception as second_exception:
+            raise RuntimeError(
+                f"Failed to load model: First attempt failed with {first_exception}, "
+                f"second attempt failed with {second_exception}"
+            ) from second_exception
+
+    model = model.eval().cuda()
+
+    processor = AutoProcessor.from_pretrained(
+        args.model_path, torch_dtype="auto", device_map="auto", trust_remote_code=True
+    )
+
+    samples = prepare_samples(eval_args)
+    out_samples = dict()
+
+    answer_dict = {}
+    for sample in tqdm(samples):
+        prompt = sample["final_input_prompt"]
+        image = sample["image"]
+        prefix = prompt.split("<")[0]
+        suffix = prompt.split(">")[1]
+        assert image is not None
+
+        if "InternVL" in args.model_path:
+            pixel_values = load_image(sample["image_path"]).to(torch.bfloat16).cuda()
+            contents = ""
+            if prefix:
+                contents += prefix
+            contents += "<image>\n"
+            if suffix:
+                contents += suffix
+            response = model.chat(
+                tokenizer, pixel_values, contents, generation_config_internvl
+            )
+            print(f"response: {response}")
+            process_result(response, sample, answer_dict, out_samples)
+            continue
+
+        contents = []
+        if prefix:
+            contents += [{"type": "text", "text": prefix}]
+        contents += [
+            {
+                "type": "image",
+                "image": sample["image_path"],
+            }
+        ]
+        if suffix:
+            contents += [{"type": "text", "text": suffix}]
+        messages = [{"role": "user", "content": contents}]
+        try:
+            model_inputs = processor.tokenizer.apply_chat_template(
+                messages,
+                tokenize=True,
+                return_dict=True,
+                add_generation_prompt=True,
+                return_tensors="pt",
+            ).to(model.device)
+            input_len = model_inputs["input_ids"].shape[-1]
+            generation = model.generate(
+                **model_inputs, generation_config=generation_config
+            )
+            generation = generation[0][input_len:]
+            response = processor.decode(generation, skip_special_tokens=True)
+        except:
+            contents = []
+            if prefix:
+                contents += [prefix]
+            image = PIL.Image.open(sample["image_path"])
+            contents += [image]
+            if suffix:
+                contents += [suffix]
+            messages = [{"role": "user", "content": contents}]
+            response = model.chat(
+                msgs=messages,
+                tokenizer=processor.tokenizer,
+                sampling=False,
+                max_new_tokens=sampling_params["max_new_tokens"],
+                use_tts_template=False,
+                generate_audio=False,
+                temperature=0.0,
+            )
+        print(f"response: {response}")
+        process_result(response, sample, answer_dict, out_samples)
+
+    args.output_path = f"{args.model_path}_answer_hf.json"
+    save_json(args.output_path, out_samples)
+    eval_result(
+        model_answer_path=args.output_path,
+        answer_dict=answer_dict,
+        eval_output_path=f"{args.model_path}_val_hf.json",
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model-path",
+        type=str,
+        help="The path of the model weights. This can be a local folder or a Hugging Face repo ID.",
+        required=True,
+    )
+    EvalArgs.add_cli_args(parser)
+    args = parser.parse_args()
+
+    eval_mmmu(args)
--- a/benchmark/mmmu/bench_sglang.py
+++ b/benchmark/mmmu/bench_sglang.py
@@ -0,0 +1,212 @@
+"""
+Bench the sglang-hosted vLM with benchmark MMMU
+
+Usage:
+    Host the VLM: python -m sglang.launch_server --model-path Qwen/Qwen2-VL-7B-Instruct --port 30000
+
+    Benchmark: python benchmark/mmmu/bench_sglang.py --port 30000 --concurrency 16
+
+The eval output will be logged
+"""
+
+import argparse
+import asyncio
+import re
+import sys
+import time
+import traceback
+from dataclasses import dataclass, field
+from typing import Any, List, Optional, Tuple
+
+import aiohttp
+import openai
+from data_utils import save_json
+from eval_utils import (
+    EvalArgs,
+    eval_result,
+    get_sampling_params,
+    prepare_samples,
+    process_result,
+)
+from tqdm import tqdm
+
+from sglang.test.test_utils import add_common_sglang_args_and_parse
+
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=20 * 60 * 60)
+
+
+@dataclass
+class RequestFuncOutput:
+    generated_text: List[str] = field(default_factory=list)
+    prompt_len: List[int] = field(default_factory=list)
+    output_len: List[int] = field(default_factory=list)
+    latency: List[float] = field(default_factory=list)
+    ttft: List[float] = field(default_factory=list)
+    itl: List[float] = field(default_factory=list)  # List of inter-token latencies
+
+    success: bool = False
+    error: str = ""
+
+
+async def async_request_profile(api_url: str) -> RequestFuncOutput:
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        output = RequestFuncOutput()
+        try:
+            async with session.post(url=api_url) as response:
+                if response.status == 200:
+                    output.success = True
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    return output
+
+
+def _get_prefix_suffix(prompt: str) -> Tuple[str, str]:
+    """Split the prompt into prefix and suffix."""
+    prefix = prompt.split("<")[0]
+    suffix = prompt.split(">", 1)[1]
+    return prefix, suffix
+
+
+async def process_sample(
+    client: Any, sample: dict, sampling_params: dict, lora_path: Optional[str] = None
+) -> Tuple[dict, str]:
+    """Send a single sample to the LLM and return (sample, response)."""
+    prompt = sample["final_input_prompt"]
+    prefix, suffix = _get_prefix_suffix(prompt)
+    image = sample["image"]
+    assert image is not None
+    image_path = sample["image_path"]
+    extra_body = None if lora_path is None else {"lora_path": lora_path}
+    response = await client.chat.completions.create(
+        model="default",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prefix},
+                    {"type": "image_url", "image_url": {"url": image_path}},
+                    {"type": "text", "text": suffix},
+                ],
+            }
+        ],
+        temperature=0,
+        max_completion_tokens=sampling_params["max_new_tokens"],
+        max_tokens=sampling_params["max_new_tokens"],
+        extra_body=extra_body,
+    )
+    return sample, response.choices[0].message.content
+
+
+async def process_sample_with_semaphore(
+    semaphore: asyncio.Semaphore,
+    client: Any,
+    sample: dict,
+    sampling_params: dict,
+    lora_path: Optional[str] = None,
+) -> Tuple[dict, str]:
+    """Wrap process_sample with a semaphore for concurrency control."""
+    async with semaphore:
+        return await process_sample(client, sample, sampling_params, lora_path)
+
+
+async def eval_mmmu(args) -> None:
+    """Main evaluation loop with concurrency control."""
+    eval_args = EvalArgs.from_cli_args(args)
+    sampling_params = get_sampling_params(eval_args)
+    samples = prepare_samples(eval_args)
+    lora_path = eval_args.lora_path
+    answer_dict = {}
+    out_samples = {}
+    client = openai.AsyncOpenAI(
+        api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1"
+    )
+    start = time.perf_counter()
+    base_url = f"http://127.0.0.1:{args.port}"
+
+    if args.profile:
+        print("Starting profiler...")
+        profile_output = await async_request_profile(
+            api_url=f"{base_url}/start_profile"
+        )
+        if profile_output.success:
+            print("Profiler started")
+
+        samples = samples[: args.profile_number]
+
+    if args.concurrency == 1:
+        # For concurrency == 1, run in sequential mode to ensure consistent order
+        # this is mainly for profiling
+        for sample in tqdm(samples):
+            _, response = await process_sample(
+                client, sample, sampling_params, lora_path
+            )
+            answer = (
+                re.search(args.response_answer_regex, response)
+                if response is not None
+                else None
+            )
+            process_result(
+                answer.group(1) if answer else response,
+                sample,
+                answer_dict,
+                out_samples,
+            )
+    else:
+        semaphore = asyncio.Semaphore(args.concurrency)
+        tasks = [
+            process_sample_with_semaphore(
+                semaphore, client, sample, sampling_params, lora_path
+            )
+            for sample in samples
+        ]
+
+        for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
+            sample, response = await coro
+            answer = (
+                re.search(args.response_answer_regex, response)
+                if response is not None
+                else None
+            )
+            process_result(
+                answer.group(1) if answer else response,
+                sample,
+                answer_dict,
+                out_samples,
+            )
+
+    if args.profile:
+        print("Stopping profiler...")
+        profile_output = await async_request_profile(api_url=f"{base_url}/stop_profile")
+        if profile_output.success:
+            print("Profiler stopped")
+
+    print(f"Benchmark time: {time.perf_counter() - start}")
+    args.output_path = "./answer_sglang.json"
+    save_json(args.output_path, out_samples)
+    eval_result(
+        model_answer_path=args.output_path,
+        answer_dict=answer_dict,
+        eval_output_path="./val_sglang.json",
+    )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    EvalArgs.add_cli_args(parser)
+    args = add_common_sglang_args_and_parse(parser)
+    return args
+
+
+def main():
+    args = parse_args()
+    asyncio.run(eval_mmmu(args))
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmark/mmmu/data_utils.py
+++ b/benchmark/mmmu/data_utils.py
@@ -0,0 +1,221 @@
+"""Utils for data load, save, and process (e.g., prompt construction)"""
+
+import json
+import os
+import re
+
+import yaml
+
+DOMAIN_CAT2SUB_CAT = {
+    "Art and Design": ["Art", "Art_Theory", "Design", "Music"],
+    "Business": ["Accounting", "Economics", "Finance", "Manage", "Marketing"],
+    "Science": [
+        "Biology",
+        "Chemistry",
+        "Geography",
+        "Math",
+        "Physics",
+    ],
+    "Health and Medicine": [
+        "Basic_Medical_Science",
+        "Clinical_Medicine",
+        "Diagnostics_and_Laboratory_Medicine",
+        "Pharmacy",
+        "Public_Health",
+    ],
+    "Humanities and Social Science": [
+        "History",
+        "Literature",
+        "Sociology",
+        "Psychology",
+    ],
+    "Tech and Engineering": [
+        "Agriculture",
+        "Architecture_and_Engineering",
+        "Computer_Science",
+        "Electronics",
+        "Energy_and_Power",
+        "Materials",
+        "Mechanical_Engineering",
+    ],
+}
+
+
+CAT_SHORT2LONG = {
+    "acc": "Accounting",
+    "agri": "Agriculture",
+    "arch": "Architecture_and_Engineering",
+    "art": "Art",
+    "art_theory": "Art_Theory",
+    "bas_med": "Basic_Medical_Science",
+    "bio": "Biology",
+    "chem": "Chemistry",
+    "cli_med": "Clinical_Medicine",
+    "cs": "Computer_Science",
+    "design": "Design",
+    "diag_med": "Diagnostics_and_Laboratory_Medicine",
+    "econ": "Economics",
+    "elec": "Electronics",
+    "ep": "Energy_and_Power",
+    "fin": "Finance",
+    "geo": "Geography",
+    "his": "History",
+    "liter": "Literature",
+    "manage": "Manage",
+    "mark": "Marketing",
+    "mate": "Materials",
+    "math": "Math",
+    "mech": "Mechanical_Engineering",
+    "music": "Music",
+    "phar": "Pharmacy",
+    "phys": "Physics",
+    "psy": "Psychology",
+    "pub_health": "Public_Health",
+    "socio": "Sociology",
+}
+
+
+# DATA SAVING
+def save_json(filename, ds):
+    with open(filename, "w") as f:
+        json.dump(ds, f, indent=4)
+
+
+def get_multi_choice_info(options):
+    """
+    Given the list of options for multiple choice question
+    Return the index2ans and all_choices
+    """
+
+    start_chr = "A"
+    all_choices = []
+    index2ans = {}
+    for i, option in enumerate(options):
+        index2ans[chr(ord(start_chr) + i)] = option
+        all_choices.append(chr(ord(start_chr) + i))
+
+    return index2ans, all_choices
+
+
+def load_yaml(file_path):
+    with open(file_path, "r") as stream:
+        try:
+            yaml_dict = yaml.safe_load(stream)
+        except yaml.YAMLError as exc:
+            print(exc)
+
+    return yaml_dict
+
+
+def parse_img_path(text):
+    matches = re.findall("<img='(.*?)'>", text)
+    return matches
+
+
+def process_single_sample(data):
+    question = data["question"]
+    o_imgs_paths = []
+    for option in data["options"]:
+        current_o_imgs_paths = parse_img_path(option)
+        for img_path in current_o_imgs_paths:
+            o_imgs_paths.append(img_path)
+
+    if len(o_imgs_paths) > 1:  # multiple images in options, used for random selection
+        return {
+            "id": data["id"],
+            "question": question,
+            "options": data["options"],
+            "answer": data["answer"],
+            "image": None,
+            "question_type": data["question_type"],
+        }
+    else:
+        return {
+            "id": data["id"],
+            "question": question,
+            "options": data["options"],
+            "answer": data["answer"],
+            "image": data["image_1"],
+            "question_type": data["question_type"],
+        }
+
+
+# DATA SAVING
+def save_json(filename, ds):
+    print(f"answers saved to: {filename}")
+    os.makedirs(os.path.dirname(filename), exist_ok=True)
+    with open(filename, "w") as f:
+        json.dump(ds, f, indent=4)
+
+
+def save_jsonl(filename, data):
+    """
+    Save a dictionary of data to a JSON Lines file with the filename as key and caption as value.
+
+    Args:
+        filename (str): The path to the file where the data should be saved.
+        data (dict): The dictionary containing the data to save where key is the image path and value is the caption.
+    """
+    with open(filename, "w", encoding="utf-8") as f:
+        for img_path, caption in data.items():
+            # Extract the base filename without the extension
+            base_filename = os.path.basename(img_path)
+            # Create a JSON object with the filename as the key and caption as the value
+            json_record = json.dumps({base_filename: caption}, ensure_ascii=False)
+            # Write the JSON object to the file, one per line
+            f.write(json_record + "\n")
+
+
+def save_args(args, path_dir):
+    argsDict = args.__dict__
+    with open(path_dir + "setting.txt", "w") as f:
+        f.writelines("------------------ start ------------------" + "\n")
+        for eachArg, value in argsDict.items():
+            f.writelines(eachArg + " : " + str(value) + "\n")
+        f.writelines("------------------- end -------------------")
+
+
+# DATA PROCESSING
+def construct_prompt(sample, config):
+    question = sample["question"]
+    options = eval(sample["options"])
+    example = ""
+    if sample["question_type"] == "multiple-choice":
+        start_chr = "A"
+        prediction_range = []
+        index2ans = {}
+        for option in options:
+            prediction_range.append(start_chr)
+            example += f"({start_chr}) {option}\n"
+            index2ans[start_chr] = option
+            start_chr = chr(ord(start_chr) + 1)
+        empty_prompt_sample_structure = config["multi_choice_example_format"]
+        empty_prompt = empty_prompt_sample_structure.format(question, example)
+        res_dict = {}
+        res_dict["index2ans"] = index2ans
+        res_dict["correct_choice"] = sample["answer"]
+        res_dict["all_choices"] = prediction_range
+        res_dict["empty_prompt"] = empty_prompt
+        if config["task_instructions"]:
+            res_dict["final_input_prompt"] = (
+                config["task_instructions"].strip() + "\n\n" + empty_prompt
+            )
+        else:
+            res_dict["final_input_prompt"] = empty_prompt
+
+        res_dict["gt_content"] = options[ord(sample["answer"].upper()) - ord("A")]
+    else:
+        empty_prompt_sample_structure = config["short_ans_example_format"]
+        empty_prompt = empty_prompt_sample_structure.format(question)
+        res_dict = {}
+        res_dict["empty_prompt"] = empty_prompt
+        if config["task_instructions"]:
+            res_dict["final_input_prompt"] = (
+                config["task_instructions"].strip() + "\n\n" + empty_prompt
+            )
+        else:
+            res_dict["final_input_prompt"] = empty_prompt
+        res_dict["gt_content"] = sample["answer"]
+
+    res_dict.update(sample)
+    return res_dict
--- a/benchmark/mmmu/eval_utils.py
+++ b/benchmark/mmmu/eval_utils.py
@@ -0,0 +1,649 @@
+"""Response Parsing and Evaluation for various models"""
+
+import argparse
+import dataclasses
+import json
+import os
+import pprint
+import random
+import re
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Dict, Optional
+
+import numpy as np
+import torch
+from data_utils import (
+    CAT_SHORT2LONG,
+    DOMAIN_CAT2SUB_CAT,
+    construct_prompt,
+    load_yaml,
+    process_single_sample,
+)
+from datasets import concatenate_datasets, load_dataset
+from tqdm import tqdm
+
+
+@dataclasses.dataclass
+class EvalArgs:
+    seed: int = 42
+    split: str = "validation"
+    image_pixels_limit: int = -1
+    result_filename: str = ""
+    prompt_format_file: str = "prompt_format.yaml"
+    dataset_path: str = "MMMU/MMMU"
+    extra_request_body: Optional[str] = None
+    profile: bool = False
+    profile_number: int = 5
+    concurrency: int = 1
+    response_answer_regex: str = "(.*)"
+    lora_path: Optional[str] = None
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument(
+            "--result-filename",
+            type=str,
+            default=EvalArgs.result_filename,
+            help="The filename to save the evaluation results.",
+        )
+        parser.add_argument(
+            "--image-pixels-limit",
+            type=int,
+            default=EvalArgs.image_pixels_limit,
+            help="The maximum number of pixels allowed for an image. If an image exceeds this limit, it will be skipped during evaluation.",
+        )
+        parser.add_argument(
+            "--dataset-path",
+            type=str,
+            default=EvalArgs.dataset_path,
+            help="path to the dataset",
+        )
+        parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+        parser.add_argument(
+            "--prompt-format-file",
+            type=str,
+            help="The path to the prompt format of mmmu. If not, a default format llava_config.yaml will be used",
+        )
+        parser.add_argument(
+            "--split",
+            type=str,
+            default=EvalArgs.split,
+            help='Split of the dataset to use for evaluation. Default is "validation".',
+        )
+        parser.add_argument(
+            "--extra-request-body",
+            metavar='{"key1": "value1", "key2": "value2"}',
+            type=str,
+            default=EvalArgs.extra_request_body,
+            help="Append given JSON object to the request payload. You can use this to specify"
+            "additional generate params like sampling params.",
+        )
+        parser.add_argument(
+            "--profile", action="store_true", help="enable mmmu profile"
+        )
+        parser.add_argument(
+            "--profile-number",
+            type=int,
+            default=EvalArgs.profile_number,
+            help="Number of samples to profile. If not set, will profile all samples.",
+        )
+        parser.add_argument(
+            "--concurrency",
+            type=int,
+            default=EvalArgs.concurrency,
+            help="Number of concurrent requests to make during evaluation. Default is 1, which means no concurrency.",
+        )
+        parser.add_argument(
+            "--response-answer-regex",
+            type=str,
+            default=EvalArgs.response_answer_regex,
+            help="Specific regex to capture the answer from the response, string",
+        )
+        parser.add_argument(
+            "--lora-path",
+            type=str,
+            default=EvalArgs.lora_path,
+            help="Specify the LoRA path to use for evaluation. If specified, the value will be specified in the body of every request as `lora-path`.",
+        )
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        attrs = [attr.name for attr in dataclasses.fields(cls)]
+        return cls(**{attr: getattr(args, attr) for attr in attrs})
+
+
+def set_seed(seed_value):
+    """
+    Set the seed for PyTorch (both CPU and CUDA), Python, and NumPy for reproducible results.
+
+    :param seed_value: An integer value to be used as the seed.
+    """
+    torch.manual_seed(seed_value)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed_value)
+        torch.cuda.manual_seed_all(seed_value)  # For multi-GPU setups
+    random.seed(seed_value)
+    np.random.seed(seed_value)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+
+def prepare_samples(eval_args: EvalArgs):
+    print("Preparing samples...")
+    # Build prompts
+    set_seed(eval_args.seed)
+
+    prompt_format_file = (
+        eval_args.prompt_format_file
+        if eval_args.prompt_format_file is not None
+        else os.path.join(os.path.dirname(__file__), "prompt_format.yaml")
+    )
+    # load config and process to one value
+    eval_args.config = load_yaml(prompt_format_file)
+    for key, value in eval_args.config.items():
+        if key != "eval_params" and type(value) == list:
+            assert len(value) == 1, "key {} has more than one value".format(key)
+            eval_args.config[key] = value[0]
+
+    # run for each subject in parallel
+    sub_dataset_list = []
+    subjects = list(CAT_SHORT2LONG.values())  # Get a fixed list of subjects
+
+    print(f"Loading datasets for {len(subjects)} subjects...")
+    with ThreadPoolExecutor() as executor:
+        # Submit all load_dataset tasks
+        future_to_subject = {
+            executor.submit(
+                load_dataset, eval_args.dataset_path, subject, split=eval_args.split
+            ): subject
+            for subject in subjects
+        }
+
+        # Collect results as they complete
+        results = {}
+        for future in tqdm(
+            as_completed(future_to_subject),
+            total=len(subjects),
+            desc="Loading datasets",
+        ):
+            subject = future_to_subject[future]
+            try:
+                results[subject] = future.result()
+            except Exception as exc:
+                print(f"{subject} generated an exception: {exc}")
+
+    # Ensure datasets are added in the original order for consistency
+    for subject in subjects:
+        if subject in results:
+            sub_dataset_list.append(results[subject])
+        else:
+            # Handle cases where a dataset failed to load (optional, depends on desired behavior)
+            print(f"Warning: Dataset for subject '{subject}' could not be loaded.")
+
+    # merge all dataset
+    dataset = concatenate_datasets(sub_dataset_list)
+
+    # Prepare images in parallel
+    images_path = os.path.expanduser("~/.cache/mmmu/images")
+    os.makedirs(images_path, exist_ok=True)
+    print(f"Saving images to: {images_path}")
+
+    samples = []
+    skip_count = 0
+
+    def process_sample(i, sample):
+        sample = process_single_sample(sample)
+        sample = construct_prompt(sample, eval_args.config)
+        image = sample["image"]
+        width, height = image.size
+        if 0 < eval_args.image_pixels_limit <= width * height:
+            return None, True
+        # Use a unique identifier for the image path to avoid potential collisions if indices reset
+        image_path = f"{images_path}/image_{sample['id']}.png"
+        if not os.path.exists(image_path):
+            image.save(image_path)
+        sample["image_path"] = image_path
+        return sample, False
+
+    print("Processing samples...")
+    with ThreadPoolExecutor() as executor:
+        # Pass the sample itself to process_sample, index is less reliable now
+        futures = [
+            executor.submit(
+                process_sample, i, sample
+            )  # Keep index i for tqdm maybe? Or remove it. Let's keep it for now.
+            for i, sample in enumerate(dataset)
+        ]
+        for future in tqdm(
+            as_completed(futures), total=len(dataset), desc="Processing samples"
+        ):
+            sample, skipped = future.result()
+            if skipped:
+                skip_count += 1
+            elif sample:
+                samples.append(sample)
+
+    samples.sort(key=lambda x: x["final_input_prompt"])
+
+    print(
+        f"Skipping {skip_count} samples with large images, {round((float(skip_count) / len(dataset)) * 100, 2)}% of dataset"
+    )
+    print("Samples have been prepared")
+    return samples
+
+
+def get_sampling_params(eval_args):
+    max_new_tokens = 30
+    temperature = 0.001
+
+    extra_request_body = {}
+    if eval_args.extra_request_body:
+        extra_request_body = json.loads(eval_args.extra_request_body)
+
+    return {
+        "temperature": temperature,
+        "max_new_tokens": max_new_tokens,
+        **extra_request_body,
+    }
+
+
+# ----------- Process Multi-choice -------------
+def parse_multi_choice_response(response, all_choices, index2ans):
+    """
+    Parse the prediction from the generated response.
+    Return the predicted index e.g., A, B, C, D.
+    """
+    for char in [",", ".", "!", "?", ";", ":", "'"]:
+        response = response.strip(char)
+    response = " " + response + " "  # add space to avoid partial match
+
+    index_ans = True
+    ans_with_brack = False
+    candidates = []
+    for choice in all_choices:  # e.g., (A) (B) (C) (D)
+        if f"({choice})" in response:
+            candidates.append(choice)
+            ans_with_brack = True
+
+    if len(candidates) == 0:
+        for choice in all_choices:  # e.g., A B C D
+            if f" {choice} " in response:
+                candidates.append(choice)
+
+    # if all above doesn't get candidates, check if the content is larger than 5 tokens and try to parse the example
+    if len(candidates) == 0 and len(response.split()) > 5:
+        for index, ans in index2ans.items():
+            if ans.lower() in response.lower():
+                candidates.append(index)
+                index_ans = False  # it's content ans.
+
+    if len(candidates) == 0:  # still not get answer, randomly choose one.
+        pred_index = random.choice(all_choices)
+    elif len(candidates) > 1:
+        start_indexes = []
+        if index_ans:
+            if ans_with_brack:
+                for can in candidates:
+                    index = response.rfind(f"({can})")
+                    start_indexes.append(index)  # -1 will be ignored anyway
+                # start_indexes = [generated_response.index(f'({can})') for can in candidates]
+            else:
+                for can in candidates:
+                    index = response.rfind(f" {can} ")
+                    start_indexes.append(index)
+        else:
+            for can in candidates:
+                index = response.lower().rfind(index2ans[can].lower())
+                start_indexes.append(index)
+        # get the last one
+        pred_index = candidates[np.argmax(start_indexes)]
+    else:  # if only one candidate, use it.
+        pred_index = candidates[0]
+
+    return pred_index
+
+
+# ----------- Process Open -------------
+def check_is_number(string):
+    """
+    Check if the given string a number.
+    """
+    try:
+        float(string.replace(",", ""))
+        return True
+    except ValueError:
+        # check if there's comma inside
+        return False
+
+
+def normalize_str(string):
+    """
+    Normalize the str to lower case and make them float numbers if possible.
+    """
+    # check if characters in the string
+
+    # if number, numerize it.
+    string = string.strip()
+
+    is_number = check_is_number(string)
+
+    if is_number:
+        string = string.replace(",", "")
+        string = float(string)
+        # leave 2 decimal
+        string = round(string, 2)
+        return [string]
+    else:  # it's likely to be a string
+        # lower it
+        string = string.lower()
+        if len(string) == 1:
+            return [" " + string, string + " "]  # avoid trivial matches
+        return [string]
+
+
+def extract_numbers(string):
+    """
+    Exact all forms of numbers from a string with regex.
+    """
+    # Pattern for numbers with commas
+    pattern_commas = r"-?\b\d{1,3}(?:,\d{3})+\b"
+    # Pattern for scientific notation
+    pattern_scientific = r"-?\d+(?:\.\d+)?[eE][+-]?\d+"
+    # Pattern for simple numbers without commas
+    pattern_simple = r"-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d])"
+
+    # Extract numbers with commas
+    numbers_with_commas = re.findall(pattern_commas, string)
+    # Extract numbers in scientific notation
+    numbers_scientific = re.findall(pattern_scientific, string)
+    # Extract simple numbers without commas
+    numbers_simple = re.findall(pattern_simple, string)
+
+    # Combine all extracted numbers
+    all_numbers = numbers_with_commas + numbers_scientific + numbers_simple
+    return all_numbers
+
+
+def parse_open_response(response):
+    """
+    Parse the prediction from the generated response.
+    Return a list of predicted strings or numbers.
+    """
+
+    # content = content.strip("\n").strip(".").strip(" ")
+    def get_key_subresponses(response):
+        key_responses = []
+        response = response.strip().strip(".").lower()
+        sub_responses = re.split(r"\.\s(?=[A-Z])|\n", response)
+        indicators_of_keys = [
+            "could be ",
+            "so ",
+            "is ",
+            "thus ",
+            "therefore ",
+            "final ",
+            "answer ",
+            "result ",
+        ]
+        key_responses = []
+        for index, resp in enumerate(sub_responses):
+            # if last one, accept it's an equation (the entire response can be just one sentence with equation)
+            if index == len(sub_responses) - 1:
+                indicators_of_keys.extend(["="])
+            shortest_key_response = None  # the shortest response that may contain the answer (tail part of the response)
+            for indicator in indicators_of_keys:
+                if indicator in resp:
+                    if not shortest_key_response:
+                        shortest_key_response = resp.split(indicator)[-1].strip()
+                    else:
+                        if len(resp.split(indicator)[-1].strip()) < len(
+                            shortest_key_response
+                        ):
+                            shortest_key_response = resp.split(indicator)[-1].strip()
+                    # key_responses.append(resp.split(indicator)[1].strip())
+
+            if shortest_key_response:
+                # and it's not trivial
+                if shortest_key_response.strip() not in [
+                    ":",
+                    ",",
+                    ".",
+                    "!",
+                    "?",
+                    ";",
+                    ":",
+                    "'",
+                ]:
+                    key_responses.append(shortest_key_response)
+        if len(key_responses) == 0:  # did not found any
+            return [response]
+        return key_responses
+
+    # pdb.set_trace()
+    key_responses = get_key_subresponses(response)
+
+    pred_list = key_responses.copy()  # keep the original string response
+    for resp in key_responses:
+        pred_list.extend(extract_numbers(resp))
+
+    tmp_pred_list = []
+    for i in range(len(pred_list)):
+        tmp_pred_list.extend(normalize_str(pred_list[i]))
+    pred_list = tmp_pred_list
+
+    # remove duplicates
+    pred_list = list(set(pred_list))
+
+    return pred_list
+
+
+# ----------- Evaluation -------------
+
+
+def eval_multi_choice(gold_i, pred_i):
+    """
+    Evaluate a multiple choice instance.
+    """
+    correct = False
+    # only they are exactly the same, we consider it as correct
+    if isinstance(gold_i, list):
+        for answer in gold_i:
+            if answer == pred_i:
+                correct = True
+                break
+    else:  # gold_i is a string
+        if gold_i == pred_i:
+            correct = True
+    return correct
+
+
+def eval_open(gold_i, pred_i):
+    """
+    Evaluate an open question instance
+    """
+    correct = False
+    if isinstance(gold_i, list):
+        # use float to avoid trivial matches
+        norm_answers = []
+        for answer in gold_i:
+            norm_answers.extend(normalize_str(answer))
+    else:
+        norm_answers = normalize_str(gold_i)
+    for pred in pred_i:  # pred is already normalized in parse response phase
+        if isinstance(pred, str):  # if it's a string, then find if ans in the pred_i
+            for norm_ans in norm_answers:
+                # only see if the string answer in the string pred
+                if isinstance(norm_ans, str) and norm_ans in pred:
+                    if not correct:
+                        correct = True
+                    break
+        else:  # it's a float number
+            if pred in norm_answers:
+                if not correct:
+                    correct = True
+                break
+    return correct
+
+
+# ----------- Batch Evaluation -------------
+def evaluate(samples):
+    """
+    Batch evaluation for multiple choice and open questions.
+    """
+    pred_correct = 0
+    judge_dict = dict()
+    for sample in samples:
+        gold_i = sample["answer"]
+        pred_i = sample["parsed_pred"]
+        if sample["question_type"] == "multiple-choice":
+            correct = eval_multi_choice(gold_i, pred_i)
+        else:  # open question
+            correct = eval_open(gold_i, pred_i)
+
+        if correct:
+            judge_dict[sample["id"]] = "Correct"
+            pred_correct += 1
+        else:
+            # print(f"Wrong! expected {pred_i}, answered with {gold_i}")
+            judge_dict[sample["id"]] = "Wrong"
+
+    if len(samples) == 0:
+        return {"acc": 0}
+    return judge_dict, {"acc": pred_correct / len(samples)}
+
+
+# ----------- Calculate Accuracy -------------
+def calculate_ins_level_acc(results: Dict):
+    """Calculate the instruction level accuracy for given Subject results"""
+    acc = 0
+    ins_num = 0
+    for cat_results in results.values():
+        acc += cat_results["acc"] * cat_results["num_example"]
+        ins_num += cat_results["num_example"]
+    if ins_num == 0:
+        return 0
+    return acc / ins_num
+
+
+def process_result(response, sample, answer_dict, out_samples):
+    if response is None:
+        return
+    if sample["question_type"] == "multiple-choice":
+        pred_ans = parse_multi_choice_response(
+            response, sample["all_choices"], sample["index2ans"]
+        )
+    else:  # open question
+        pred_ans = response
+
+    out_samples[sample["id"]] = pred_ans
+
+    # set ground truth answer
+    answer_dict[sample["id"]] = {
+        "question_type": sample["question_type"],
+        "ground_truth": sample["answer"],
+    }
+
+
+def eval_result(model_answer_path, answer_dict, eval_output_path=None):
+    if eval_output_path is None:
+        eval_output_path = model_answer_path
+    print("Evaluating...")
+    output_dict = json.load(open(model_answer_path))
+    # answer_dict = json.load(open(answer_path))
+
+    # group by category
+    output_dict_w_cat = {}
+    for data_id, parsed_pred in output_dict.items():
+        category = "_".join(data_id.split("_")[1:-1])
+        if category not in output_dict_w_cat:
+            output_dict_w_cat.update({category: {}})
+        output_dict_w_cat[category].update({data_id: parsed_pred})
+
+    # group by category
+    answer_dict_w_cat = {}
+    for data_id, parsed_pred in answer_dict.items():
+        category = "_".join(data_id.split("_")[1:-1])
+        if category not in answer_dict_w_cat:
+            answer_dict_w_cat.update({category: {}})
+        answer_dict_w_cat[category].update({data_id: parsed_pred})
+
+    evaluation_result = {}
+
+    for category in CAT_SHORT2LONG.values():
+        # print("Evaluating: {}".format(category))
+        # get cat_outputs and cat_answers
+        try:
+            cat_outputs = output_dict_w_cat[category]
+            cat_answers = answer_dict_w_cat[category]
+        except KeyError:
+            # print("Skipping {} for not found".format(category))
+            continue
+
+        exampels_to_eval = []
+        for data_id, parsed_pred in cat_outputs.items():
+            question_type = cat_answers[data_id]["question_type"]
+            if question_type != "multiple-choice":
+                parsed_pred = parse_open_response(
+                    parsed_pred
+                )  # mainly for type consistency (make it number, etc.)
+            else:
+                parsed_pred = parsed_pred
+
+            exampels_to_eval.append(
+                {
+                    "id": data_id,
+                    "question_type": question_type,
+                    "answer": cat_answers[data_id]["ground_truth"],
+                    "parsed_pred": parsed_pred,
+                }
+            )
+
+        judge_dict, metric_dict = evaluate(exampels_to_eval)
+        metric_dict.update({"num_example": len(exampels_to_eval)})
+
+        evaluation_result[category] = metric_dict
+
+    printable_results = {}
+    # pdb.set_trace()
+    # add domain Subject
+    for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items():
+        in_domain_cat_results = {}
+        for cat_name in in_domain_cats:  # use the order in DOMAIN_CAT2SUB_CAT
+            if cat_name in evaluation_result.keys():
+                in_domain_cat_results[cat_name] = evaluation_result[cat_name]
+            else:
+                pass
+        in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results)
+        in_domain_data_num = sum(
+            [
+                cat_results["num_example"]
+                for cat_results in in_domain_cat_results.values()
+            ]
+        )
+        printable_results["Overall-" + domain] = {
+            "num": int(in_domain_data_num),
+            "acc": round(in_domain_ins_acc, 3),
+        }
+        # add sub category
+        for cat_name, cat_results in in_domain_cat_results.items():
+            printable_results[cat_name] = {
+                "num": int(cat_results["num_example"]),
+                "acc": round(cat_results["acc"], 3),
+            }
+
+    # table.append(["-----------------------------", "-----", "----"])
+    all_ins_acc = calculate_ins_level_acc(evaluation_result)
+    overall_acc = round(all_ins_acc, 3)
+    printable_results["Overall"] = {
+        "num": sum(
+            [cat_results["num_example"] for cat_results in evaluation_result.values()]
+        ),
+        "acc": overall_acc,
+    }
+    pprint.pprint(printable_results)
+    out = eval_output_path
+    with open(out, "w", encoding="utf-8") as outfile:
+        json.dump(printable_results, outfile)
+        print(f"eval out saved to {out}")
+
+    print(f"Overall accuracy: {overall_acc}")
--- a/benchmark/mmmu/internvl_utils.py
+++ b/benchmark/mmmu/internvl_utils.py
@@ -0,0 +1,94 @@
+# copy from https://huggingface.co/OpenGVLab/InternVL3-1B
+import torch
+import torchvision.transforms as T
+from PIL import Image
+from torchvision.transforms.functional import InterpolationMode
+
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+
+
+def build_transform(input_size):
+    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
+    transform = T.Compose(
+        [
+            T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
+            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+            T.ToTensor(),
+            T.Normalize(mean=MEAN, std=STD),
+        ]
+    )
+    return transform
+
+
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    best_ratio_diff = float("inf")
+    best_ratio = (1, 1)
+    area = width * height
+    for ratio in target_ratios:
+        target_aspect_ratio = ratio[0] / ratio[1]
+        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+        if ratio_diff < best_ratio_diff:
+            best_ratio_diff = ratio_diff
+            best_ratio = ratio
+        elif ratio_diff == best_ratio_diff:
+            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                best_ratio = ratio
+    return best_ratio
+
+
+def dynamic_preprocess(
+    image, min_num=1, max_num=12, image_size=448, use_thumbnail=False
+):
+    orig_width, orig_height = image.size
+    aspect_ratio = orig_width / orig_height
+
+    # calculate the existing image aspect ratio
+    target_ratios = set(
+        (i, j)
+        for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if i * j <= max_num and i * j >= min_num
+    )
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+    # find the closest aspect ratio to the target
+    target_aspect_ratio = find_closest_aspect_ratio(
+        aspect_ratio, target_ratios, orig_width, orig_height, image_size
+    )
+
+    # calculate the target width and height
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+    # resize the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size,
+        )
+        # split the image
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+    assert len(processed_images) == blocks
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+    return processed_images
+
+
+def load_image(image_file, input_size=448, max_num=12):
+    image = Image.open(image_file).convert("RGB")
+    transform = build_transform(input_size=input_size)
+    images = dynamic_preprocess(
+        image, image_size=input_size, use_thumbnail=True, max_num=max_num
+    )
+    pixel_values = [transform(image) for image in images]
+    pixel_values = torch.stack(pixel_values)
+    return pixel_values
--- a/benchmark/mmmu/prompt_format.yaml
+++ b/benchmark/mmmu/prompt_format.yaml
@@ -0,0 +1,15 @@
+task_instructions:
+- ""
+multi_choice_example_format:
+- "{}
+
+{}
+
+Answer with the option's letter from the given choices directly."
+
+short_ans_example_format:
+- "{}
+
+Answer the question using a single word or phrase."
+temperature:
+- 0