[VLM] Adopt fast image processor by default (#5065)

2025-04-12 12:46:58 +08:00
parent 611720919d
commit 34ef6c8135
12 changed files with 163 additions and 98 deletions
--- a/benchmark/mmmu/bench_sglang.py
+++ b/benchmark/mmmu/bench_sglang.py
@@ -89,5 +89,4 @@ if __name__ == "__main__":
    EvalArgs.add_cli_args(parser)
    args = add_common_sglang_args_and_parse(parser)
    args = parser.parse_args()
-
    eval_mmmu(args)
--- a/benchmark/mmmu/eval_utils.py
+++ b/benchmark/mmmu/eval_utils.py
@@ -7,6 +7,7 @@ import os
 import pprint
 import random
 import re
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Dict, Optional

 import numpy as np
@@ -117,29 +118,38 @@ def prepare_samples(eval_args: EvalArgs):
    # merge all dataset
    dataset = concatenate_datasets(sub_dataset_list)

-    ## prepare images
-    samples = []
-    skip_count = 0
-
-    # use image file as input to ensure the consistency between sglang and hf
+    # Prepare images in parallel; each image is saved to a file so that sglang and hf read the same input
    images_path = os.path.expanduser("~/.cache/mmmu/images")
    os.makedirs(images_path, exist_ok=True)
    print(f"Saving images to: {images_path}")

-    for i, sample in enumerate(tqdm(dataset)):
+    samples = []
+    skip_count = 0
+
+    def process_sample(i, sample):
        sample = process_single_sample(sample)
        sample = construct_prompt(sample, eval_args.config)
        image = sample["image"]
-
        width, height = image.size
        if width * height >= eval_args.image_pixels_limit:
-            skip_count += 1
-            continue
+            return None, True
        image_path = f"{images_path}/image_{i}.png"
        if not os.path.exists(image_path):
            image.save(image_path)
        sample["image_path"] = image_path
-        samples.append(sample)
+        return sample, False
+
+    with ThreadPoolExecutor() as executor:
+        futures = [
+            executor.submit(process_sample, i, sample)
+            for i, sample in enumerate(dataset)
+        ]
+        for future in tqdm(as_completed(futures), total=len(futures)):
+            sample, skipped = future.result()
+            if skipped:
+                skip_count += 1
+            elif sample:
+                samples.append(sample)

    print(
        f"skipping {skip_count} samples with large images, {round((float(skip_count) / len(dataset)) * 100, 2)}% of dataset"