diff --git a/benchmark/mmmu/bench_sglang.py b/benchmark/mmmu/bench_sglang.py index 26f585120..524beb7bc 100644 --- a/benchmark/mmmu/bench_sglang.py +++ b/benchmark/mmmu/bench_sglang.py @@ -125,7 +125,6 @@ async def eval_mmmu(args) -> None: client = openai.AsyncOpenAI( api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1" ) - semaphore = asyncio.Semaphore(args.concurrency) start = time.perf_counter() base_url = f"http://127.0.0.1:{args.port}" @@ -139,16 +138,26 @@ async def eval_mmmu(args) -> None: samples = samples[: args.profile_number] - tasks = [ - process_sample_with_semaphore( - semaphore, client, sample, sampling_params, lora_path - ) - for sample in samples - ] + if args.concurrency == 1: + # For concurrency == 1, run in sequential mode to ensure consistent order + # this is mainly for profiling + for sample in tqdm(samples): + _, response = await process_sample( + client, sample, sampling_params, lora_path + ) + process_result(response, sample, answer_dict, out_samples) + else: + semaphore = asyncio.Semaphore(args.concurrency) + tasks = [ + process_sample_with_semaphore( + semaphore, client, sample, sampling_params, lora_path + ) + for sample in samples + ] - for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks)): - sample, response = await coro - process_result(response, sample, answer_dict, out_samples) + for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks)): + sample, response = await coro + process_result(response, sample, answer_dict, out_samples) if args.profile: print("Stopping profiler...") diff --git a/benchmark/mmmu/eval_utils.py b/benchmark/mmmu/eval_utils.py index 48b9af4b1..2ec669155 100644 --- a/benchmark/mmmu/eval_utils.py +++ b/benchmark/mmmu/eval_utils.py @@ -27,8 +27,7 @@ from tqdm import tqdm class EvalArgs: seed: int = 42 split: str = "validation" - # Default setting to make the benchmark available on A100 for most 7B models - image_pixels_limit: int = 4300000 + image_pixels_limit: int = -1 result_filename: str = "" prompt_format_file: str = "prompt_format.yaml" dataset_path: str = "MMMU/MMMU" @@ -190,7 +189,7 @@ def prepare_samples(eval_args: EvalArgs): sample = construct_prompt(sample, eval_args.config) image = sample["image"] width, height = image.size - if width * height >= eval_args.image_pixels_limit: + if 0 < eval_args.image_pixels_limit <= width * height: return None, True # Use a unique identifier for the image path to avoid potential collisions if indices reset image_path = f"{images_path}/image_{sample['id']}.png" @@ -217,6 +216,8 @@ def prepare_samples(eval_args: EvalArgs): elif sample: samples.append(sample) + samples.sort(key=lambda x: x["final_input_prompt"]) + print( f"Skipping {skip_count} samples with large images, {round((float(skip_count) / len(dataset)) * 100, 2)}% of dataset" )