chore: improve mmmu benchmark (#7000)
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
This commit is contained in:
@@ -125,7 +125,6 @@ async def eval_mmmu(args) -> None:
|
|||||||
client = openai.AsyncOpenAI(
|
client = openai.AsyncOpenAI(
|
||||||
api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1"
|
api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1"
|
||||||
)
|
)
|
||||||
semaphore = asyncio.Semaphore(args.concurrency)
|
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
base_url = f"http://127.0.0.1:{args.port}"
|
base_url = f"http://127.0.0.1:{args.port}"
|
||||||
|
|
||||||
@@ -139,6 +138,16 @@ async def eval_mmmu(args) -> None:
|
|||||||
|
|
||||||
samples = samples[: args.profile_number]
|
samples = samples[: args.profile_number]
|
||||||
|
|
||||||
|
if args.concurrency == 1:
|
||||||
|
# For concurrency == 1, run in sequential mode to ensure consistent order
|
||||||
|
# this is mainly for profiling
|
||||||
|
for sample in tqdm(samples):
|
||||||
|
_, response = await process_sample(
|
||||||
|
client, sample, sampling_params, lora_path
|
||||||
|
)
|
||||||
|
process_result(response, sample, answer_dict, out_samples)
|
||||||
|
else:
|
||||||
|
semaphore = asyncio.Semaphore(args.concurrency)
|
||||||
tasks = [
|
tasks = [
|
||||||
process_sample_with_semaphore(
|
process_sample_with_semaphore(
|
||||||
semaphore, client, sample, sampling_params, lora_path
|
semaphore, client, sample, sampling_params, lora_path
|
||||||
|
|||||||
@@ -27,8 +27,7 @@ from tqdm import tqdm
|
|||||||
class EvalArgs:
|
class EvalArgs:
|
||||||
seed: int = 42
|
seed: int = 42
|
||||||
split: str = "validation"
|
split: str = "validation"
|
||||||
# Default setting to make the benchmark available on A100 for most 7B models
|
image_pixels_limit: int = -1
|
||||||
image_pixels_limit: int = 4300000
|
|
||||||
result_filename: str = ""
|
result_filename: str = ""
|
||||||
prompt_format_file: str = "prompt_format.yaml"
|
prompt_format_file: str = "prompt_format.yaml"
|
||||||
dataset_path: str = "MMMU/MMMU"
|
dataset_path: str = "MMMU/MMMU"
|
||||||
@@ -190,7 +189,7 @@ def prepare_samples(eval_args: EvalArgs):
|
|||||||
sample = construct_prompt(sample, eval_args.config)
|
sample = construct_prompt(sample, eval_args.config)
|
||||||
image = sample["image"]
|
image = sample["image"]
|
||||||
width, height = image.size
|
width, height = image.size
|
||||||
if width * height >= eval_args.image_pixels_limit:
|
if 0 < eval_args.image_pixels_limit <= width * height:
|
||||||
return None, True
|
return None, True
|
||||||
# Use a unique identifier for the image path to avoid potential collisions if indices reset
|
# Use a unique identifier for the image path to avoid potential collisions if indices reset
|
||||||
image_path = f"{images_path}/image_{sample['id']}.png"
|
image_path = f"{images_path}/image_{sample['id']}.png"
|
||||||
@@ -217,6 +216,8 @@ def prepare_samples(eval_args: EvalArgs):
|
|||||||
elif sample:
|
elif sample:
|
||||||
samples.append(sample)
|
samples.append(sample)
|
||||||
|
|
||||||
|
samples.sort(key=lambda x: x["final_input_prompt"])
|
||||||
|
|
||||||
print(
|
print(
|
||||||
f"Skipping {skip_count} samples with large images, {round((float(skip_count) / len(dataset)) * 100, 2)}% of dataset"
|
f"Skipping {skip_count} samples with large images, {round((float(skip_count) / len(dataset)) * 100, 2)}% of dataset"
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user