diff --git a/benchmark/mmmu/README.md b/benchmark/mmmu/README.md
index 54acee52a..62a5a2f12 100644
--- a/benchmark/mmmu/README.md
+++ b/benchmark/mmmu/README.md
@@ -18,6 +18,15 @@ python benchmark/mmmu/bench_sglang.py --port 30000 --concurrency 16
 
 You can adjust the `--concurrency` to control the number of concurrent OpenAI calls.
 
+You can use `--lora-path` to specify the LoRA adapter to apply during benchmarking. For example:
+```
+# Launch server with LoRA enabled
+python -m sglang.launch_server --model-path microsoft/Phi-4-multimodal-instruct --port 30000 --trust-remote-code --disable-radix-cache --lora-paths vision=
+
+# Apply the LoRA adapter during inference
+python benchmark/mmmu/bench_sglang.py --concurrency 8 --lora-path vision
+```
+
 ### Evaluate hf
 
 ```
diff --git a/benchmark/mmmu/bench_sglang.py b/benchmark/mmmu/bench_sglang.py
index a177fd137..26f585120 100644
--- a/benchmark/mmmu/bench_sglang.py
+++ b/benchmark/mmmu/bench_sglang.py
@@ -15,7 +15,7 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import Any, List, Tuple
+from typing import Any, List, Optional, Tuple
 
 import aiohttp
 import openai
@@ -73,7 +73,7 @@ def _get_prefix_suffix(prompt: str) -> Tuple[str, str]:
 
 
 async def process_sample(
-    client: Any, sample: dict, sampling_params: dict
+    client: Any, sample: dict, sampling_params: dict, lora_path: Optional[str] = None
 ) -> Tuple[dict, str]:
     """Send a single sample to the LLM and return (sample, response)."""
     prompt = sample["final_input_prompt"]
@@ -81,6 +81,7 @@ async def process_sample(
     image = sample["image"]
     assert image is not None
     image_path = sample["image_path"]
+    extra_body = None if lora_path is None else {"lora_path": lora_path}
     response = await client.chat.completions.create(
         model="default",
         messages=[
@@ -96,16 +97,21 @@ async def process_sample(
         temperature=0,
         max_completion_tokens=sampling_params["max_new_tokens"],
         max_tokens=sampling_params["max_new_tokens"],
+        extra_body=extra_body,
     )
     return sample, response.choices[0].message.content
 
 
 async def process_sample_with_semaphore(
-    semaphore: asyncio.Semaphore, client: Any, sample: dict, sampling_params: dict
+    semaphore: asyncio.Semaphore,
+    client: Any,
+    sample: dict,
+    sampling_params: dict,
+    lora_path: Optional[str] = None,
 ) -> Tuple[dict, str]:
     """Wrap process_sample with a semaphore for concurrency control."""
     async with semaphore:
-        return await process_sample(client, sample, sampling_params)
+        return await process_sample(client, sample, sampling_params, lora_path)
 
 
 async def eval_mmmu(args) -> None:
@@ -113,6 +119,7 @@ async def eval_mmmu(args) -> None:
     eval_args = EvalArgs.from_cli_args(args)
     sampling_params = get_sampling_params(eval_args)
     samples = prepare_samples(eval_args)
+    lora_path = eval_args.lora_path
     answer_dict = {}
     out_samples = {}
     client = openai.AsyncOpenAI(
@@ -133,7 +140,9 @@ async def eval_mmmu(args) -> None:
         samples = samples[: args.profile_number]
 
     tasks = [
-        process_sample_with_semaphore(semaphore, client, sample, sampling_params)
+        process_sample_with_semaphore(
+            semaphore, client, sample, sampling_params, lora_path
+        )
         for sample in samples
     ]
diff --git a/benchmark/mmmu/eval_utils.py b/benchmark/mmmu/eval_utils.py
index a0960f9e0..48b9af4b1 100644
--- a/benchmark/mmmu/eval_utils.py
+++ b/benchmark/mmmu/eval_utils.py
@@ -36,17 +36,22 @@ class EvalArgs:
     profile: bool = False
     profile_number: int = 5
     concurrency: int = 1
+    lora_path: Optional[str] = None
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         parser.add_argument(
-            "--result-filename", type=str, default=EvalArgs.result_filename
+            "--result-filename",
+            type=str,
+            default=EvalArgs.result_filename,
+            help="The filename to save the evaluation results.",
         )
         parser.add_argument(
-            "--image-pixels-limit", type=int, default=EvalArgs.image_pixels_limit
+            "--image-pixels-limit",
+            type=int,
+            default=EvalArgs.image_pixels_limit,
+            help="The maximum number of pixels allowed for an image. If an image exceeds this limit, it will be skipped during evaluation.",
         )
         parser.add_argument(
             "--dataset-path",
             type=str,
@@ -59,7 +64,12 @@ class EvalArgs:
             type=str,
             help="The path to the prompt format of mmmu. If not, a default format llava_config.yaml will be used",
         )
-        parser.add_argument("--split", type=str, default=EvalArgs.split)
+        parser.add_argument(
+            "--split",
+            type=str,
+            default=EvalArgs.split,
+            help='Split of the dataset to use for evaluation. Default is "validation".',
+        )
         parser.add_argument(
             "--extra-request-body",
             metavar='{"key1": "value1", "key2": "value2"}',
@@ -72,9 +82,23 @@ class EvalArgs:
             "--profile", action="store_true", help="enable mmmu profile"
         )
         parser.add_argument(
-            "--profile-number", type=int, default=EvalArgs.profile_number
+            "--profile-number",
+            type=int,
+            default=EvalArgs.profile_number,
+            help="Number of samples to profile when --profile is enabled. Default is 5.",
+        )
+        parser.add_argument(
+            "--concurrency",
+            type=int,
+            default=EvalArgs.concurrency,
+            help="Number of concurrent requests to make during evaluation. Default is 1, which means no concurrency.",
+        )
+        parser.add_argument(
+            "--lora-path",
+            type=str,
+            default=EvalArgs.lora_path,
+            help="The LoRA adapter to apply during evaluation. If set, the value is sent in the body of every request as `lora_path`.",
         )
-        parser.add_argument("--concurrency", type=int, default=EvalArgs.concurrency)
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):