"""
Bench the sglang-hosted vLM with benchmark MMMU
Usage:
Host the VLM: python -m sglang.launch_server --model-path Qwen/Qwen2-VL-7B-Instruct --port 30000
Benchmark: python benchmark/mmmu/bench_sglang.py --port 30000 --concurrency 16
The eval output will be logged
"""

import argparse
import asyncio
import re
import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import Any, List, Optional, Tuple

import aiohttp
import openai
from data_utils import save_json
from eval_utils import (
    EvalArgs,
    eval_result,
    get_sampling_params,
    prepare_samples,
    process_result,
)
from tqdm import tqdm

from sglang.test.test_utils import add_common_sglang_args_and_parse

# Generous timeout: a full MMMU run against a loaded server can take hours.
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=20 * 60 * 60)


@dataclass
class RequestFuncOutput:
    generated_text: List[str] = field(default_factory=list)
    prompt_len: List[int] = field(default_factory=list)
    output_len: List[int] = field(default_factory=list)
    latency: List[float] = field(default_factory=list)
    ttft: List[float] = field(default_factory=list)  # Time to first token
    itl: List[float] = field(default_factory=list)  # List of inter-token latencies
    success: bool = False
    error: str = ""


async def async_request_profile(api_url: str) -> RequestFuncOutput:
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        output = RequestFuncOutput()
        try:
            async with session.post(url=api_url) as response:
                if response.status == 200:
                    output.success = True
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))
    return output
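

# A minimal usage sketch for the profiler helper, assuming an SGLang server on
# port 30000 (these are the same endpoints eval_mmmu hits below):
#     out = await async_request_profile("http://127.0.0.1:30000/start_profile")
#     ... issue some requests ...
#     out = await async_request_profile("http://127.0.0.1:30000/stop_profile")
#     if not out.success:
#         print(out.error)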


def _get_prefix_suffix(prompt: str) -> Tuple[str, str]:
    """Split the prompt into the text before and after the image placeholder."""
    prefix = prompt.split("<")[0]
    suffix = prompt.split(">", 1)[1]
    return prefix, suffix
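

# A quick illustration of _get_prefix_suffix, assuming the prompt contains a
# single "<image 1>"-style placeholder (the format used for MMMU samples):
#     _get_prefix_suffix("Look at the chart. <image 1> What trend is shown?")
#     -> ("Look at the chart. ", " What trend is shown?")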


async def process_sample(
    client: Any, sample: dict, sampling_params: dict, lora_path: Optional[str] = None
) -> Tuple[dict, str]:
    """Send a single sample to the VLM and return (sample, response)."""
    prompt = sample["final_input_prompt"]
    prefix, suffix = _get_prefix_suffix(prompt)
    image = sample["image"]
    assert image is not None
    image_path = sample["image_path"]
    extra_body = None if lora_path is None else {"lora_path": lora_path}
    response = await client.chat.completions.create(
        model="default",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prefix},
                    {"type": "image_url", "image_url": {"url": image_path}},
                    {"type": "text", "text": suffix},
                ],
            }
        ],
        temperature=0,
        # Send both the new and the deprecated parameter name so either is honored.
        max_completion_tokens=sampling_params["max_new_tokens"],
        max_tokens=sampling_params["max_new_tokens"],
        extra_body=extra_body,
    )
    return sample, response.choices[0].message.content


async def process_sample_with_semaphore(
    semaphore: asyncio.Semaphore,
    client: Any,
    sample: dict,
    sampling_params: dict,
    lora_path: Optional[str] = None,
) -> Tuple[dict, str]:
    """Wrap process_sample with a semaphore for concurrency control."""
    async with semaphore:
        return await process_sample(client, sample, sampling_params, lora_path)
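

# process_sample_with_semaphore is the unit fanned out in eval_mmmu: all tasks
# are created up front, and the shared semaphore caps how many hold an
# in-flight request at once (args.concurrency of them).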
async def eval_mmmu(args) -> None:
    """Main evaluation loop with concurrency control."""
    eval_args = EvalArgs.from_cli_args(args)
    sampling_params = get_sampling_params(eval_args)
    samples = prepare_samples(eval_args)
    lora_path = eval_args.lora_path
    answer_dict = {}
    out_samples = {}
    client = openai.AsyncOpenAI(
        api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1"
    )
    start = time.perf_counter()
    base_url = f"http://127.0.0.1:{args.port}"

    if args.profile:
        print("Starting profiler...")
        profile_output = await async_request_profile(
            api_url=f"{base_url}/start_profile"
        )
        if profile_output.success:
            print("Profiler started")
        # Profile only the first few samples to keep traces small.
        samples = samples[: args.profile_number]

    if args.concurrency == 1:
        # For concurrency == 1, run sequentially to ensure a consistent order;
        # this is mainly useful for profiling.
        for sample in tqdm(samples):
            _, response = await process_sample(
                client, sample, sampling_params, lora_path
            )
            # --response-answer-regex must contain one capture group; when it
            # matches, group(1) is taken as the model's answer.
            answer = (
                re.search(args.response_answer_regex, response)
                if response is not None
                else None
            )
            process_result(
                answer.group(1) if answer else response,
                sample,
                answer_dict,
                out_samples,
            )
    else:
        semaphore = asyncio.Semaphore(args.concurrency)
        tasks = [
            process_sample_with_semaphore(
                semaphore, client, sample, sampling_params, lora_path
            )
            for sample in samples
        ]
        # as_completed yields results in completion order, which is why each
        # coroutine returns its sample alongside the response.
        for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
            sample, response = await coro
            answer = (
                re.search(args.response_answer_regex, response)
                if response is not None
                else None
            )
            process_result(
                answer.group(1) if answer else response,
                sample,
                answer_dict,
                out_samples,
            )

    if args.profile:
        print("Stopping profiler...")
        profile_output = await async_request_profile(api_url=f"{base_url}/stop_profile")
        if profile_output.success:
            print("Profiler stopped")

    print(f"Benchmark time: {time.perf_counter() - start:.2f} s")

    args.output_path = "./answer_sglang.json"
    save_json(args.output_path, out_samples)
    eval_result(
        model_answer_path=args.output_path,
        answer_dict=answer_dict,
        eval_output_path="./val_sglang.json",
    )


def parse_args():
    parser = argparse.ArgumentParser()
    EvalArgs.add_cli_args(parser)
    args = add_common_sglang_args_and_parse(parser)
    return args


def main():
    args = parse_args()
    asyncio.run(eval_mmmu(args))


if __name__ == "__main__":
    main()