chore: enhance bench_serving for vlms with a new dataset of configurable image count and resolution (#9583)

Co-authored-by: yhyang201 <yhyang201@gmail.com>
2025-08-27 08:42:54 +08:00
parent a530b3ffdc
commit 16a6d21b95
1 changed files with 186 additions and 14 deletions
--- a/python/sglang/bench_serving.py
+++ b/python/sglang/bench_serving.py
@@ -12,6 +12,8 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
 import argparse
 import asyncio
 import base64
 import io
 import json
 import os
 import pickle
@@ -71,7 +73,7 @@ class RequestFuncInput:
    output_len: int
    model: str
    lora_name: str
-    image_data: str
+    image_data: Optional[List[str]]
    extra_request_body: Dict[str, Any]
@@ -289,16 +291,19 @@ async def async_request_openai_chat_completions(
    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
    if request_func_input.image_data:
        # Build multi-image content: a list of image_url entries followed by the text
        content_items = [
            {
                "type": "image_url",
                "image_url": {"url": img_url},
            }
            for img_url in request_func_input.image_data
        ]
        content_items.append({"type": "text", "text": request_func_input.prompt})
        messages = [
            {
                "role": "user",
-                "content": [
+                "content": content_items,
                    {
                        "type": "image_url",
                        "image_url": {"url": request_func_input.image_data},
                    },
                    {"type": "text", "text": request_func_input.prompt},
                ],
            },
        ]
    else:
@@ -497,7 +502,7 @@ async def async_request_sglang_generate(
            **request_func_input.extra_request_body,
        }
-        # Add image data if available
+        # Add image data if available (list of image urls/base64)
        if request_func_input.image_data:
            payload["image_data"] = request_func_input.image_data
@@ -648,7 +653,7 @@ def get_dataset(args, tokenizer):
            prompt_suffix=args.prompt_suffix,
            apply_chat_template=args.apply_chat_template,
        )
-    elif args.dataset_name.startswith("random"):
+    elif args.dataset_name.startswith("random") and args.dataset_name != "random-image":
        input_requests = sample_random_requests(
            input_len=args.random_input_len,
            output_len=args.random_output_len,
@@ -659,6 +664,18 @@ def get_dataset(args, tokenizer):
            random_sample=args.dataset_name == "random",
            return_text=not tokenize_prompt,
        )
    elif args.dataset_name == "random-image":
        assert not tokenize_prompt, "random-image does not support --tokenize-prompt"
        input_requests = sample_random_image_requests(
            num_requests=args.num_prompts,
            num_images=args.random_image_num_images,
            input_len=args.random_input_len,
            output_len=args.random_output_len,
            range_ratio=args.random_range_ratio,
            tokenizer=tokenizer,
            apply_chat_template=args.apply_chat_template,
            image_resolution=args.random_image_resolution,
        )
    elif args.dataset_name == "generated-shared-prefix":
        assert not tokenize_prompt
        input_requests = sample_generated_shared_prefix_requests(
@@ -790,7 +807,7 @@ class DatasetRow:
    prompt: str
    prompt_len: int
    output_len: int
-    image_data: Optional[str] = None
+    image_data: Optional[List[str]] = None
 def sample_mmmu_requests(
@@ -913,7 +930,7 @@ def sample_mmmu_requests(
                        prompt=prompt,
                        prompt_len=prompt_len,
                        output_len=output_len,
-                        image_data=image_data,
+                        image_data=[image_data],
                    )
                )
@@ -1113,6 +1130,132 @@ def sample_random_requests(
    return input_requests
 def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
    """Parse an image resolution string into ``(width, height)``.

    Accepts the presets ``4k``, ``1080p``, ``720p`` and ``360p``, or a custom
    ``heightxwidth`` value (e.g. ``1080x1920`` means height=1080, width=1920).
    Surrounding whitespace and letter case are ignored for both forms, so
    ``" 4K "`` and ``"1080X1920"`` are accepted as well.

    Args:
        image_resolution: Preset name or custom ``heightxwidth`` string.

    Returns:
        The resolution as a ``(width, height)`` tuple of positive integers.

    Raises:
        ValueError: If the value is neither a known preset nor a positive
            ``heightxwidth`` pair.
    """
    resolution_to_size = {
        "4k": (3840, 2160),
        "1080p": (1920, 1080),
        "720p": (1280, 720),
        "360p": (640, 360),
    }

    # Normalize once up front so presets and custom sizes accept the same
    # spellings (previously only the custom form was stripped/lower-cased).
    res = image_resolution.strip().lower()
    preset = resolution_to_size.get(res)
    if preset is not None:
        return preset

    # partition() yields a non-decimal right-hand side when more than one "x"
    # is present, so inputs such as "1x2x3" are rejected below.
    height_str, sep, width_str = res.partition("x")
    # isdecimal() only admits characters that int() can parse; isdigit() also
    # accepts e.g. superscript digits, which would make int() raise instead.
    if sep and height_str.isdecimal() and width_str.isdecimal():
        height, width = int(height_str), int(width_str)
        if height > 0 and width > 0:
            return (width, height)

    raise ValueError(
        f"Unsupported random-image resolution: {image_resolution}. "
        "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
    )
 def sample_random_image_requests(
    num_requests: int,
    num_images: int,
    input_len: int,
    output_len: int,
    range_ratio: float,
    tokenizer: PreTrainedTokenizerBase,
    apply_chat_template: bool = True,
    image_resolution: str = "1080p",
 ) -> List[DatasetRow]:
    """Generate benchmark requests that each carry random images.

    - Each request includes ``num_images`` freshly generated random images,
      encoded as JPEG ``data:`` URIs.
    - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720),
      360p (640x360), or custom 'heightxwidth' (e.g., 1080x1920).
    - Text lengths are sampled uniformly from
      ``[max(int(input_len * range_ratio), 1), input_len]`` for the input and
      ``[int(output_len * range_ratio), output_len]`` for the output.
    - ``prompt_len`` is the token count of the final prompt string (after the
      chat template, when one is applied). The image payloads are returned
      separately in ``image_data``.

    Args:
        num_requests: Number of requests to generate.
        num_images: Number of random images attached to every request.
        input_len: Upper bound of the sampled text prompt length in tokens.
        output_len: Upper bound of the sampled output length in tokens.
        range_ratio: Ratio of the upper bounds that sets the lower sampling bounds.
        tokenizer: Tokenizer used to build the prompt and to count its tokens.
        apply_chat_template: Whether to wrap images and text in the chat template.
        image_resolution: Preset or 'heightxwidth' string for the generated images.

    Returns:
        One ``DatasetRow`` per request.

    Raises:
        ImportError: If Pillow or pybase64 is not installed.
        ValueError: If ``image_resolution`` cannot be parsed.
    """
    try:
        import pybase64
        from PIL import Image
    except ImportError as e:
        # Either of the two packages may be the missing one, so name both.
        raise ImportError(
            "Please install Pillow and pybase64 to generate random images: "
            "pip install pillow pybase64"
        ) from e

    # Parse resolution (supports presets and 'heightxwidth')
    width, height = parse_random_image_resolution(image_resolution)

    # Check for potentially problematic combinations and warn user
    if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
        warnings.warn(
            f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
            f"may take a long time. Consider reducing resolution or image count.",
            UserWarning,
            stacklevel=2,
        )

    # Sample text lengths
    input_lens = np.random.randint(
        max(int(input_len * range_ratio), 1), input_len + 1, size=num_requests
    )
    output_lens = np.random.randint(
        int(output_len * range_ratio), output_len + 1, size=num_requests
    )

    def _gen_random_image_data_uri() -> str:
        """Return one random ``height`` x ``width`` RGB image as a JPEG data URI."""
        arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
        # An (H, W, 3) uint8 array is inferred as RGB; the explicit ``mode``
        # argument is deprecated in recent Pillow releases.
        img = Image.fromarray(arr)
        buf = io.BytesIO()
        img.save(buf, format="JPEG", quality=85)
        encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
        return f"data:image/jpeg;base64,{encoded}"

    dataset: List[DatasetRow] = []
    for i in range(num_requests):
        # Generate text prompt
        text_prompt = gen_prompt(tokenizer, int(input_lens[i]))

        # Generate image list
        images = [_gen_random_image_data_uri() for _ in range(num_images)]

        prompt_str = text_prompt
        if apply_chat_template:
            try:
                content_items = [
                    {"type": "image_url", "image_url": {"url": img_url}}
                    for img_url in images
                ]
                content_items.append({"type": "text", "text": text_prompt})
                prompt_str = tokenizer.apply_chat_template(
                    [{"role": "user", "content": content_items}],
                    add_generation_prompt=True,
                    tokenize=False,
                )
            except Exception:
                # Deliberate best-effort: some tokenizers do not support list
                # content; fall back to a placeholder in the text.
                prompt_str = f"<image>{text_prompt}"

        # Token count of the final prompt string (template included).
        prompt_token_len = len(tokenizer.encode(prompt_str))

        dataset.append(
            DatasetRow(
                prompt=prompt_str,
                prompt_len=prompt_token_len,
                output_len=int(output_lens[i]),
                image_data=images,
            )
        )

    print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
    print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
    return dataset
 def gen_prompt(tokenizer, token_num):
    """Generate a random prompt of specified token length using tokenizer vocabulary."""
    all_available_tokens = list(tokenizer.get_vocab().values())
@@ -1579,7 +1722,13 @@ async def benchmark(
        output_file_name = args.output_file
    else:
        now = datetime.now().strftime("%m%d")
-        if args.dataset_name.startswith("random"):
+        if args.dataset_name == "random-image":
            output_file_name = (
                f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
                f"{args.random_output_len}_{args.random_image_num_images}imgs_"
                f"{args.random_image_resolution}.jsonl"
            )
        elif args.dataset_name.startswith("random"):
            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
        else:
            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
@@ -1819,7 +1968,14 @@ if __name__ == "__main__":
        "--dataset-name",
        type=str,
        default="sharegpt",
-        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix", "mmmu"],
+        choices=[
            "sharegpt",
            "random",
            "random-ids",
            "generated-shared-prefix",
            "mmmu",
            "random-image",
        ],
        help="Name of the dataset to benchmark on.",
    )
    parser.add_argument(
@@ -1872,6 +2028,22 @@ if __name__ == "__main__":
        help="Range of sampled ratio of input/output length, "
        "used only for random dataset.",
    )
    # random-image dataset args
    parser.add_argument(
        "--random-image-num-images",
        type=int,
        default=1,
        help="Number of images per request (only available with the random-image dataset)",
    )
    parser.add_argument(
        "--random-image-resolution",
        type=str,
        default="1080p",
        help=(
            "Resolution of random images for random-image dataset. "
            "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
        ),
    )
    parser.add_argument(
        "--request-rate",
        type=float,