diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py
index f5578ef9f..db213a9e3 100644
--- a/python/sglang/bench_serving.py
+++ b/python/sglang/bench_serving.py
@@ -388,7 +388,6 @@ async def async_request_sglang_generate(
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
- # print(chunk_bytes)
chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
latency = time.perf_counter() - st
@@ -655,6 +654,7 @@ class DatasetRow:
prompt: str
prompt_len: int
output_len: int
+ image_data: Optional[str] = None
def sample_mmmu_requests(
@@ -730,42 +730,50 @@ def sample_mmmu_requests(
buffered = io.BytesIO()
image.save(buffered, format="JPEG")
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
- image_path = f"data:image/jpeg;base64,{img_str}"
+ image_data = f"data:image/jpeg;base64,{img_str}"
else:
continue
# Extract the question
question = example.get("question")
- # Create the prompt with image, question
+ # Construct the prompt
prompt = f"Question: {question}\n\nAnswer: "
- prompt = tokenizer.apply_chat_template(
- [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": image_path}},
- {"type": "text", "text": prompt},
- ],
- }
- ],
- add_generation_prompt=True,
- tokenize=False,
- )
- prompt = f"{image_path}{prompt}"
- # Calculate token lengths
- # Note: This is approximate since we're not rendering the actual image tokens
+ try:
+ prompt = tokenizer.apply_chat_template(
+ [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image_url",
+ "image_url": {"url": image_data},
+ },
+ {"type": "text", "text": prompt},
+ ],
+ }
+ ],
+ add_generation_prompt=True,
+ tokenize=False,
+ )
+ except Exception as e:
+ # Note (Xinyuan): This is a workaround for tokenizers whose chat template does not support message content given as a list (e.g. InternVL).
+ print(f"Error applying chat template: {e}, fallback to tag")
+ prompt = f"{prompt}"
+
+ # Calculate token lengths for text only (without image data)
prompt_token_ids = tokenizer.encode(prompt)
- prompt_len = (
- len(prompt_token_ids) + 512
- ) # Add estimate for image tokens
+ prompt_len = len(prompt_token_ids)
output_len = fixed_output_len if fixed_output_len is not None else 256
filtered_dataset.append(
DatasetRow(
- prompt=prompt, prompt_len=prompt_len, output_len=output_len
+ prompt=prompt,
+ prompt_len=prompt_len,
+ output_len=output_len,
+ image_data=image_data,
)
)
@@ -1199,34 +1207,21 @@ async def benchmark(
# Use the first request for all warmup iterations
test_request = input_requests[0]
- test_prompt, test_prompt_len, test_output_len = (
- test_request.prompt,
- test_request.prompt_len,
- test_request.output_len,
- )
+
if lora_names is not None and len(lora_names) != 0:
lora_name = lora_names[0]
else:
lora_name = None
- if "" in test_prompt:
- import re
-
- image_match = re.search(r"(.*?)(.*)", test_prompt, re.DOTALL)
- image_data = image_match.group(1) if image_match else None
- test_prompt = image_match.group(2) if image_match else test_prompt
- else:
- image_data = None
-
# Create the test input once
test_input = RequestFuncInput(
model=model_id,
- prompt=test_prompt,
+ prompt=test_request.prompt,
api_url=api_url,
- prompt_len=test_prompt_len,
- output_len=min(test_output_len, 32),
+ prompt_len=test_request.prompt_len,
+ output_len=min(test_request.output_len, 32),
lora_name=lora_name,
- image_data=image_data,
+ image_data=test_request.image_data,
extra_request_body=extra_request_body,
)
@@ -1271,36 +1266,23 @@ async def benchmark(
benchmark_start_time = time.perf_counter()
tasks: List[asyncio.Task] = []
async for request in get_request(input_requests, request_rate):
- prompt, prompt_len, output_len = (
- request.prompt,
- request.prompt_len,
- request.output_len,
- )
if lora_names is not None and len(lora_names) != 0:
idx = random.randint(0, len(lora_names) - 1)
lora_name = lora_names[idx]
else:
lora_name = None
- if "" in prompt:
- import re
-
- image_match = re.search(r"(.*?)(.*)", prompt, re.DOTALL)
- image_data = image_match.group(1) if image_match else None
- prompt = image_match.group(2) if image_match else prompt
- else:
- image_data = None
-
request_func_input = RequestFuncInput(
model=model_id,
- prompt=prompt,
+ prompt=request.prompt,
api_url=api_url,
- prompt_len=prompt_len,
- output_len=output_len,
+ prompt_len=request.prompt_len,
+ output_len=request.output_len,
lora_name=lora_name,
- image_data=image_data,
+ image_data=request.image_data,
extra_request_body=extra_request_body,
)
+
tasks.append(
asyncio.create_task(
limited_request_func(request_func_input=request_func_input, pbar=pbar)
diff --git a/python/sglang/srt/managers/multimodal_processors/internvl.py b/python/sglang/srt/managers/multimodal_processors/internvl.py
index 6d7c14c4f..21b4c36c7 100644
--- a/python/sglang/srt/managers/multimodal_processors/internvl.py
+++ b/python/sglang/srt/managers/multimodal_processors/internvl.py
@@ -175,6 +175,10 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
if not image_data:
return None
+ # Ensure image_data is a list
+ if isinstance(image_data, str):
+ image_data = [image_data]
+
base_output = self.load_mm_data(
prompt=input_text,
image_data=image_data,