ci: improve nightly-ci (#11385)
diff --git a/python/sglang/bench_one_batch_server.py b/python/sglang/bench_one_batch_server.py
@@ -25,8 +25,10 @@ from typing import List, Optional, Tuple
 import numpy as np
 import requests
 from pydantic import BaseModel
+from transformers import AutoProcessor, PreTrainedTokenizer
 
 from sglang.bench_serving import (
+    get_processor,
     get_tokenizer,
     sample_mmmu_requests,
     sample_random_requests,
@@ -104,8 +106,14 @@ Note: To view the traces through perfetto-ui, please:
         if self.profile_links.extend or self.profile_links.decode:
             # Create a combined link or use the first available one
             trace_files = [self.profile_links.extend, self.profile_links.decode]
+            if any(trace_file is None for trace_file in trace_files):
+                logger.error("Some trace files are None", f"{trace_files=}")
             trace_files_relay_links = [
-                f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
+                (
+                    f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
+                    if trace_file
+                    else "N/A"
+                )
                 for trace_file in trace_files
             ]
 
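With this hunk a missing extend or decode trace renders as an "N/A" cell instead of a broken markdown link. For orientation, a minimal sketch of what a relay-link helper like get_perfetto_relay_link_from_trace_file could look like; the query-parameter scheme is an assumption for illustration, not the implementation this commit ships:

from urllib.parse import quote

# Hypothetical sketch; the real helper lives alongside this code and may
# encode the trace location differently.
RELAY_BASE = "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html"

def get_perfetto_relay_link_from_trace_file(trace_file: str) -> str:
    # Percent-encode the trace path so it survives as a single query parameter.
    return f"{RELAY_BASE}?url={quote(trace_file, safe='')}"
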
@@ -114,30 +122,31 @@ Note: To view the traces through perfetto-ui, please:
         # Build the row
         return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"
 
-def generate_markdown_report(trace_dir, results: List["BenchmarkResult"]) -> str:
-    """Generate a markdown report from a list of BenchmarkResult object from a single run."""
-    import os
-
-    summary = f"### {results[0].model_path}\n"
-
-    # summary += (
-    #     f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
-    # )
-    summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
-    summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
-
-    # all results should share the same isl & osl
-    for result in results:
-        base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
-        relay_base = os.getenv("PERFETTO_RELAY_URL", "").rstrip("/")
-        relay_base = "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html"
-        # base_url = "https://github.com/sgl-project/ci-data/traces"
-        summary += result.to_markdown_row(trace_dir, base_url, relay_base)
-
-    return summary
+    @classmethod
+    def generate_markdown_report(
+        cls, trace_dir, results: List["BenchmarkResult"]
+    ) -> str:
+        """Generate a markdown report from a list of BenchmarkResult object from a single run."""
+        import os
+
+        summary = f"### {results[0].model_path}\n"
+
+        # summary += (
+        #     f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
+        # )
+        summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
+        summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
+
+        # all results should share the same isl & osl
+        for result in results:
+            base_url = os.getenv(
+                "TRACE_BASE_URL", "https://github.com/sgl-project/ci-data/traces"
+            ).rstrip("/")
+            relay_base = os.getenv(
+                "PERFETTO_RELAY_URL",
+                "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html",
+            ).rstrip("/")
+            summary += result.to_markdown_row(trace_dir, base_url, relay_base)
+
+        return summary
 
 
 @dataclasses.dataclass
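
The rewritten classmethod bakes the CI defaults into os.getenv fallbacks, so a fork can repoint the trace and relay links purely through the environment. A usage sketch, assuming the enclosing class is BenchmarkResult as the type hints suggest (the mirror URLs and the load_results helper are placeholders):

import os

from sglang.bench_one_batch_server import BenchmarkResult

# Placeholder mirrors; CI simply relies on the defaults baked into the method.
os.environ["TRACE_BASE_URL"] = "https://example.com/my-ci-traces"
os.environ["PERFETTO_RELAY_URL"] = "https://example.com/pages/perfetto_relay.html"

results = load_results()  # hypothetical helper returning List[BenchmarkResult]
print(BenchmarkResult.generate_markdown_report("/tmp/traces", results))
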
@@ -288,7 +297,7 @@ def run_one_case(
     input_len_step_percentage: float,
     run_name: str,
     result_filename: str,
-    tokenizer,
+    tokenizer: PreTrainedTokenizer | AutoProcessor,
     dataset_name="",
     profile: bool = False,
     profile_steps: int = 3,
@@ -302,9 +311,8 @@ def run_one_case(
     if dataset_name == "mmmu":
         input_requests = sample_mmmu_requests(
             num_requests=batch_size,
-            tokenizer=tokenizer,
+            processor=tokenizer,
             fixed_output_len=output_len,
-            apply_chat_template=True,
             random_sample=False,
         )
     elif dataset_name == "random":
@@ -364,6 +372,8 @@ def run_one_case(
     if dataset_name == "mmmu":
         # vlm
         input_ids = []
+        # for vlms, tokenizer is an instance of AutoProcessor
+        tokenizer = tokenizer.tokenizer
         for input_req in input_requests:
            input_ids += [tokenizer.encode(input_req.prompt)]
         payload["image_data"] = [req.image_data for req in input_requests]
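
The added tokenizer = tokenizer.tokenizer line leans on the fact that a Hugging Face AutoProcessor for a vision-language model bundles an image processor with a text tokenizer and exposes the latter as .tokenizer. A small self-contained sketch (the checkpoint name is only an example):

from transformers import AutoProcessor

# Any processor-backed VLM checkpoint behaves the same way.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# The processor couples image preprocessing with a text tokenizer; token
# counting and .encode() need the inner tokenizer object.
input_ids = processor.tokenizer.encode("Describe this image.")
print(len(input_ids))
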
@@ -609,7 +619,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
         tokenizer_path = server_info["tokenizer_path"]
     elif "prefill" in server_info:
         tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
-    tokenizer = get_tokenizer(tokenizer_path)
+
+    if bench_args.dataset_name == "mmmu":
+        # mmmu implies this is a MLLM
+        tokenizer = get_processor(tokenizer_path)
+    else:
+        tokenizer = get_tokenizer(tokenizer_path)
 
     # warmup
     if not bench_args.skip_warmup:
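run_benchmark now chooses the loader by dataset: "mmmu" implies a multimodal model and therefore a full AutoProcessor, while text-only benchmarks keep the lighter tokenizer. In isolation the branch behaves roughly like this sketch (the checkpoint name is an example; get_processor and get_tokenizer are the sglang.bench_serving helpers imported in the first hunk):

from sglang.bench_serving import get_processor, get_tokenizer

def load_tokenizer_for(dataset_name: str, tokenizer_path: str):
    if dataset_name == "mmmu":
        return get_processor(tokenizer_path)  # AutoProcessor for VLMs
    return get_tokenizer(tokenizer_path)  # plain text tokenizer

tok = load_tokenizer_for("mmmu", "Qwen/Qwen2-VL-2B-Instruct")  # example checkpoint

Returning either object under one name is what makes run_one_case's widened annotation, PreTrainedTokenizer | AutoProcessor, necessary.
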
diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py
@@ -12,7 +12,6 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
-
 import argparse
 import asyncio
 import base64
 import io
 import json
 import os
@@ -671,7 +670,7 @@ def get_processor(
     if pretrained_model_name_or_path.endswith(
         ".json"
     ) or pretrained_model_name_or_path.endswith(".model"):
-        from sglang.srt.hf_transformers_utils import get_processor
+        from sglang.srt.utils.hf_transformers_utils import get_processor
 
         return get_processor(pretrained_model_name_or_path)
 
@@ -935,7 +934,7 @@ async def get_mooncake_request_over_time(
     for i in range(num_rounds):
         # Add user query for the current round
         chat_history.append(
-            {"role": "user", "content": f"Round {i+1}: {user_query_base}"}
+            {"role": "user", "content": f"Round {i + 1}: {user_query_base}"}
         )
 
         # Form the full prompt from history
@@ -964,7 +963,7 @@ async def get_mooncake_request_over_time(
 
 def sample_mmmu_requests(
     num_requests: int,
-    processor: AutoProcessor,
+    processor: AutoProcessor | AutoTokenizer,
     fixed_output_len: Optional[int] = None,
     random_sample: bool = True,
 ) -> List[DatasetRow]:
@@ -973,9 +972,7 @@ def sample_mmmu_requests(
 
     Args:
         num_requests: Number of requests to sample.
-        tokenizer: Tokenizer to use for token counting.
         fixed_output_len: If provided, use this fixed output length for all requests.
-        apply_chat_template: Whether to apply the chat template to the prompt.
         random_sample: Whether to randomly sample or take the first N.
 
     Returns:
@@ -1282,11 +1279,11 @@ def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
     )
 
 
-def create_mm_data_row(text_prompt, images, images_base64, output_len, processor):
+def create_mm_data_row(text_prompt, images: list, images_base64, output_len, processor):
     try:
         content_items = [
-            {"type": "image_url", "image_url": {"url": img_url}}
-            for img_url in images_base64
+            {"type": "image", "image": {"url": image_base64}}
+            for image_base64 in images_base64
         ]
         content_items.append({"type": "text", "text": text_prompt})
         prompt_str = processor.apply_chat_template(
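
The content items move from the OpenAI-style "image_url" shape to the {"type": "image", ...} shape that transformers chat templates consume. A self-contained sketch of the message structure the new code builds (checkpoint and data URL are placeholders):

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")  # example VLM

image_base64 = "data:image/jpeg;base64,..."  # placeholder data URL
content_items = [
    {"type": "image", "image": {"url": image_base64}},
    {"type": "text", "text": "What is in this picture?"},
]
prompt_str = processor.apply_chat_template(
    [{"role": "user", "content": content_items}],
    add_generation_prompt=True,
    tokenize=False,
)
print(prompt_str)
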
@@ -1294,7 +1291,9 @@ def create_mm_data_row(text_prompt, images, images_base64, output_len, processor
             add_generation_prompt=True,
             tokenize=False,
         )
-    except Exception:
-        # Some tokenizers do not support list content; fall back to a placeholder in the text
+    except Exception as e:
+        # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
+        print(f"Error applying chat template: {e}, fallback to <image> tag")
         prompt_str = f"<image>{text_prompt}"
 
@@ -1425,7 +1424,7 @@ def sample_image_requests(
     print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
     print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
     print(
-        f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes//num_requests} bytes per request"
+        f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes // num_requests} bytes per request"
     )
     return dataset