ci: improve nightly-ci (#11385)

This commit is contained in:
Mick
2025-10-13 12:19:34 +08:00
committed by GitHub
parent a55cf5304a
commit 0c0779d667
6 changed files with 76 additions and 54 deletions

View File

@@ -62,7 +62,7 @@ jobs:
nightly-test-eval-vlms: nightly-test-eval-vlms:
if: github.repository == 'sgl-project/sglang' if: github.repository == 'sgl-project/sglang'
runs-on: 1-gpu-runner runs-on: 2-gpu-runner
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@v4 uses: actions/checkout@v4
@@ -79,7 +79,7 @@ jobs:
nightly-test-perf-vlms: nightly-test-perf-vlms:
if: github.repository == 'sgl-project/sglang' if: github.repository == 'sgl-project/sglang'
runs-on: 1-gpu-runner runs-on: 2-gpu-runner
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@v4 uses: actions/checkout@v4

View File

@@ -25,8 +25,10 @@ from typing import List, Optional, Tuple
import numpy as np import numpy as np
import requests import requests
from pydantic import BaseModel from pydantic import BaseModel
from transformers import AutoProcessor, PreTrainedTokenizer
from sglang.bench_serving import ( from sglang.bench_serving import (
get_processor,
get_tokenizer, get_tokenizer,
sample_mmmu_requests, sample_mmmu_requests,
sample_random_requests, sample_random_requests,
@@ -104,8 +106,14 @@ Note: To view the traces through perfetto-ui, please:
if self.profile_links.extend or self.profile_links.decode: if self.profile_links.extend or self.profile_links.decode:
# Create a combined link or use the first available one # Create a combined link or use the first available one
trace_files = [self.profile_links.extend, self.profile_links.decode] trace_files = [self.profile_links.extend, self.profile_links.decode]
if any(trace_file is None for trace_file in trace_files):
logger.error("Some trace files are None", f"{trace_files=}")
trace_files_relay_links = [ trace_files_relay_links = [
f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})" (
f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
if trace_file
else "N/A"
)
for trace_file in trace_files for trace_file in trace_files
] ]
@@ -114,30 +122,31 @@ Note: To view the traces through perfetto-ui, please:
# Build the row # Build the row
return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n" return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"
def generate_markdown_report(trace_dir, results: List["BenchmarkResult"]) -> str:
    """Generate a markdown report from the BenchmarkResult objects of a single run.

    Args:
        trace_dir: Passed through unchanged to each result's
            ``to_markdown_row`` call.
        results: Benchmark results of one run. The ``model_path`` of the first
            element is used as the section heading.

    Returns:
        A markdown heading, the table header and one row per result, or an
        empty string when ``results`` is empty.
    """
    import os

    # A run in which every benchmark case failed produces no results; return
    # an empty report instead of raising IndexError on ``results[0]``.
    if not results:
        return ""

    # The URLs do not depend on the individual result, so resolve them once.
    # Environment variables override the defaults; trailing slashes are dropped.
    base_url = os.getenv(
        "TRACE_BASE_URL", "https://github.com/sgl-project/ci-data/traces"
    ).rstrip("/")
    relay_base = os.getenv(
        "PERFETTO_RELAY_URL",
        "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html",
    ).rstrip("/")

    parts = [
        f"### {results[0].model_path}\n",
        "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n",
        "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n",
    ]
    # All results are expected to share the same input/output sequence lengths.
    parts.extend(
        result.to_markdown_row(trace_dir, base_url, relay_base) for result in results
    )
    return "".join(parts)
@dataclasses.dataclass @dataclasses.dataclass
@@ -288,7 +297,7 @@ def run_one_case(
input_len_step_percentage: float, input_len_step_percentage: float,
run_name: str, run_name: str,
result_filename: str, result_filename: str,
tokenizer, tokenizer: PreTrainedTokenizer | AutoProcessor,
dataset_name="", dataset_name="",
profile: bool = False, profile: bool = False,
profile_steps: int = 3, profile_steps: int = 3,
@@ -302,9 +311,8 @@ def run_one_case(
if dataset_name == "mmmu": if dataset_name == "mmmu":
input_requests = sample_mmmu_requests( input_requests = sample_mmmu_requests(
num_requests=batch_size, num_requests=batch_size,
tokenizer=tokenizer, processor=tokenizer,
fixed_output_len=output_len, fixed_output_len=output_len,
apply_chat_template=True,
random_sample=False, random_sample=False,
) )
elif dataset_name == "random": elif dataset_name == "random":
@@ -364,6 +372,8 @@ def run_one_case(
if dataset_name == "mmmu": if dataset_name == "mmmu":
# vlm # vlm
input_ids = [] input_ids = []
# for vlms, tokenizer is an instance of AutoProcessor
tokenizer = tokenizer.tokenizer
for input_req in input_requests: for input_req in input_requests:
input_ids += [tokenizer.encode(input_req.prompt)] input_ids += [tokenizer.encode(input_req.prompt)]
payload["image_data"] = [req.image_data for req in input_requests] payload["image_data"] = [req.image_data for req in input_requests]
@@ -609,7 +619,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
tokenizer_path = server_info["tokenizer_path"] tokenizer_path = server_info["tokenizer_path"]
elif "prefill" in server_info: elif "prefill" in server_info:
tokenizer_path = server_info["prefill"][0]["tokenizer_path"] tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
tokenizer = get_tokenizer(tokenizer_path)
if bench_args.dataset_name == "mmmu":
# mmmu implies this is a MLLM
tokenizer = get_processor(tokenizer_path)
else:
tokenizer = get_tokenizer(tokenizer_path)
# warmup # warmup
if not bench_args.skip_warmup: if not bench_args.skip_warmup:

View File

@@ -12,7 +12,6 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
import argparse import argparse
import asyncio import asyncio
import base64
import io import io
import json import json
import os import os
@@ -671,7 +670,7 @@ def get_processor(
if pretrained_model_name_or_path.endswith( if pretrained_model_name_or_path.endswith(
".json" ".json"
) or pretrained_model_name_or_path.endswith(".model"): ) or pretrained_model_name_or_path.endswith(".model"):
from sglang.srt.hf_transformers_utils import get_processor from sglang.srt.utils.hf_transformers_utils import get_processor
return get_processor(pretrained_model_name_or_path) return get_processor(pretrained_model_name_or_path)
@@ -935,7 +934,7 @@ async def get_mooncake_request_over_time(
for i in range(num_rounds): for i in range(num_rounds):
# Add user query for the current round # Add user query for the current round
chat_history.append( chat_history.append(
{"role": "user", "content": f"Round {i+1}: {user_query_base}"} {"role": "user", "content": f"Round {i + 1}: {user_query_base}"}
) )
# Form the full prompt from history # Form the full prompt from history
@@ -964,7 +963,7 @@ async def get_mooncake_request_over_time(
def sample_mmmu_requests( def sample_mmmu_requests(
num_requests: int, num_requests: int,
processor: AutoProcessor, processor: AutoProcessor | AutoTokenizer,
fixed_output_len: Optional[int] = None, fixed_output_len: Optional[int] = None,
random_sample: bool = True, random_sample: bool = True,
) -> List[DatasetRow]: ) -> List[DatasetRow]:
@@ -973,9 +972,7 @@ def sample_mmmu_requests(
Args: Args:
num_requests: Number of requests to sample. num_requests: Number of requests to sample.
tokenizer: Tokenizer to use for token counting.
fixed_output_len: If provided, use this fixed output length for all requests. fixed_output_len: If provided, use this fixed output length for all requests.
apply_chat_template: Whether to apply the chat template to the prompt.
random_sample: Whether to randomly sample or take the first N. random_sample: Whether to randomly sample or take the first N.
Returns: Returns:
@@ -1282,11 +1279,11 @@ def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
) )
def create_mm_data_row(text_prompt, images, images_base64, output_len, processor): def create_mm_data_row(text_prompt, images: list, images_base64, output_len, processor):
try: try:
content_items = [ content_items = [
{"type": "image_url", "image_url": {"url": img_url}} {"type": "image", "image": {"url": image_base64}}
for img_url in images_base64 for image_base64 in images_base64
] ]
content_items.append({"type": "text", "text": text_prompt}) content_items.append({"type": "text", "text": text_prompt})
prompt_str = processor.apply_chat_template( prompt_str = processor.apply_chat_template(
@@ -1294,7 +1291,9 @@ def create_mm_data_row(text_prompt, images, images_base64, output_len, processor
add_generation_prompt=True, add_generation_prompt=True,
tokenize=False, tokenize=False,
) )
except Exception: except Exception as e:
# Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
print(f"Error applying chat template: {e}, fallback to <image> tag")
# Some tokenizers do not support list content; fall back to a placeholder in the text # Some tokenizers do not support list content; fall back to a placeholder in the text
prompt_str = f"<image>{text_prompt}" prompt_str = f"<image>{text_prompt}"
@@ -1425,7 +1424,7 @@ def sample_image_requests(
print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}") print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}") print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
print( print(
f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes//num_requests} bytes per request" f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes // num_requests} bytes per request"
) )
return dataset return dataset

View File

@@ -3,7 +3,7 @@ import subprocess
import time import time
import unittest import unittest
from sglang.bench_one_batch_server import BenchmarkResult from sglang.bench_one_batch_server import BenchmarkResult, generate_markdown_report
from sglang.srt.utils import kill_process_tree from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import ( from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -41,7 +41,7 @@ class TestNightlyTextModelsPerformance(unittest.TestCase):
def test_bench_one_batch(self): def test_bench_one_batch(self):
all_benchmark_results = [] all_benchmark_results = []
all_model_succeed = True
for model_setup in self.models: for model_setup in self.models:
benchmark_results = [] benchmark_results = []
with self.subTest(model=model_setup.model_path): with self.subTest(model=model_setup.model_path):
@@ -113,19 +113,21 @@ class TestNightlyTextModelsPerformance(unittest.TestCase):
# Clean up JSON file # Clean up JSON file
os.remove(json_output_file) os.remove(json_output_file)
else: else:
all_model_succeed = False
print(f"Warning: JSON output file {json_output_file} not found") print(f"Warning: JSON output file {json_output_file} not found")
finally: finally:
kill_process_tree(process.pid) kill_process_tree(process.pid)
report_part = BenchmarkResult.generate_markdown_report( report_part = generate_markdown_report(PROFILE_DIR, benchmark_results)
PROFILE_DIR, benchmark_results
)
self.full_report += report_part + "\n" self.full_report += report_part + "\n"
if is_in_ci(): if is_in_ci():
write_github_step_summary(self.full_report) write_github_step_summary(self.full_report)
if not all_model_succeed:
raise AssertionError("Some models failed the perf tests.")
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()

View File

@@ -1,7 +1,6 @@
import json import json
import unittest import unittest
import warnings import warnings
from functools import partial
from types import SimpleNamespace from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree from sglang.srt.utils import kill_process_tree
@@ -26,16 +25,19 @@ MODEL_THRESHOLDS = {
"Efficient-Large-Model/NVILA-Lite-2B-hf-0626" "Efficient-Large-Model/NVILA-Lite-2B-hf-0626"
): ModelEvalMetrics(0.305, 23.8), ): ModelEvalMetrics(0.305, 23.8),
ModelLaunchSettings("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9), ModelLaunchSettings("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3), ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 17.7),
ModelLaunchSettings("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6), ModelLaunchSettings("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
ModelLaunchSettings("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics( ModelLaunchSettings("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(
0.330, 22.3 0.330, 22.3
), ),
ModelLaunchSettings("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3), ModelLaunchSettings("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5), ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.259, 36.3),
ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 14.0), ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 17.0),
ModelLaunchSettings("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3), ModelLaunchSettings("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3),
ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9), ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
ModelLaunchSettings(
"Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]
): ModelEvalMetrics(0.29, 29.1),
ModelLaunchSettings( ModelLaunchSettings(
"unsloth/Mistral-Small-3.1-24B-Instruct-2503" "unsloth/Mistral-Small-3.1-24B-Instruct-2503"
): ModelEvalMetrics(0.310, 16.7), ): ModelEvalMetrics(0.310, 16.7),

View File

@@ -3,7 +3,7 @@ import subprocess
import unittest import unittest
import warnings import warnings
from sglang.bench_one_batch_server import BenchmarkResult from sglang.bench_one_batch_server import BenchmarkResult, generate_markdown_report
from sglang.srt.utils import kill_process_tree from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import ( from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -27,6 +27,7 @@ MODEL_DEFAULTS = [
ModelLaunchSettings( ModelLaunchSettings(
"google/gemma-3-27b-it", "google/gemma-3-27b-it",
), ),
ModelLaunchSettings("Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]),
# "OpenGVLab/InternVL2_5-2B", # "OpenGVLab/InternVL2_5-2B",
# buggy in official transformers impl # buggy in official transformers impl
# "openbmb/MiniCPM-V-2_6", # "openbmb/MiniCPM-V-2_6",
@@ -45,9 +46,7 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
cls.models = [] cls.models = []
model_paths = parse_models(nightly_vlm_models_str) model_paths = parse_models(nightly_vlm_models_str)
for model_path in model_paths: for model_path in model_paths:
cls.models.append( cls.models.append(ModelLaunchSettings(model_path))
ModelLaunchSettings(model_path, extra_args=VLM_EXTRA_ARGS)
)
else: else:
cls.models = MODEL_DEFAULTS cls.models = MODEL_DEFAULTS
@@ -60,6 +59,7 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
def test_bench_one_batch(self): def test_bench_one_batch(self):
all_benchmark_results = [] all_benchmark_results = []
all_model_succeed = True
for model_setup in self.models: for model_setup in self.models:
benchmark_results = [] benchmark_results = []
@@ -112,7 +112,6 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
f"Error running benchmark for {model_setup.model_path} with batch size:" f"Error running benchmark for {model_setup.model_path} with batch size:"
) )
print(result.stderr) print(result.stderr)
# Continue to next batch size even if one fails
continue continue
print(f"Output for {model_setup.model_path} with batch size:") print(f"Output for {model_setup.model_path} with batch size:")
@@ -136,19 +135,24 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
) )
else: else:
all_model_succeed = False
print(f"Warning: JSON output file {json_output_file} not found") print(f"Warning: JSON output file {json_output_file} not found")
finally: finally:
kill_process_tree(process.pid) kill_process_tree(process.pid)
report_part = BenchmarkResult.generate_markdown_report( report_part = generate_markdown_report(
PROFILE_DIR, benchmark_results PROFILE_DIR,
benchmark_results,
) )
self.full_report += report_part + "\n" self.full_report += report_part + "\n"
if is_in_ci(): if is_in_ci():
write_github_step_summary(self.full_report) write_github_step_summary(self.full_report)
if not all_model_succeed:
raise AssertionError("Some models failed the perf tests.")
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()