From 777eb538978862b622541eb2f85f2fe390be715b Mon Sep 17 00:00:00 2001 From: Mick Date: Sat, 27 Sep 2025 06:24:30 +0800 Subject: [PATCH] ci: refactor nightly test (#10495) --- .github/workflows/nightly-test.yml | 82 +++- python/sglang/bench_one_batch.py | 14 +- python/sglang/bench_one_batch_server.py | 338 ++++++++++++-- python/sglang/bench_serving.py | 6 +- .../srt/managers/scheduler_profiler_mixin.py | 6 +- python/sglang/test/run_eval.py | 7 + python/sglang/test/simple_eval_mmmu_vlm.py | 441 ++++++++++++++++++ python/sglang/test/test_utils.py | 136 ++++++ scripts/ci/publish_traces.py | 263 +++++++++++ test/srt/run_suite.py | 3 - test/srt/test_nightly_gsm8k_eval_amd.py | 31 +- ...=> test_nightly_text_models_gsm8k_eval.py} | 103 +--- test/srt/test_nightly_text_models_perf.py | 135 ++++++ test/srt/test_nightly_vlms_mmmu_eval.py | 117 +++++ test/srt/test_nightly_vlms_perf.py | 135 ++++++ test/srt/test_vllm_dependency.py | 26 +- 16 files changed, 1656 insertions(+), 187 deletions(-) create mode 100644 python/sglang/test/simple_eval_mmmu_vlm.py create mode 100644 scripts/ci/publish_traces.py rename test/srt/{test_nightly_gsm8k_eval.py => test_nightly_text_models_gsm8k_eval.py} (58%) create mode 100644 test/srt/test_nightly_text_models_perf.py create mode 100644 test/srt/test_nightly_vlms_mmmu_eval.py create mode 100644 test/srt/test_nightly_vlms_perf.py diff --git a/.github/workflows/nightly-test.yml b/.github/workflows/nightly-test.yml index a32c1dbea..468147454 100644 --- a/.github/workflows/nightly-test.yml +++ b/.github/workflows/nightly-test.yml @@ -15,8 +15,8 @@ concurrency: cancel-in-progress: true jobs: - nightly-test: - if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' + nightly-test-eval-text-models: + if: github.repository == 'sgl-project/sglang' runs-on: 2-gpu-runner steps: - name: Checkout code @@ -26,8 +26,82 @@ jobs: run: | bash scripts/ci/ci_install_dependency.sh - - name: Run test + - name: Run eval test for text models timeout-minutes: 120 run: | cd test/srt - python3 run_suite.py --suite nightly --timeout-per-file 3600 + python3 test_nightly_text_models_gsm8k_eval.py + + nightly-test-perf-text-models: + if: github.repository == 'sgl-project/sglang' + runs-on: 2-gpu-runner + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + bash scripts/ci/ci_install_dependency.sh + + - name: Run performance test for text models + timeout-minutes: 180 + env: + TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }} + PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }} + run: | + rm -rf test/srt/performance_profiles_text_models/ + python3 test/srt/test_nightly_text_models_perf.py + + - name: Publish traces to storage repo + env: + GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_RUN_NUMBER: ${{ github.run_number }} + run: | + python3 scripts/ci/publish_traces.py + + nightly-test-eval-vlms: + if: github.repository == 'sgl-project/sglang' + runs-on: 1-gpu-runner + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + bash scripts/ci/ci_install_dependency.sh + + - name: Run eval test for VLM models (fixed MMMU-100) + timeout-minutes: 240 + run: | + cd test/srt + python3 test_nightly_vlms_mmmu_eval.py + + + nightly-test-perf-vlms: + if: github.repository == 'sgl-project/sglang' + runs-on: 1-gpu-runner + steps: + - name: Checkout code + uses: actions/checkout@v4 + 
+ - name: Install dependencies + run: | + bash scripts/ci/ci_install_dependency.sh + + - name: Run perf test for VLM models (MMMU) + timeout-minutes: 240 + env: + TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }} + PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }} + run: | + rm -rf test/srt/performance_profiles_vlms/ + python3 test/srt/test_nightly_vlms_perf.py + + - name: Publish traces to storage repo + env: + GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_RUN_NUMBER: ${{ github.run_number }} + run: | + python3 scripts/ci/publish_traces.py --vlm diff --git a/python/sglang/bench_one_batch.py b/python/sglang/bench_one_batch.py index ebd461ec3..92f6e20d1 100644 --- a/python/sglang/bench_one_batch.py +++ b/python/sglang/bench_one_batch.py @@ -443,11 +443,9 @@ def latency_test_run_once( if profile: profiler.stop() - profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_prefill.trace.json.gz" - _save_profile_trace_results(profiler, profile_filename) - rank_print( - f"torch profiler chrome trace for prefill saved to {profile_filename}" - ) + trace_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_prefill.trace.json.gz" + _save_profile_trace_results(profiler, trace_filename) + rank_print(f"torch profiler chrome trace for prefill saved to {trace_filename}") # Decode decode_latencies = [] @@ -479,10 +477,10 @@ def latency_test_run_once( if profile and i == output_len / 2: profiler.stop() - profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_decode.trace.json.gz" - _save_profile_trace_results(profiler, profile_filename) + trace_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_decode.trace.json.gz" + _save_profile_trace_results(profiler, trace_filename) rank_print( - f"torch profiler chrome trace for decoding 1 token saved to {profile_filename}" + f"torch profiler chrome trace for decoding 1 token saved to {trace_filename}" ) # Record decode timing from 2nd output diff --git a/python/sglang/bench_one_batch_server.py b/python/sglang/bench_one_batch_server.py index ce904f967..0f2b6bc26 100644 --- a/python/sglang/bench_one_batch_server.py +++ b/python/sglang/bench_one_batch_server.py @@ -9,6 +9,7 @@ python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B -- python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --show-report --profile --profile-by-stage +python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --output-path results.json --profile """ import argparse @@ -19,12 +20,17 @@ import multiprocessing import os import random import time -from typing import List, Tuple +from typing import List, Optional, Tuple import numpy as np import requests +from pydantic import BaseModel -from sglang.bench_serving import get_tokenizer, sample_random_requests +from sglang.bench_serving import ( + get_tokenizer, + sample_mmmu_requests, + sample_random_requests, +) from sglang.profiler import run_profile from sglang.srt.entrypoints.http_server import launch_server from sglang.srt.server_args import ServerArgs 
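Aside: the new `--output-path` flag shown in the usage examples above writes each run's results as a JSON list of `BenchmarkResult` records (defined in the next hunk), which the nightly perf tests later in this patch read back. A minimal sketch of that round trip, assuming a `results.json` produced by one of the commands above (illustrative, not part of the patch):

```python
# Minimal sketch (illustrative): load an --output-path JSON back into
# BenchmarkResult objects, mirroring what the nightly perf tests below do.
import json

from sglang.bench_one_batch_server import BenchmarkResult

with open("results.json") as f:  # assumed output of --output-path results.json
    rows = [BenchmarkResult(**d) for d in json.load(f)]

for r in rows:
    print(r.model_path, r.batch_size, r.input_len, f"{r.output_throughput:.1f} tok/s")
```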
@@ -32,6 +38,109 @@ from sglang.srt.utils import is_blackwell, kill_process_tree from sglang.test.test_utils import is_in_ci, write_github_step_summary +class ProfileLinks(BaseModel): + """Pydantic model for profile trace links.""" + + extend: Optional[str] = None + decode: Optional[str] = None + + +class BenchmarkResult(BaseModel): + """Pydantic model for benchmark results table data, for a single isl and osl""" + + model_path: str + run_name: str + batch_size: int + input_len: int + output_len: int + latency: float + ttft: float + input_throughput: float + output_throughput: float + overall_throughput: float + last_gen_throughput: float + acc_length: Optional[float] = None + profile_links: Optional[ProfileLinks] = None + + @staticmethod + def help_str() -> str: + return f""" +Note: To view the traces through perfetto-ui, please: +1. use Google Chrome +2. enable popup + +""" + + def to_markdown_row( + self, trace_dir, base_url: str = "", relay_base: str = "" + ) -> str: + """Convert this benchmark result to a markdown table row.""" + # Calculate costs (assuming H100 pricing for now) + hourly_cost_per_gpu = 2 # $2/hour for one H100 + hourly_cost = hourly_cost_per_gpu * 1 # Assuming tp_size = 1 for simplicity + input_util = 0.7 + accept_length = ( + round(self.acc_length, 2) if self.acc_length is not None else "n/a" + ) + itl = 1 / (self.output_throughput / self.batch_size) * 1000 + input_cost = 1e6 / (self.input_throughput * input_util) / 3600 * hourly_cost + output_cost = 1e6 / self.output_throughput / 3600 * hourly_cost + + def get_perfetto_relay_link_from_trace_file(trace_file: str): + import os + from urllib.parse import quote + + rel_path = os.path.relpath(trace_file, trace_dir) + raw_file_link = f"{base_url}/{rel_path}" + relay_link = ( + f"{relay_base}?src={quote(raw_file_link, safe='')}" + if relay_base and quote + else raw_file_link + ) + return relay_link + + # Handle profile links + profile_link = "NA | NA" + if self.profile_links: + if self.profile_links.extend or self.profile_links.decode: + # Create a combined link or use the first available one + trace_files = [self.profile_links.extend, self.profile_links.decode] + trace_files_relay_links = [ + f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})" + for trace_file in trace_files + ] + + profile_link = " | ".join(trace_files_relay_links) + + # Build the row + return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n" + + @classmethod + def generate_markdown_report( + cls, trace_dir, results: List["BenchmarkResult"] + ) -> str: + """Generate a markdown report from a list of BenchmarkResult object from a single run.""" + import os + + summary = f"### {results[0].model_path}\n" + + # summary += ( + # f"Input lens: {result.input_len}. 
Output lens: {result.output_len}.\n" + # ) + summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n" + summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n" + + # all results should share the same isl & osl + for result in results: + base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/") + relay_base = os.getenv("PERFETTO_RELAY_URL", "").rstrip("/") + relay_base = "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html" + # base_url = "https://github.com/sgl-project/ci-data/traces" + summary += result.to_markdown_row(trace_dir, base_url, relay_base) + + return summary + + @dataclasses.dataclass class BenchArgs: run_name: str = "default" @@ -50,8 +159,12 @@ class BenchArgs: profile: bool = False profile_steps: int = 3 profile_by_stage: bool = False + profile_filename_prefix: str = None + append_to_github_summary: bool = True dataset_path: str = "" parallel_batch: bool = False + dataset_name: str = "random" + output_path: Optional[str] = None @staticmethod def add_cli_args(parser: argparse.ArgumentParser): @@ -67,6 +180,13 @@ class BenchArgs: "--output-len", type=int, nargs="+", default=BenchArgs.output_len ) parser.add_argument("--temperature", type=float, default=BenchArgs.temperature) + parser.add_argument( + "--dataset-name", + type=str, + default=BenchArgs.dataset_name, + choices=["mmmu", "random"], + help="Name of the dataset to benchmark on.", + ) parser.add_argument("--return-logprob", action="store_true") parser.add_argument( "--client-stream-interval", @@ -96,14 +216,36 @@ class BenchArgs: help="Path to the dataset.", ) parser.add_argument("--parallel-batch", action="store_true") + parser.add_argument( + "--profile-filename-prefix", + type=str, + default=BenchArgs.profile_filename_prefix, + ) + parser.add_argument( + "--no-append-to-github-summary", + action="store_false", + dest="append_to_github_summary", + help="Disable appending the output of this run to github ci summary", + ) + parser.add_argument( + "--output-path", + type=str, + default=BenchArgs.output_path, + help="Path to save benchmark results as JSON format. If not specified, results will only be saved to result-filename.", + ) @classmethod def from_cli_args(cls, args: argparse.Namespace): # use the default value's type to cast the args into correct types. 
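Editorial aside: the dollar-cost and ITL columns emitted by `to_markdown_row` above follow directly from the hard-coded $2/hour single-GPU price and 0.7 input-utilization assumptions. A worked example with illustrative throughput numbers (not measured results):

```python
# Worked example of the report math in BenchmarkResult.to_markdown_row
# (illustrative numbers only; $2/hour and 0.7 utilization are the
# assumptions hard-coded above).
hourly_cost = 2.0            # $/hour for one GPU
input_util = 0.7
batch_size = 16
input_throughput = 10_000.0  # tok/s, illustrative
output_throughput = 2_000.0  # tok/s, illustrative

itl_ms = 1 / (output_throughput / batch_size) * 1000                      # 8.00 ms
input_cost = 1e6 / (input_throughput * input_util) / 3600 * hourly_cost   # ~$0.079 per 1M input tokens
output_cost = 1e6 / output_throughput / 3600 * hourly_cost                # ~$0.278 per 1M output tokens
print(f"ITL={itl_ms:.2f} ms, input=${input_cost:.3f}/1M tok, output=${output_cost:.3f}/1M tok")
```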
attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)] - return cls( - **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs} - ) + kwargs = {} + for attr, attr_type in attrs: + val = getattr(args, attr) + if attr_type is type(None): + kwargs[attr] = val + else: + kwargs[attr] = attr_type(val) + return cls(**kwargs) def launch_server_internal(server_args): @@ -148,23 +290,35 @@ def run_one_case( run_name: str, result_filename: str, tokenizer, + dataset_name="", profile: bool = False, profile_steps: int = 3, profile_by_stage: bool = False, + profile_filename_prefix: str = None, dataset_path: str = "", parallel_batch: bool = False, ): requests.post(url + "/flush_cache") - input_requests = sample_random_requests( - input_len=input_len, - output_len=output_len, - num_prompts=batch_size, - range_ratio=1.0, - tokenizer=tokenizer, - dataset_path=dataset_path, - random_sample=True, - return_text=False, - ) + # TODO: reuse bench_serving.get_dataset ? + if dataset_name == "mmmu": + input_requests = sample_mmmu_requests( + num_requests=batch_size, + tokenizer=tokenizer, + fixed_output_len=output_len, + apply_chat_template=True, + random_sample=False, + ) + elif dataset_name == "random": + input_requests = sample_random_requests( + input_len=input_len, + output_len=output_len, + num_prompts=batch_size, + range_ratio=1.0, + tokenizer=tokenizer, + dataset_path=dataset_path, + random_sample=True, + return_text=False, + ) use_structured_outputs = False if use_structured_outputs: @@ -181,26 +335,48 @@ def run_one_case( profile_link = None if profile: + output_dir, profile_name = None, None + if profile_filename_prefix: + output_dir = os.path.dirname(profile_filename_prefix) + profile_name = os.path.basename(profile_filename_prefix) profile_link: str = run_profile( - url, profile_steps, ["CPU", "GPU"], None, None, profile_by_stage + url, + profile_steps, + ["CPU", "GPU"], + output_dir, + profile_name, + profile_by_stage, ) tic = time.perf_counter() + + payload = { + "sampling_params": { + "temperature": temperature, + "max_new_tokens": output_len, + "ignore_eos": True, + "json_schema": json_schema, + "stream_interval": stream_interval, + }, + "return_logprob": return_logprob, + "stream": True, + **({"parallel_batch": parallel_batch} if parallel_batch else {}), + } + if dataset_name == "mmmu": + # vlm + input_ids = [] + for input_req in input_requests: + input_ids += [tokenizer.encode(input_req.prompt)] + payload["image_data"] = [req.image_data for req in input_requests] + + else: + input_ids = [req.prompt for req in input_requests] + + payload["input_ids"] = input_ids + response = requests.post( url + "/generate", - json={ - "input_ids": [req.prompt for req in input_requests], - "sampling_params": { - "temperature": temperature, - "max_new_tokens": output_len, - "ignore_eos": True, - "json_schema": json_schema, - "stream_interval": stream_interval, - }, - "return_logprob": return_logprob, - "stream": True, - **({"parallel_batch": parallel_batch} if parallel_batch else {}), - }, + json=payload, stream=True, ) @@ -264,10 +440,100 @@ def run_one_case( overall_throughput, last_gen_throughput, acc_length, - profile_link if profile else None, + profile_link, ) +def save_results_as_json(result: List[Tuple], bench_args: BenchArgs, model: str): + """Save benchmark results as JSON using Pydantic models.""" + json_results = [] + + # Generate all parameter combinations to match with results + param_combinations = list( + itertools.product( + bench_args.batch_size, 
bench_args.input_len, bench_args.output_len + ) + ) + + for i, ( + batch_size, + latency, + ttft, + input_throughput, + output_throughput, + overall_throughput, + last_gen_throughput, + acc_length, + profile_link, + ) in enumerate(result): + # Get the corresponding parameters for this result + bs, input_len, output_len = param_combinations[i] + + # Parse profile links if available + profile_links = None + if profile_link: + profile_links = parse_profile_links( + profile_link, batch_size, input_len, output_len + ) + + benchmark_result = BenchmarkResult( + model_path=model, + run_name=bench_args.run_name, + batch_size=batch_size, + input_len=input_len, + output_len=output_len, + latency=latency, + ttft=ttft, + input_throughput=input_throughput, + output_throughput=output_throughput, + overall_throughput=overall_throughput, + last_gen_throughput=last_gen_throughput, + acc_length=acc_length, + profile_links=profile_links, + ) + json_results.append(benchmark_result.model_dump()) + + # Save to JSON file + with open(bench_args.output_path, "w", encoding="utf-8") as f: + json.dump(json_results, f, indent=2, ensure_ascii=False) + + print(f"Results saved as JSON to {bench_args.output_path}") + + +def parse_profile_links( + profile_dir: str, batch_size: int, input_len: int, output_len: int +) -> Optional[ProfileLinks]: + """Parse profile directory to extract extend and decode trace file links.""" + if not profile_dir or not os.path.exists(profile_dir): + return None + + extend_link = None + decode_link = None + + # Look for extend/prefill trace files + for file in os.listdir(profile_dir): + if file.endswith(".trace.json.gz") or file.endswith(".trace.json"): + if "extend" in file.lower() or "prefill" in file.lower(): + extend_link = os.path.join(profile_dir, file) + elif "decode" in file.lower(): + decode_link = os.path.join(profile_dir, file) + + # If no specific extend/decode files found, try to find files with batch/input/output info + if not extend_link or not decode_link: + for file in os.listdir(profile_dir): + if file.endswith(".trace.json.gz") or file.endswith(".trace.json"): + if f"_batch{batch_size}_input{input_len}_output{output_len}_" in file: + if "prefill" in file.lower() or "extend" in file.lower(): + extend_link = os.path.join(profile_dir, file) + elif "decode" in file.lower(): + decode_link = os.path.join(profile_dir, file) + + if extend_link or decode_link: + return ProfileLinks(extend=extend_link, decode=decode_link) + + return None + + def get_report_summary( result: List[Tuple], server_args: ServerArgs, bench_args: BenchArgs ): @@ -358,6 +624,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs): return_logprob=bench_args.return_logprob, stream_interval=bench_args.client_stream_interval, input_len_step_percentage=bench_args.input_len_step_percentage, + dataset_name=bench_args.dataset_name, run_name="", result_filename="", tokenizer=tokenizer, @@ -384,10 +651,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs): stream_interval=bench_args.client_stream_interval, input_len_step_percentage=bench_args.input_len_step_percentage, run_name=bench_args.run_name, + dataset_name=bench_args.dataset_name, result_filename=bench_args.result_filename, tokenizer=tokenizer, dataset_path=bench_args.dataset_path, parallel_batch=bench_args.parallel_batch, + profile_filename_prefix=bench_args.profile_filename_prefix, ) ) @@ -410,11 +679,13 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs): run_name=bench_args.run_name, 
result_filename=bench_args.result_filename, tokenizer=tokenizer, + dataset_name=bench_args.dataset_name, profile=bench_args.profile, profile_steps=bench_args.profile_steps, profile_by_stage=bench_args.profile_by_stage, dataset_path=bench_args.dataset_path, parallel_batch=bench_args.parallel_batch, + profile_filename_prefix=bench_args.profile_filename_prefix, )[-1], ) ) @@ -427,13 +698,16 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs): print(f"\nResults are saved to {bench_args.result_filename}") + # Save results as JSON if output_path is specified + if bench_args.output_path: + save_results_as_json(result, bench_args, model=server_args.model_path) + if not bench_args.show_report: return summary = get_report_summary(result, server_args, bench_args) - print(summary) - if is_in_ci(): + if is_in_ci() and bench_args.append_to_github_summary: write_github_step_summary(summary) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index ea670d97f..a6ad956d0 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -208,6 +208,10 @@ async def async_request_openai_completions( "ignore_eos": not args.disable_ignore_eos, **request_func_input.extra_request_body, } + + if request_func_input.image_data: + payload.update({"image_data": request_func_input.image_data}) + headers = get_auth_headers() output = RequestFuncOutput.init_new(request_func_input) @@ -664,7 +668,7 @@ def get_dataset(args, tokenizer): num_prompts=args.num_prompts, range_ratio=args.random_range_ratio, tokenizer=tokenizer, - dataset_path=args.dataset_path, + dataset_path=args.dataset_name, random_sample=args.dataset_name == "random", return_text=not tokenize_prompt, ) diff --git a/python/sglang/srt/managers/scheduler_profiler_mixin.py b/python/sglang/srt/managers/scheduler_profiler_mixin.py index e7ac8452d..74af358f6 100644 --- a/python/sglang/srt/managers/scheduler_profiler_mixin.py +++ b/python/sglang/srt/managers/scheduler_profiler_mixin.py @@ -97,7 +97,7 @@ class SchedulerProfilerMixin: def start_profile( self, stage: Optional[ForwardMode] = None ) -> ProfileReqOutput | None: - stage_str = f" for {stage.__str__()}" if stage else "" + stage_str = f" for {stage.name}" if stage else "" logger.info( f"Profiling starts{stage_str}. 
Traces will be saved to: {self.torch_profiler_output_dir} (with profile id: {self.profile_id})", ) @@ -181,7 +181,7 @@ class SchedulerProfilerMixin: if not Path(self.torch_profiler_output_dir).exists(): Path(self.torch_profiler_output_dir).mkdir(parents=True, exist_ok=True) - stage_suffix = f"-{stage.__str__()}" if stage else "" + stage_suffix = f"-{stage.name}" if stage else "" logger.info("Stop profiling" + stage_suffix + "...") if self.torch_profiler is not None: self.torch_profiler.stop() @@ -247,7 +247,7 @@ class SchedulerProfilerMixin: if self.profiler_decode_ct == 0: if self.profile_in_progress: # force trace flush - self.stop_profile(ForwardMode.EXTEND) + self.stop_profile(stage=ForwardMode.EXTEND) self.start_profile(batch.forward_mode) self.profiler_decode_ct += 1 if self.profiler_decode_ct > self.profiler_target_decode_ct: diff --git a/python/sglang/test/run_eval.py b/python/sglang/test/run_eval.py index 9b788cc0a..85f84c36b 100644 --- a/python/sglang/test/run_eval.py +++ b/python/sglang/test/run_eval.py @@ -60,6 +60,11 @@ def run_eval(args): from sglang.test.simple_eval_humaneval import HumanEval eval_obj = HumanEval(args.num_examples, args.num_threads) + elif args.eval_name == "mmmu": + # VLM MMMU evaluation with fixed 100 examples by default + from sglang.test.simple_eval_mmmu_vlm import MMMUVLMEval + + eval_obj = MMMUVLMEval(args.num_examples, args.num_threads) else: raise ValueError(f"Invalid eval name: {args.eval_name}") @@ -94,6 +99,8 @@ def run_eval(args): print(f"Total latency: {latency:.3f} s") print(f"Score: {metrics['score']:.3f}") + if getattr(args, "return_latency", False): + return metrics, latency return metrics diff --git a/python/sglang/test/simple_eval_mmmu_vlm.py b/python/sglang/test/simple_eval_mmmu_vlm.py new file mode 100644 index 000000000..2f64df004 --- /dev/null +++ b/python/sglang/test/simple_eval_mmmu_vlm.py @@ -0,0 +1,441 @@ +""" +MMMU evaluation for VLMs using the run_eval simple-evals interface. 
+ +""" + +from __future__ import annotations + +import base64 +import io +from typing import List, Optional, Tuple + +from datasets import concatenate_datasets, load_dataset +from PIL import Image + +from sglang.test import simple_eval_common as common +from sglang.test.simple_eval_common import ( + HTML_JINJA, + Eval, + EvalResult, + SamplerBase, + SingleEvalResult, + map_with_progress, +) + + +class MMMUVLMEval(Eval): + DOMAIN_CAT2SUB_CAT = { + "Art and Design": ["Art", "Art_Theory", "Design", "Music"], + "Business": ["Accounting", "Economics", "Finance", "Manage", "Marketing"], + "Science": ["Biology", "Chemistry", "Geography", "Math", "Physics"], + "Health and Medicine": [ + "Basic_Medical_Science", + "Clinical_Medicine", + "Diagnostics_and_Laboratory_Medicine", + "Pharmacy", + "Public_Health", + ], + "Humanities and Social Science": [ + "History", + "Literature", + "Sociology", + "Psychology", + ], + "Tech and Engineering": [ + "Agriculture", + "Architecture_and_Engineering", + "Computer_Science", + "Electronics", + "Energy_and_Power", + "Materials", + "Mechanical_Engineering", + ], + } + + def __init__( + self, num_examples: Optional[int] = 100, num_threads: int = 32, seed: int = 42 + ): + """Create MMMU VLM eval (Math subset, 100 fixed samples by default).""" + self.num_examples = num_examples + self.num_threads = num_threads + self.seed = seed + # Prepare samples deterministically across all MMMU subjects (validation split) + self.samples = self._prepare_mmmu_samples(self.num_examples) + + @staticmethod + def _to_data_uri(image: Image.Image) -> str: + if image.mode == "RGBA": + image = image.convert("RGB") + buf = io.BytesIO() + image.save(buf, format="PNG") + b64 = base64.b64encode(buf.getvalue()).decode("utf-8") + return f"data:image/png;base64,{b64}" + + @staticmethod + def _build_mc_mapping(options: List[str]) -> Tuple[dict, List[str]]: + index2ans = {} + all_choices = [] + ch = ord("A") + for opt in options: + letter = chr(ch) + index2ans[letter] = opt + all_choices.append(letter) + ch += 1 + return index2ans, all_choices + + def _prepare_mmmu_samples(self, k: int) -> List[dict]: + # Subjects and domains copied from MMMU data_utils to categorize results + subjects: List[str] = [] + for subs in self.DOMAIN_CAT2SUB_CAT.values(): + subjects.extend(subs) + + # Load validation split of each subject + datasets = [] + for subj in subjects: + try: + d = load_dataset("MMMU/MMMU", subj, split="validation") + # attach subject info via transform + d = d.add_column("__subject__", [subj] * len(d)) + datasets.append(d) + except Exception: + continue + if not datasets: + raise RuntimeError("Failed to load MMMU datasets") + + merged = concatenate_datasets(datasets) + + # Deterministic selection: sort by id (fallback to subject+index) + def _key(idx): + ex = merged[idx] + return str(ex.get("id", f"{ex['__subject__']}:{idx}")) + + order = sorted(range(len(merged)), key=_key) + picked_indices = order[:k] + + samples: List[dict] = [] + for idx in picked_indices: + ex = merged[idx] + subject = ex["__subject__"] + image = ex.get("image_1") + if image is None or not hasattr(image, "convert"): + continue + data_uri = self._to_data_uri(image) + question = ex.get("question", "") + answer = ex.get("answer") + raw_options = ex.get("options") + question_type = "open" + index2ans = None + all_choices = None + options = None + if raw_options: + try: + options = ( + raw_options + if isinstance(raw_options, list) + else list(eval(raw_options)) + ) + if isinstance(options, list) and len(options) > 0: + 
index2ans, all_choices = self._build_mc_mapping(options) + question_type = "multiple-choice" + except Exception: + options = None + + # Build final textual prompt; include choices if MC + prompt_text = f"Question: {question}\n\n" + if options: + letters = [chr(ord("A") + i) for i in range(len(options))] + for letter, opt in zip(letters, options): + prompt_text += f"{letter}) {opt}\n" + prompt_text += "\nAnswer: " + + samples.append( + { + "id": ex.get("id", f"{subject}:{idx}"), + "final_input_prompt": prompt_text, + "image_data": data_uri, + "answer": answer, + "question_type": question_type, + "index2ans": index2ans, + "all_choices": all_choices, + "category": subject, + } + ) + + return samples + + @staticmethod + def _split_prompt_for_image(prompt: str) -> tuple[str, str]: + """Split a prompt containing an inline image tag into prefix and suffix. + + If no tag is present, treat the whole prompt as prefix and empty suffix. + """ + if "<" in prompt and ">" in prompt: + prefix = prompt.split("<")[0] + suffix = prompt.split(">", 1)[1] + return prefix, suffix + return prompt, "" + + @staticmethod + def build_chat_messages_from_prompt(prompt: str, image_data) -> List: + """Split a prompt containing an inline image tag into prefix and suffix. + + If no tag is present, treat the whole prompt as prefix and empty suffix. + """ + # Build a vision+text message for OpenAI-compatible API + prefix, suffix = MMMUVLMEval._split_prompt_for_image(prompt) + + content: List[dict] = [] + if prefix: + content.append({"type": "text", "text": prefix}) + content.append({"type": "image_url", "image_url": {"url": image_data}}) + if suffix: + content.append({"type": "text", "text": suffix}) + prompt_messages = [{"role": "user", "content": content}] + + return prompt_messages + + def __call__(self, sampler: SamplerBase) -> EvalResult: + def fn(sample: dict): + prompt = sample["final_input_prompt"] + image_data = sample["image_data"] + prompt_messages = MMMUVLMEval.build_chat_messages_from_prompt( + prompt, image_data + ) + + # Sample + response_text = sampler(prompt_messages) + + # Parse and score + gold = sample["answer"] + if ( + sample["question_type"] == "multiple-choice" + and sample["all_choices"] + and sample["index2ans"] + ): + pred = _parse_multi_choice_response( + response_text, sample["all_choices"], sample["index2ans"] + ) + score = 1.0 if (gold is not None and pred == gold) else 0.0 + extracted_answer = pred + else: + parsed_list = _parse_open_response(response_text) + score = ( + 1.0 if (gold is not None and _eval_open(gold, parsed_list)) else 0.0 + ) + extracted_answer = ", ".join(map(str, parsed_list)) + + html_rendered = common.jinja_env.from_string(HTML_JINJA).render( + prompt_messages=prompt_messages, + next_message=dict(content=response_text, role="assistant"), + score=score, + correct_answer=gold, + extracted_answer=extracted_answer, + ) + + convo = prompt_messages + [dict(content=response_text, role="assistant")] + return SingleEvalResult( + html=html_rendered, + score=score, + metrics={"__category__": sample["category"]}, + convo=convo, + ) + + results = map_with_progress(fn, self.samples, self.num_threads) + + # Build category table and overall accuracy + # Gather per-sample correctness and category + per_cat_total: dict[str, int] = {} + per_cat_correct: dict[str, int] = {} + htmls = [] + convos = [] + scores: List[float] = [] + for r in results: + # __category__ stored under metrics + cat = r.metrics.get("__category__") if r.metrics else None + if cat is None: + cat = "Unknown" + 
per_cat_total[cat] = per_cat_total.get(cat, 0) + 1 + if r.score: + per_cat_correct[cat] = per_cat_correct.get(cat, 0) + 1 + htmls.append(r.html) + convos.append(r.convo) + if r.score is not None: + scores.append(r.score) + + evaluation_result = {} + for cat, tot in per_cat_total.items(): + corr = per_cat_correct.get(cat, 0) + acc = (corr / tot) if tot > 0 else 0.0 + evaluation_result[cat] = {"acc": round(acc, 3), "num_example": tot} + + printable_results = {} + # Domains first + for domain, cats in self.DOMAIN_CAT2SUB_CAT.items(): + acc_sum = 0.0 + num_sum = 0 + for cat in cats: + if cat in evaluation_result: + acc_sum += ( + evaluation_result[cat]["acc"] + * evaluation_result[cat]["num_example"] + ) + num_sum += evaluation_result[cat]["num_example"] + if num_sum > 0: + printable_results[f"Overall-{domain}"] = { + "num": num_sum, + "acc": round(acc_sum / num_sum, 3), + } + # add each sub-category row if present + for cat in cats: + if cat in evaluation_result: + printable_results[cat] = { + "num": evaluation_result[cat]["num_example"], + "acc": evaluation_result[cat]["acc"], + } + + # Overall + total_num = sum(v["num_example"] for v in evaluation_result.values()) + overall_acc = ( + sum(v["acc"] * v["num_example"] for v in evaluation_result.values()) + / total_num + if total_num > 0 + else 0.0 + ) + printable_results["Overall"] = {"num": total_num, "acc": round(overall_acc, 3)} + + # Build EvalResult + return EvalResult( + score=overall_acc, metrics=printable_results, htmls=htmls, convos=convos + ) + + +def _parse_multi_choice_response( + response: str, all_choices: List[str], index2ans: dict +) -> str: + # loosely adapted from benchmark mmmu eval + for char in [",", ".", "!", "?", ";", ":", "'"]: + response = response.strip(char) + response = " " + response + " " + + # Prefer explicit letter with bracket e.g. 
(A) + candidates: List[str] = [] + for choice in all_choices: + if f"({choice})" in response: + candidates.append(choice) + if not candidates: + for choice in all_choices: + if f" {choice} " in response: + candidates.append(choice) + if not candidates and len(response.split()) > 5: + # try match by option text + for idx, ans in index2ans.items(): + if ans and ans.lower() in response.lower(): + candidates.append(idx) + if not candidates: + # fallback to first choice + return all_choices[0] + if len(candidates) == 1: + return candidates[0] + # choose the last occurrence + starts = [] + for can in candidates: + pos = response.rfind(f"({can})") + if pos == -1: + pos = response.rfind(f" {can} ") + if pos == -1 and index2ans.get(can): + pos = response.lower().rfind(index2ans[can].lower()) + starts.append(pos) + return candidates[int(max(range(len(starts)), key=lambda i: starts[i]))] + + +def _check_is_number(s: str) -> bool: + try: + float(s.replace(",", "")) + return True + except Exception: + return False + + +def _normalize_str(s: str): + s = s.strip() + if _check_is_number(s): + s = s.replace(",", "") + try: + v = round(float(s), 2) + return [v] + except Exception: + return [s.lower()] + return [s.lower()] if len(s) > 1 else [" " + s, s + " "] + + +def _extract_numbers(s: str) -> List[str]: + import re as _re + + pattern_commas = r"-?\b\d{1,3}(?:,\d{3})+\b" + pattern_scientific = r"-?\d+(?:\.\d+)?[eE][+-]?\d+" + pattern_simple = r"-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d])" + return ( + _re.findall(pattern_commas, s) + + _re.findall(pattern_scientific, s) + + _re.findall(pattern_simple, s) + ) + + +def _parse_open_response(response: str) -> List[str]: + import re as _re + + def get_key_subresponses(resp: str) -> List[str]: + resp = resp.strip().strip(".").lower() + subs = _re.split(r"\.\s(?=[A-Z])|\n", resp) + indicators = [ + "could be ", + "so ", + "is ", + "thus ", + "therefore ", + "final ", + "answer ", + "result ", + ] + keys = [] + for i, s in enumerate(subs): + cands = [*indicators] + if i == len(subs) - 1: + cands.append("=") + shortest = None + for ind in cands: + if ind in s: + part = s.split(ind)[-1].strip() + if not shortest or len(part) < len(shortest): + shortest = part + if shortest and shortest not in [":", ",", ".", "!", "?", ";", ":", "'"]: + keys.append(shortest) + return keys or [resp] + + key_resps = get_key_subresponses(response) + pred_list = key_resps.copy() + for r in key_resps: + pred_list.extend(_extract_numbers(r)) + out = [] + for x in pred_list: + out.extend(_normalize_str(x)) + # dedup + return list(dict.fromkeys(out)) + + +def _eval_open(gold, preds: List[str]) -> bool: + if isinstance(gold, list): + norm_answers = [] + for ans in gold: + norm_answers.extend(_normalize_str(ans)) + else: + norm_answers = _normalize_str(gold) + for p in preds: + if isinstance(p, str): + for na in norm_answers: + if isinstance(na, str) and na in p: + return True + else: + if p in norm_answers: + return True + return False diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 208b45578..410f0aa99 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -14,10 +14,12 @@ import time import unittest from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass +from datetime import datetime from functools import partial from pathlib import Path from types import SimpleNamespace from typing import Any, Awaitable, Callable, List, Optional, Tuple +from urllib.parse import quote import aiohttp 
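For context, a quick illustration of how `_parse_multi_choice_response` above resolves an answer letter: it first looks for an explicit "(X)" token, then a standalone letter, and only then falls back to matching the option text. An editorial aside with made-up inputs, not part of the eval:

```python
# Illustrative only: behavior of the multiple-choice parser added above.
# Assumes the new module from this patch is importable.
from sglang.test.simple_eval_mmmu_vlm import _parse_multi_choice_response

choices = ["A", "B", "C", "D"]
index2ans = {"A": "3", "B": "5", "C": "7", "D": "9"}

print(_parse_multi_choice_response("I think the answer is (B).", choices, index2ans))
# "B"  -- explicit "(B)" match
print(_parse_multi_choice_response(
    "After working through the problem carefully, the result equals 7",
    choices, index2ans,
))
# "C"  -- no letter token found; falls back to matching the option text "7"
```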
import numpy as np @@ -1467,3 +1469,137 @@ def dump_bench_raw_result( def _ensure_remove_suffix(text: str, suffix: str): assert text.endswith(suffix) return text.removesuffix(suffix) + + +class ModelDeploySetup: + def __init__(self, model_path: str, extra_args: List[str] = []): + self.model_path = model_path + if "--enable-multimodal" not in extra_args: + extra_args.append("--enable-multimodal") + if "--trust-remote-code" not in extra_args: + extra_args.append("--trust-remote-code") + + self.extra_args = extra_args + + +class ModelEvalMetrics: + def __init__(self, accuracy: float, eval_time: float): + self.accuracy = accuracy + self.eval_time = eval_time + + +def extract_trace_link_from_bench_one_batch_server_output(output: str) -> str: + match = re.search(r"\[Profile\]\((.*?)\)", output) + if match: + trace_link = match.group(1) + return trace_link + return None + + +def parse_models(model_string: str): + return [model.strip() for model in model_string.split(",") if model.strip()] + + +def check_evaluation_test_results( + results, + test_name, + model_accuracy_thresholds, + model_latency_thresholds=None, + model_count=None, +): + """ + results: list of tuple of (model_path, accuracy, latency) + """ + failed_models = [] + if model_latency_thresholds is not None: + summary = " | model | status | score | score_threshold | latency | latency_threshold | \n" + summary += "| ----- | ------ | ----- | --------------- | ------- | ----------------- | \n" + else: + summary = " | model | status | score | score_threshold | \n" + summary += "| ----- | ------ | ----- | --------------- | \n" + + for model, accuracy, latency in results: + accuracy_threshold = model_accuracy_thresholds.get(model) + if accuracy_threshold is None: + print(f"Warning: No threshold defined for model {model}") + continue + + latency_threshold = ( + model_latency_thresholds.get(model, None) + if model_latency_thresholds + else 1e9 + ) + + is_success = accuracy >= accuracy_threshold and latency <= latency_threshold + status_emoji = "✅" if is_success else "❌" + + if not is_success: + failed_models.append( + f"\nScore Check Failed: {model}\n" + f"Model {model} score ({accuracy:.4f}) is below threshold ({accuracy_threshold:.4f})" + ) + + if model_latency_thresholds is not None: + line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold} | {latency} | {latency_threshold}\n" + else: + line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold}\n" + + summary += line + + print(summary) + + if is_in_ci(): + write_github_step_summary(f"## {test_name}\n{summary}") + + some_model_failed_to_get_result = len(results) != ( + model_count or len(model_accuracy_thresholds) + ) + if some_model_failed_to_get_result: + print("Some model has failed to launch and be evaluated") + + if failed_models or some_model_failed_to_get_result: + raise AssertionError("\n".join(failed_models)) + + +# Bench knobs for bench_one_batch_server (override by env) +def _parse_int_list_env(name: str, default_val: str): + val = os.environ.get(name, default_val) + return [int(x) for x in val.split(",") if x] + + +# Return filenames +def find_traces_under_path(path: str) -> List[str]: + results = [] + for _, dirs, files in os.walk(path): + for file in files: + if file.endswith(".trace.json.gz"): + results.append(f"{file}") + return results + + +def write_results_to_json(model, metrics, mode="a"): + result = { + "timestamp": datetime.now().isoformat(), + "model": model, + "metrics": metrics, + "score": metrics["score"], + } + + if "latency" in 
metrics: + result["latency"] = (metrics.get("latency"),) + + existing_results = [] + if mode == "a" and os.path.exists("results.json"): + try: + with open("results.json", "r") as f: + existing_results = json.load(f) + except json.JSONDecodeError: + existing_results = [] + + if isinstance(existing_results, list): + existing_results.append(result) + else: + existing_results = [result] + + with open("results.json", "w") as f: + json.dump(existing_results, f, indent=2) diff --git a/scripts/ci/publish_traces.py b/scripts/ci/publish_traces.py new file mode 100644 index 000000000..5c27cf87f --- /dev/null +++ b/scripts/ci/publish_traces.py @@ -0,0 +1,263 @@ +""" +Publish performance traces to GitHub repository +""" + +import argparse +import base64 +import json +import os +import sys +from urllib.request import Request, urlopen + + +def make_github_request(url, token, method="GET", data=None): + """Make authenticated request to GitHub API""" + headers = { + "Accept": "application/vnd.github+json", + "Authorization": f"Bearer {token}", + # "User-Agent": "sglang-ci", + "X-GitHub-Api-Version": "2022-11-28", + } + + if data: + headers["Content-Type"] = "application/json" + data = json.dumps(data).encode("utf-8") + + req = Request(url, data=data, headers=headers, method=method) + + try: + with urlopen(req) as response: + return response.read().decode("utf-8") + except Exception as e: + print(f"GitHub API request failed: {e}") + if hasattr(e, "read"): + try: + error_body = e.read().decode("utf-8") + print(f"Error response body: {error_body}") + except: + pass + raise + + +def verify_token_permissions(repo_owner, repo_name, token): + """Verify that the token has necessary permissions for the repository""" + print("Verifying token permissions...") + + # Check if we can access the repository + try: + url = f"https://api.github.com/repos/{repo_owner}/{repo_name}" + response = make_github_request(url, token) + repo_data = json.loads(response) + print(f"Repository access verified: {repo_data['full_name']}") + except Exception as e: + print(f"Failed to access repository: {e}") + return False + + # Check if we can read the repository contents + try: + url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents" + response = make_github_request(url, token) + print("Repository contents access verified") + except Exception as e: + print(f"Failed to access repository contents: {e}") + return False + + return True + + +def get_branch_sha(repo_owner, repo_name, branch, token): + """Get SHA of the branch head""" + url = ( + f"https://api.github.com/repos/{repo_owner}/{repo_name}/git/refs/heads/{branch}" + ) + response = make_github_request(url, token) + data = json.loads(response) + return data["object"]["sha"] + + +def get_tree_sha(repo_owner, repo_name, commit_sha, token): + """Get tree SHA from commit""" + url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/git/commits/{commit_sha}" + response = make_github_request(url, token) + data = json.loads(response) + return data["tree"]["sha"] + + +def create_blob(repo_owner, repo_name, content, token): + """Create a blob with file content""" + url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/git/blobs" + + # Encode content as base64 for GitHub API + content_b64 = base64.b64encode(content).decode("utf-8") + + data = {"content": content_b64, "encoding": "base64"} + + response = make_github_request(url, token, method="POST", data=data) + return json.loads(response)["sha"] + + +def create_tree(repo_owner, repo_name, base_tree_sha, files, 
token): + """Create a new tree with files""" + url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/git/trees" + + tree_items = [] + for file_path, content in files: + # Create blob first to get SHA + blob_sha = create_blob(repo_owner, repo_name, content, token) + tree_items.append( + { + "path": file_path, + "mode": "100644", + "type": "blob", + "sha": blob_sha, + } + ) + + data = {"base_tree": base_tree_sha, "tree": tree_items} + + response = make_github_request(url, token, method="POST", data=data) + return json.loads(response)["sha"] + + +def create_commit(repo_owner, repo_name, tree_sha, parent_sha, message, token): + """Create a new commit""" + url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/git/commits" + + data = {"tree": tree_sha, "parents": [parent_sha], "message": message} + + response = make_github_request(url, token, method="POST", data=data) + return json.loads(response)["sha"] + + +def update_branch_ref(repo_owner, repo_name, branch, commit_sha, token): + """Update branch reference to point to new commit""" + url = ( + f"https://api.github.com/repos/{repo_owner}/{repo_name}/git/refs/heads/{branch}" + ) + + data = {"sha": commit_sha} + + make_github_request(url, token, method="PATCH", data=data) + + +def copy_trace_files(source_dir, target_base_path, is_vlm=False): + """Copy trace files and return list of files to upload""" + files_to_upload = [] + + if not os.path.exists(source_dir): + print(f"Warning: Traces directory {source_dir} does not exist") + return files_to_upload + + # Walk through source directory and find .json.gz files + for root, dirs, files in os.walk(source_dir): + for file in files: + if file.endswith(".json.gz"): + source_file = os.path.join(root, file) + # Calculate relative path from source_dir + rel_path = os.path.relpath(source_file, source_dir) + target_path = f"{target_base_path}/{rel_path}" + + # Read file content + with open(source_file, "rb") as f: + content = f.read() + + files_to_upload.append((target_path, content)) + + return files_to_upload + + +def publish_traces(traces_dir, run_id, run_number, is_vlm=False): + """Publish traces to GitHub repository in a single commit""" + # Get environment variables + token = os.getenv("GITHUB_TOKEN") + if not token: + print("Error: GITHUB_TOKEN environment variable not set") + sys.exit(1) + + # Repository configuration + repo_owner = "sglang-bot" + repo_name = "sglang-ci-data" + branch = "main" + target_base_path = f"traces/{run_id}" + + # Copy trace files + files_to_upload = copy_trace_files(traces_dir, target_base_path, is_vlm) + + if not files_to_upload: + print("No trace files found to upload") + return + + print(f"Found {len(files_to_upload)} files to upload") + + # Verify token permissions before proceeding + if not verify_token_permissions(repo_owner, repo_name, token): + print( + "Token permission verification failed. Please check the token permissions." 
+ ) + sys.exit(1) + + try: + # Get current branch head + branch_sha = get_branch_sha(repo_owner, repo_name, branch, token) + print(f"Current branch head: {branch_sha}") + + # Get current tree + tree_sha = get_tree_sha(repo_owner, repo_name, branch_sha, token) + print(f"Current tree SHA: {tree_sha}") + + # Create new tree with all files + new_tree_sha = create_tree( + repo_owner, repo_name, tree_sha, files_to_upload, token + ) + print(f"Created new tree: {new_tree_sha}") + + # Create commit + commit_message = f"Nightly traces for run {run_id} at {run_number} ({len(files_to_upload)} files)" + commit_sha = create_commit( + repo_owner, repo_name, new_tree_sha, branch_sha, commit_message, token + ) + print(f"Created commit: {commit_sha}") + + # Update branch reference + update_branch_ref(repo_owner, repo_name, branch, commit_sha, token) + print("Updated branch reference") + + print("Successfully published all traces in a single commit") + + except Exception as e: + print(f"Failed to publish traces: {e}") + raise + + +def main(): + parser = argparse.ArgumentParser( + description="Publish performance traces to GitHub repository" + ) + parser.add_argument("--vlm", action="store_true", help="Process VLM model traces") + args = parser.parse_args() + + # Get environment variables + + run_id = os.getenv("GITHUB_RUN_ID", "test") + run_number = os.getenv("GITHUB_RUN_NUMBER", "12345") + + if not run_id or not run_number: + print( + "Error: GITHUB_RUN_ID and GITHUB_RUN_NUMBER environment variables must be set" + ) + sys.exit(1) + + # Determine traces directory + if args.vlm: + traces_dir = "performance_profiles_vlms" + print("Processing VLM model traces") + else: + traces_dir = "performance_profiles_text_models" + print("Processing text model traces") + + # Publish traces + publish_traces(traces_dir, run_id, run_number, args.vlm) + + +if __name__ == "__main__": + main() diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 53720467a..7b5210f5b 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -165,9 +165,6 @@ suites = { "per-commit-8-gpu-h20": [ TestFile("quant/test_w4a8_deepseek_v3.py", 371), ], - "nightly": [ - TestFile("test_nightly_gsm8k_eval.py"), - ], "vllm_dependency_test": [ TestFile("quant/test_awq.py", 163), TestFile("test_bnb.py", 5), diff --git a/test/srt/test_nightly_gsm8k_eval_amd.py b/test/srt/test_nightly_gsm8k_eval_amd.py index d03684b99..232fde507 100644 --- a/test/srt/test_nightly_gsm8k_eval_amd.py +++ b/test/srt/test_nightly_gsm8k_eval_amd.py @@ -15,8 +15,10 @@ from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, is_in_ci, + parse_models, popen_launch_server, write_github_step_summary, + write_results_to_json, ) MODEL_SCORE_THRESHOLDS = { @@ -73,10 +75,6 @@ TRITON_MOE_MODELS = { } -def parse_models(model_string): - return [model.strip() for model in model_string.split(",") if model.strip()] - - def popen_launch_server_wrapper(base_url, model, is_tp2): other_args = ["--log-level-http", "warning", "--trust-remote-code"] if is_tp2: @@ -91,31 +89,6 @@ def popen_launch_server_wrapper(base_url, model, is_tp2): return process -def write_results_to_json(model, metrics, mode="a"): - result = { - "timestamp": datetime.now().isoformat(), - "model": model, - "metrics": metrics, - "score": metrics["score"], - } - - existing_results = [] - if mode == "a" and os.path.exists("results.json"): - try: - with open("results.json", "r") as f: - existing_results = json.load(f) - except json.JSONDecodeError: - existing_results = [] - - if 
isinstance(existing_results, list): - existing_results.append(result) - else: - existing_results = [result] - - with open("results.json", "w") as f: - json.dump(existing_results, f, indent=2) - - def check_model_scores(results): failed_models = [] summary = " | model | score | threshold |\n" diff --git a/test/srt/test_nightly_gsm8k_eval.py b/test/srt/test_nightly_text_models_gsm8k_eval.py similarity index 58% rename from test/srt/test_nightly_gsm8k_eval.py rename to test/srt/test_nightly_text_models_gsm8k_eval.py index a6b3070e4..07c95952e 100644 --- a/test/srt/test_nightly_gsm8k_eval.py +++ b/test/srt/test_nightly_text_models_gsm8k_eval.py @@ -1,8 +1,6 @@ import json -import os import unittest import warnings -from datetime import datetime from types import SimpleNamespace from sglang.srt.utils import kill_process_tree @@ -14,9 +12,10 @@ from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, - is_in_ci, + check_evaluation_test_results, + parse_models, popen_launch_server, - write_github_step_summary, + write_results_to_json, ) MODEL_SCORE_THRESHOLDS = { @@ -25,11 +24,11 @@ MODEL_SCORE_THRESHOLDS = { "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85, "google/gemma-2-27b-it": 0.91, "meta-llama/Llama-3.1-70B-Instruct": 0.95, - "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64, + "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.62, "Qwen/Qwen2-57B-A14B-Instruct": 0.86, "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83, "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54, - "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.84, + "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.835, "zai-org/GLM-4.5-Air-FP8": 0.75, # The threshold of neuralmagic/gemma-2-2b-it-FP8 should be 0.6, but this model has some accuracy regression. # The fix is tracked at https://github.com/sgl-project/sglang/issues/4324, we set it to 0.50, for now, to make CI green. 
@@ -41,78 +40,6 @@ MODEL_SCORE_THRESHOLDS = { } -def parse_models(model_string): - return [model.strip() for model in model_string.split(",") if model.strip()] - - -def popen_launch_server_wrapper(base_url, model, is_tp2): - other_args = ["--log-level-http", "warning", "--trust-remote-code"] - if is_tp2: - other_args.extend(["--tp", "2"]) - - process = popen_launch_server( - model, - base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=other_args, - ) - return process - - -def write_results_to_json(model, metrics, mode="a"): - result = { - "timestamp": datetime.now().isoformat(), - "model": model, - "metrics": metrics, - "score": metrics["score"], - } - - existing_results = [] - if mode == "a" and os.path.exists("results.json"): - try: - with open("results.json", "r") as f: - existing_results = json.load(f) - except json.JSONDecodeError: - existing_results = [] - - if isinstance(existing_results, list): - existing_results.append(result) - else: - existing_results = [result] - - with open("results.json", "w") as f: - json.dump(existing_results, f, indent=2) - - -def check_model_scores(results): - failed_models = [] - summary = " | model | score | threshold |\n" - summary += "| ----- | ----- | --------- |\n" - - for model, score in results: - threshold = MODEL_SCORE_THRESHOLDS.get(model) - if threshold is None: - print(f"Warning: No threshold defined for model {model}") - continue - - if score < threshold: - failed_models.append( - f"\nScore Check Failed: {model}\n" - f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})" - ) - - line = f"| {model} | {score} | {threshold} |\n" - summary += line - - print(summary) - - if is_in_ci(): - write_github_step_summary(f"### TestNightlyGsm8KEval\n{summary}") - - if failed_models: - raise AssertionError("\n".join(failed_models)) - - # Do not use `CustomTestCase` since `test_mgsm_en_all_models` does not want retry class TestNightlyGsm8KEval(unittest.TestCase): @classmethod @@ -131,11 +58,17 @@ class TestNightlyGsm8KEval(unittest.TestCase): ) is_first = True all_results = [] - + model_count = 0 for model_group, is_fp8, is_tp2 in self.model_groups: for model in model_group: + model_count += 1 with self.subTest(model=model): - process = popen_launch_server_wrapper(self.base_url, model, is_tp2) + process = popen_launch_server( + model=model, + base_url=self.base_url, + other_args=["--tp", "2"] if is_tp2 else [], + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + ) args = SimpleNamespace( base_url=self.base_url, @@ -153,7 +86,8 @@ class TestNightlyGsm8KEval(unittest.TestCase): write_results_to_json(model, metrics, "w" if is_first else "a") is_first = False - all_results.append((model, metrics["score"])) + # 0.0 for empty latency + all_results.append((model, metrics["score"], 0.0)) kill_process_tree(process.pid) try: @@ -164,7 +98,12 @@ class TestNightlyGsm8KEval(unittest.TestCase): print(f"Error reading results.json: {e}") # Check all scores after collecting all results - check_model_scores(all_results) + check_evaluation_test_results( + all_results, + self.__class__.__name__, + model_accuracy_thresholds=MODEL_SCORE_THRESHOLDS, + model_count=model_count, + ) if __name__ == "__main__": diff --git a/test/srt/test_nightly_text_models_perf.py b/test/srt/test_nightly_text_models_perf.py new file mode 100644 index 000000000..a9ab6d003 --- /dev/null +++ b/test/srt/test_nightly_text_models_perf.py @@ -0,0 +1,135 @@ +import os +import subprocess +import time +import unittest + +from sglang.bench_one_batch_server import BenchmarkResult 
+from sglang.srt.utils import kill_process_tree +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + _parse_int_list_env, + is_in_ci, + parse_models, + popen_launch_server, + write_github_step_summary, +) + +PROFILE_DIR = "performance_profiles_text_models" + + +class TestNightlyTextModelsPerformance(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model_groups = [ + (parse_models("meta-llama/Llama-3.1-8B-Instruct"), False, False), + (parse_models("Qwen/Qwen2-57B-A14B-Instruct"), False, True), + # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False), + # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True), + # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False), + # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True), + ] + cls.base_url = DEFAULT_URL_FOR_TEST + cls.batch_sizes = [1, 1, 8, 16, 64] + cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096")) + cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512")) + os.makedirs(PROFILE_DIR, exist_ok=True) + cls.full_report = f"## {cls.__name__}\n" + BenchmarkResult.help_str() + + def test_bench_one_batch(self): + all_benchmark_results = [] + + for model_group, is_fp8, is_tp2 in self.model_groups: + for model in model_group: + benchmark_results = [] + with self.subTest(model=model): + process = popen_launch_server( + model=model, + base_url=self.base_url, + other_args=["--tp", "2"] if is_tp2 else [], + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + ) + try: + + profile_filename = ( + f"{model.replace('/', '_')}_{int(time.time())}" + ) + profile_path_prefix = os.path.join( + PROFILE_DIR, profile_filename + ) + json_output_file = ( + f"results_{model.replace('/', '_')}_{int(time.time())}.json" + ) + + command = [ + "python3", + "-m", + "sglang.bench_one_batch_server", + "--model", + model, + "--base-url", + self.base_url, + "--batch-size", + *[str(x) for x in self.batch_sizes], + "--input-len", + *[str(x) for x in self.input_lens], + "--output-len", + *[str(x) for x in self.output_lens], + "--show-report", + "--profile", + "--profile-by-stage", + "--profile-filename-prefix", + profile_path_prefix, + f"--output-path={json_output_file}", + "--no-append-to-github-summary", + ] + + print(f"Running command: {' '.join(command)}") + result = subprocess.run(command, capture_output=True, text=True) + + if result.returncode != 0: + print( + f"Error running benchmark for {model} with batch size:" + ) + print(result.stderr) + # Continue to next batch size even if one fails + continue + + # Load and deserialize JSON results + if os.path.exists(json_output_file): + import json + + with open(json_output_file, "r") as f: + json_data = json.load(f) + + # Convert JSON data to BenchmarkResult objects + for data in json_data: + benchmark_result = BenchmarkResult(**data) + all_benchmark_results.append(benchmark_result) + benchmark_results.append(benchmark_result) + + print( + f"Loaded {len(benchmark_results)} benchmark results from {json_output_file}" + ) + + # Clean up JSON file + os.remove(json_output_file) + else: + print( + f"Warning: JSON output file {json_output_file} not found" + ) + + finally: + kill_process_tree(process.pid) + + report_part = BenchmarkResult.generate_markdown_report( + PROFILE_DIR, benchmark_results + ) + self.full_report += report_part + "\n" + + if is_in_ci(): + write_github_step_summary(self.full_report) + + +if __name__ == "__main__": + unittest.main() diff --git 
a/test/srt/test_nightly_vlms_mmmu_eval.py b/test/srt/test_nightly_vlms_mmmu_eval.py new file mode 100644 index 000000000..dc12fa125 --- /dev/null +++ b/test/srt/test_nightly_vlms_mmmu_eval.py @@ -0,0 +1,117 @@ +import json +import unittest +import warnings +from types import SimpleNamespace + +from sglang.srt.utils import kill_process_tree +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + ModelDeploySetup, + ModelEvalMetrics, + check_evaluation_test_results, + popen_launch_server, + write_results_to_json, +) + +MODEL_THRESHOLDS = { + # Conservative thresholds on 100 MMMU samples, especially for latency thresholds + ModelDeploySetup("deepseek-ai/deepseek-vl2-small"): ModelEvalMetrics(0.330, 56.1), + ModelDeploySetup("deepseek-ai/Janus-Pro-7B"): ModelEvalMetrics(0.285, 39.9), + ModelDeploySetup("Efficient-Large-Model/NVILA-Lite-2B-hf-0626"): ModelEvalMetrics( + 0.305, 23.8 + ), + ModelDeploySetup("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9), + ModelDeploySetup("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3), + ModelDeploySetup("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 14.5), + ModelDeploySetup("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(0.330, 22.3), + ModelDeploySetup("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3), + ModelDeploySetup("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5), + ModelDeploySetup("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 14.0), + ModelDeploySetup("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3), + ModelDeploySetup("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9), + ModelDeploySetup("unsloth/Mistral-Small-3.1-24B-Instruct-2503"): ModelEvalMetrics( + 0.310, 16.7 + ), + ModelDeploySetup("XiaomiMiMo/MiMo-VL-7B-RL"): ModelEvalMetrics(0.28, 32.0), + ModelDeploySetup("zai-org/GLM-4.1V-9B-Thinking"): ModelEvalMetrics(0.280, 30.4), +} + + +class TestNightlyVLMMmmuEval(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.models = list(MODEL_THRESHOLDS.keys()) + cls.base_url = DEFAULT_URL_FOR_TEST + + def test_mmmu_vlm_models(self): + warnings.filterwarnings( + "ignore", category=ResourceWarning, message="unclosed.*socket" + ) + is_first = True + all_results = [] + + for model in self.models: + model_path = model.model_path + with self.subTest(model=model_path): + process = popen_launch_server( + model=model_path, + base_url=self.base_url, + other_args=model.extra_args, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + ) + try: + args = SimpleNamespace( + base_url=self.base_url, + model=model_path, + eval_name="mmmu", + num_examples=100, + num_threads=64, + max_tokens=30, + ) + + args.return_latency = True + + metrics, latency = run_eval(args) + + metrics["score"] = round(metrics["score"], 4) + metrics["latency"] = round(latency, 4) + print( + f"{'=' * 42}\n{model_path} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n" + ) + + write_results_to_json(model_path, metrics, "w" if is_first else "a") + is_first = False + + all_results.append( + (model_path, metrics["score"], metrics["latency"]) + ) + finally: + kill_process_tree(process.pid) + + try: + with open("results.json", "r") as f: + print("\nFinal Results from results.json:") + print(json.dumps(json.load(f), indent=2)) + except Exception as e: + print(f"Error reading results: {e}") + + model_accuracy_thresholds = { + model.model_path: threshold.accuracy + for model, threshold in MODEL_THRESHOLDS.items() + } + 
model_latency_thresholds = {
+            model.model_path: threshold.eval_time
+            for model, threshold in MODEL_THRESHOLDS.items()
+        }
+        check_evaluation_test_results(
+            all_results,
+            self.__class__.__name__,
+            model_accuracy_thresholds=model_accuracy_thresholds,
+            model_latency_thresholds=model_latency_thresholds,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_nightly_vlms_perf.py b/test/srt/test_nightly_vlms_perf.py
new file mode 100644
index 000000000..c4d10a56e
--- /dev/null
+++ b/test/srt/test_nightly_vlms_perf.py
@@ -0,0 +1,135 @@
+import os
+import subprocess
+import unittest
+import warnings
+
+from sglang.bench_one_batch_server import BenchmarkResult
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    _parse_int_list_env,
+    is_in_ci,
+    parse_models,
+    popen_launch_server,
+    write_github_step_summary,
+)
+
+PROFILE_DIR = "performance_profiles_vlms"
+
+MODEL_DEFAULTS = [
+    # Keep conservative defaults. Can be overridden by env NIGHTLY_VLM_MODELS
+    "Qwen/Qwen2.5-VL-7B-Instruct",
+    "google/gemma-3-27b-it",
+    # "OpenGVLab/InternVL2_5-2B",
+    # buggy in official transformers impl
+    # "openbmb/MiniCPM-V-2_6",
+]
+
+
+class TestNightlyVLMModelsPerformance(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        warnings.filterwarnings(
+            "ignore", category=ResourceWarning, message="unclosed.*socket"
+        )
+        cls.models = parse_models(
+            os.environ.get("NIGHTLY_VLM_MODELS", ",".join(MODEL_DEFAULTS))
+        )
+        cls.base_url = DEFAULT_URL_FOR_TEST
+
+        cls.batch_sizes = _parse_int_list_env("NIGHTLY_VLM_BATCH_SIZES", "1,1,2,8,16")
+        cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_VLM_INPUT_LENS", "4096"))
+        cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_VLM_OUTPUT_LENS", "512"))
+        cls.full_report = f"## {cls.__name__}\n" + BenchmarkResult.help_str()
+
+    def test_bench_one_batch(self):
+        all_benchmark_results = []
+
+        for model in self.models:
+            benchmark_results = []
+            with self.subTest(model=model):
+                process = popen_launch_server(
+                    model=model,
+                    base_url=self.base_url,
+                    other_args=["--mem-fraction-static=0.7"],
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                )
+                try:
+                    # Run bench_one_batch_server against the launched server
+                    profile_filename = f"{model.replace('/', '_')}"
+                    # Profile output path prefix for this model
+                    profile_path_prefix = os.path.join(PROFILE_DIR, profile_filename)
+
+                    # JSON output file for this model
+                    json_output_file = f"results_{model.replace('/', '_')}.json"
+
+                    command = [
+                        "python3",
+                        "-m",
+                        "sglang.bench_one_batch_server",
+                        f"--model={model}",
+                        "--base-url",
+                        self.base_url,
+                        "--batch-size",
+                        *[str(x) for x in self.batch_sizes],
+                        "--input-len",
+                        *[str(x) for x in self.input_lens],
+                        "--output-len",
+                        *[str(x) for x in self.output_lens],
+                        "--trust-remote-code",
+                        "--dataset-name=mmmu",
+                        "--profile",
+                        "--profile-by-stage",
+                        f"--profile-filename-prefix={profile_path_prefix}",
+                        "--show-report",
+                        f"--output-path={json_output_file}",
+                        "--no-append-to-github-summary",
+                    ]
+
+                    print(f"Running command: {' '.join(command)}")
+                    result = subprocess.run(command, capture_output=True, text=True)
+
+                    if result.returncode != 0:
+                        print(f"Error running benchmark for {model}:")
+                        print(result.stderr)
+                        # Continue to the next model even if one benchmark fails
+                        continue
+
+                    print(f"Output for {model}:")
+                    print(result.stdout)
+
+                    # Load and deserialize JSON results
+                    if os.path.exists(json_output_file):
+                        import json
+
+                        with 
open(json_output_file, "r") as f: + json_data = json.load(f) + + # Convert JSON data to BenchmarkResult objects + for data in json_data: + benchmark_result = BenchmarkResult(**data) + all_benchmark_results.append(benchmark_result) + benchmark_results.append(benchmark_result) + + print( + f"Loaded {len(benchmark_results)} benchmark results from {json_output_file}" + ) + + else: + print(f"Warning: JSON output file {json_output_file} not found") + + finally: + kill_process_tree(process.pid) + + report_part = BenchmarkResult.generate_markdown_report( + PROFILE_DIR, benchmark_results + ) + self.full_report += report_part + "\n" + + if is_in_ci(): + write_github_step_summary(self.full_report) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/test_vllm_dependency.py b/test/srt/test_vllm_dependency.py index b4451f369..918f3ee6c 100644 --- a/test/srt/test_vllm_dependency.py +++ b/test/srt/test_vllm_dependency.py @@ -14,6 +14,7 @@ from sglang.test.test_utils import ( is_in_ci, popen_launch_server, write_github_step_summary, + write_results_to_json, ) MODEL_SCORE_THRESHOLDS = { @@ -52,31 +53,6 @@ def popen_launch_server_wrapper(base_url, model, is_fp8, is_tp2): return process -def write_results_to_json(model, metrics, mode="a"): - result = { - "timestamp": datetime.now().isoformat(), - "model": model, - "metrics": metrics, - "score": metrics["score"], - } - - existing_results = [] - if mode == "a" and os.path.exists("results.json"): - try: - with open("results.json", "r") as f: - existing_results = json.load(f) - except json.JSONDecodeError: - existing_results = [] - - if isinstance(existing_results, list): - existing_results.append(result) - else: - existing_results = [result] - - with open("results.json", "w") as f: - json.dump(existing_results, f, indent=2) - - def check_model_scores(results): failed_models = [] summary = " | model | score | threshold |\n"