ci: refactor nightly test (#10495)
This commit is contained in:
82
.github/workflows/nightly-test.yml
vendored
82
.github/workflows/nightly-test.yml
vendored
@@ -15,8 +15,8 @@ concurrency:
|
|||||||
cancel-in-progress: true
|
cancel-in-progress: true
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
nightly-test:
|
nightly-test-eval-text-models:
|
||||||
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
|
if: github.repository == 'sgl-project/sglang'
|
||||||
runs-on: 2-gpu-runner
|
runs-on: 2-gpu-runner
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
@@ -26,8 +26,82 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
bash scripts/ci/ci_install_dependency.sh
|
bash scripts/ci/ci_install_dependency.sh
|
||||||
|
|
||||||
- name: Run test
|
- name: Run eval test for text models
|
||||||
timeout-minutes: 120
|
timeout-minutes: 120
|
||||||
run: |
|
run: |
|
||||||
cd test/srt
|
cd test/srt
|
||||||
python3 run_suite.py --suite nightly --timeout-per-file 3600
|
python3 test_nightly_text_models_gsm8k_eval.py
|
||||||
|
|
||||||
|
nightly-test-perf-text-models:
|
||||||
|
if: github.repository == 'sgl-project/sglang'
|
||||||
|
runs-on: 2-gpu-runner
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
bash scripts/ci/ci_install_dependency.sh
|
||||||
|
|
||||||
|
- name: Run performance test for text models
|
||||||
|
timeout-minutes: 180
|
||||||
|
env:
|
||||||
|
TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }}
|
||||||
|
PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }}
|
||||||
|
run: |
|
||||||
|
rm -rf test/srt/performance_profiles_text_models/
|
||||||
|
python3 test/srt/test_nightly_text_models_perf.py
|
||||||
|
|
||||||
|
- name: Publish traces to storage repo
|
||||||
|
env:
|
||||||
|
GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI }}
|
||||||
|
GITHUB_RUN_ID: ${{ github.run_id }}
|
||||||
|
GITHUB_RUN_NUMBER: ${{ github.run_number }}
|
||||||
|
run: |
|
||||||
|
python3 scripts/ci/publish_traces.py
|
||||||
|
|
||||||
|
nightly-test-eval-vlms:
|
||||||
|
if: github.repository == 'sgl-project/sglang'
|
||||||
|
runs-on: 1-gpu-runner
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
bash scripts/ci/ci_install_dependency.sh
|
||||||
|
|
||||||
|
- name: Run eval test for VLM models (fixed MMMU-100)
|
||||||
|
timeout-minutes: 240
|
||||||
|
run: |
|
||||||
|
cd test/srt
|
||||||
|
python3 test_nightly_vlms_mmmu_eval.py
|
||||||
|
|
||||||
|
|
||||||
|
nightly-test-perf-vlms:
|
||||||
|
if: github.repository == 'sgl-project/sglang'
|
||||||
|
runs-on: 1-gpu-runner
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
bash scripts/ci/ci_install_dependency.sh
|
||||||
|
|
||||||
|
- name: Run perf test for VLM models (MMMU)
|
||||||
|
timeout-minutes: 240
|
||||||
|
env:
|
||||||
|
TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }}
|
||||||
|
PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }}
|
||||||
|
run: |
|
||||||
|
rm -rf test/srt/performance_profiles_vlms/
|
||||||
|
python3 test/srt/test_nightly_vlms_perf.py
|
||||||
|
|
||||||
|
- name: Publish traces to storage repo
|
||||||
|
env:
|
||||||
|
GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI }}
|
||||||
|
GITHUB_RUN_ID: ${{ github.run_id }}
|
||||||
|
GITHUB_RUN_NUMBER: ${{ github.run_number }}
|
||||||
|
run: |
|
||||||
|
python3 scripts/ci/publish_traces.py --vlm
|
||||||
|
|||||||
@@ -443,11 +443,9 @@ def latency_test_run_once(
|
|||||||
|
|
||||||
if profile:
|
if profile:
|
||||||
profiler.stop()
|
profiler.stop()
|
||||||
profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_prefill.trace.json.gz"
|
trace_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_prefill.trace.json.gz"
|
||||||
_save_profile_trace_results(profiler, profile_filename)
|
_save_profile_trace_results(profiler, trace_filename)
|
||||||
rank_print(
|
rank_print(f"torch profiler chrome trace for prefill saved to {trace_filename}")
|
||||||
f"torch profiler chrome trace for prefill saved to {profile_filename}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Decode
|
# Decode
|
||||||
decode_latencies = []
|
decode_latencies = []
|
||||||
@@ -479,10 +477,10 @@ def latency_test_run_once(
|
|||||||
|
|
||||||
if profile and i == output_len / 2:
|
if profile and i == output_len / 2:
|
||||||
profiler.stop()
|
profiler.stop()
|
||||||
profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_decode.trace.json.gz"
|
trace_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_decode.trace.json.gz"
|
||||||
_save_profile_trace_results(profiler, profile_filename)
|
_save_profile_trace_results(profiler, trace_filename)
|
||||||
rank_print(
|
rank_print(
|
||||||
f"torch profiler chrome trace for decoding 1 token saved to {profile_filename}"
|
f"torch profiler chrome trace for decoding 1 token saved to {trace_filename}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Record decode timing from 2nd output
|
# Record decode timing from 2nd output
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --
|
|||||||
|
|
||||||
python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
|
python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
|
||||||
python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --show-report --profile --profile-by-stage
|
python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --show-report --profile --profile-by-stage
|
||||||
|
python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --output-path results.json --profile
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
@@ -19,12 +20,17 @@ import multiprocessing
|
|||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
from typing import List, Tuple
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import requests
|
import requests
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from sglang.bench_serving import get_tokenizer, sample_random_requests
|
from sglang.bench_serving import (
|
||||||
|
get_tokenizer,
|
||||||
|
sample_mmmu_requests,
|
||||||
|
sample_random_requests,
|
||||||
|
)
|
||||||
from sglang.profiler import run_profile
|
from sglang.profiler import run_profile
|
||||||
from sglang.srt.entrypoints.http_server import launch_server
|
from sglang.srt.entrypoints.http_server import launch_server
|
||||||
from sglang.srt.server_args import ServerArgs
|
from sglang.srt.server_args import ServerArgs
|
||||||
@@ -32,6 +38,109 @@ from sglang.srt.utils import is_blackwell, kill_process_tree
|
|||||||
from sglang.test.test_utils import is_in_ci, write_github_step_summary
|
from sglang.test.test_utils import is_in_ci, write_github_step_summary
|
||||||
|
|
||||||
|
|
||||||
|
class ProfileLinks(BaseModel):
|
||||||
|
"""Pydantic model for profile trace links."""
|
||||||
|
|
||||||
|
extend: Optional[str] = None
|
||||||
|
decode: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
class BenchmarkResult(BaseModel):
|
||||||
|
"""Pydantic model for benchmark results table data, for a single isl and osl"""
|
||||||
|
|
||||||
|
model_path: str
|
||||||
|
run_name: str
|
||||||
|
batch_size: int
|
||||||
|
input_len: int
|
||||||
|
output_len: int
|
||||||
|
latency: float
|
||||||
|
ttft: float
|
||||||
|
input_throughput: float
|
||||||
|
output_throughput: float
|
||||||
|
overall_throughput: float
|
||||||
|
last_gen_throughput: float
|
||||||
|
acc_length: Optional[float] = None
|
||||||
|
profile_links: Optional[ProfileLinks] = None
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def help_str() -> str:
|
||||||
|
return f"""
|
||||||
|
Note: To view the traces through perfetto-ui, please:
|
||||||
|
1. use Google Chrome
|
||||||
|
2. enable popup
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def to_markdown_row(
|
||||||
|
self, trace_dir, base_url: str = "", relay_base: str = ""
|
||||||
|
) -> str:
|
||||||
|
"""Convert this benchmark result to a markdown table row."""
|
||||||
|
# Calculate costs (assuming H100 pricing for now)
|
||||||
|
hourly_cost_per_gpu = 2 # $2/hour for one H100
|
||||||
|
hourly_cost = hourly_cost_per_gpu * 1 # Assuming tp_size = 1 for simplicity
|
||||||
|
input_util = 0.7
|
||||||
|
accept_length = (
|
||||||
|
round(self.acc_length, 2) if self.acc_length is not None else "n/a"
|
||||||
|
)
|
||||||
|
itl = 1 / (self.output_throughput / self.batch_size) * 1000
|
||||||
|
input_cost = 1e6 / (self.input_throughput * input_util) / 3600 * hourly_cost
|
||||||
|
output_cost = 1e6 / self.output_throughput / 3600 * hourly_cost
|
||||||
|
|
||||||
|
def get_perfetto_relay_link_from_trace_file(trace_file: str):
|
||||||
|
import os
|
||||||
|
from urllib.parse import quote
|
||||||
|
|
||||||
|
rel_path = os.path.relpath(trace_file, trace_dir)
|
||||||
|
raw_file_link = f"{base_url}/{rel_path}"
|
||||||
|
relay_link = (
|
||||||
|
f"{relay_base}?src={quote(raw_file_link, safe='')}"
|
||||||
|
if relay_base and quote
|
||||||
|
else raw_file_link
|
||||||
|
)
|
||||||
|
return relay_link
|
||||||
|
|
||||||
|
# Handle profile links
|
||||||
|
profile_link = "NA | NA"
|
||||||
|
if self.profile_links:
|
||||||
|
if self.profile_links.extend or self.profile_links.decode:
|
||||||
|
# Create a combined link or use the first available one
|
||||||
|
trace_files = [self.profile_links.extend, self.profile_links.decode]
|
||||||
|
trace_files_relay_links = [
|
||||||
|
f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
|
||||||
|
for trace_file in trace_files
|
||||||
|
]
|
||||||
|
|
||||||
|
profile_link = " | ".join(trace_files_relay_links)
|
||||||
|
|
||||||
|
# Build the row
|
||||||
|
return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def generate_markdown_report(
|
||||||
|
cls, trace_dir, results: List["BenchmarkResult"]
|
||||||
|
) -> str:
|
||||||
|
"""Generate a markdown report from a list of BenchmarkResult object from a single run."""
|
||||||
|
import os
|
||||||
|
|
||||||
|
summary = f"### {results[0].model_path}\n"
|
||||||
|
|
||||||
|
# summary += (
|
||||||
|
# f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
|
||||||
|
# )
|
||||||
|
summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
|
||||||
|
summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
|
||||||
|
|
||||||
|
# all results should share the same isl & osl
|
||||||
|
for result in results:
|
||||||
|
base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
|
||||||
|
relay_base = os.getenv("PERFETTO_RELAY_URL", "").rstrip("/")
|
||||||
|
relay_base = "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html"
|
||||||
|
# base_url = "https://github.com/sgl-project/ci-data/traces"
|
||||||
|
summary += result.to_markdown_row(trace_dir, base_url, relay_base)
|
||||||
|
|
||||||
|
return summary
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass
|
@dataclasses.dataclass
|
||||||
class BenchArgs:
|
class BenchArgs:
|
||||||
run_name: str = "default"
|
run_name: str = "default"
|
||||||
@@ -50,8 +159,12 @@ class BenchArgs:
|
|||||||
profile: bool = False
|
profile: bool = False
|
||||||
profile_steps: int = 3
|
profile_steps: int = 3
|
||||||
profile_by_stage: bool = False
|
profile_by_stage: bool = False
|
||||||
|
profile_filename_prefix: str = None
|
||||||
|
append_to_github_summary: bool = True
|
||||||
dataset_path: str = ""
|
dataset_path: str = ""
|
||||||
parallel_batch: bool = False
|
parallel_batch: bool = False
|
||||||
|
dataset_name: str = "random"
|
||||||
|
output_path: Optional[str] = None
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def add_cli_args(parser: argparse.ArgumentParser):
|
def add_cli_args(parser: argparse.ArgumentParser):
|
||||||
@@ -67,6 +180,13 @@ class BenchArgs:
|
|||||||
"--output-len", type=int, nargs="+", default=BenchArgs.output_len
|
"--output-len", type=int, nargs="+", default=BenchArgs.output_len
|
||||||
)
|
)
|
||||||
parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
|
parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
|
||||||
|
parser.add_argument(
|
||||||
|
"--dataset-name",
|
||||||
|
type=str,
|
||||||
|
default=BenchArgs.dataset_name,
|
||||||
|
choices=["mmmu", "random"],
|
||||||
|
help="Name of the dataset to benchmark on.",
|
||||||
|
)
|
||||||
parser.add_argument("--return-logprob", action="store_true")
|
parser.add_argument("--return-logprob", action="store_true")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--client-stream-interval",
|
"--client-stream-interval",
|
||||||
@@ -96,14 +216,36 @@ class BenchArgs:
|
|||||||
help="Path to the dataset.",
|
help="Path to the dataset.",
|
||||||
)
|
)
|
||||||
parser.add_argument("--parallel-batch", action="store_true")
|
parser.add_argument("--parallel-batch", action="store_true")
|
||||||
|
parser.add_argument(
|
||||||
|
"--profile-filename-prefix",
|
||||||
|
type=str,
|
||||||
|
default=BenchArgs.profile_filename_prefix,
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--no-append-to-github-summary",
|
||||||
|
action="store_false",
|
||||||
|
dest="append_to_github_summary",
|
||||||
|
help="Disable appending the output of this run to github ci summary",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output-path",
|
||||||
|
type=str,
|
||||||
|
default=BenchArgs.output_path,
|
||||||
|
help="Path to save benchmark results as JSON format. If not specified, results will only be saved to result-filename.",
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_cli_args(cls, args: argparse.Namespace):
|
def from_cli_args(cls, args: argparse.Namespace):
|
||||||
# use the default value's type to cast the args into correct types.
|
# use the default value's type to cast the args into correct types.
|
||||||
attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
|
attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
|
||||||
return cls(
|
kwargs = {}
|
||||||
**{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
|
for attr, attr_type in attrs:
|
||||||
)
|
val = getattr(args, attr)
|
||||||
|
if attr_type is type(None):
|
||||||
|
kwargs[attr] = val
|
||||||
|
else:
|
||||||
|
kwargs[attr] = attr_type(val)
|
||||||
|
return cls(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
def launch_server_internal(server_args):
|
def launch_server_internal(server_args):
|
||||||
@@ -148,23 +290,35 @@ def run_one_case(
|
|||||||
run_name: str,
|
run_name: str,
|
||||||
result_filename: str,
|
result_filename: str,
|
||||||
tokenizer,
|
tokenizer,
|
||||||
|
dataset_name="",
|
||||||
profile: bool = False,
|
profile: bool = False,
|
||||||
profile_steps: int = 3,
|
profile_steps: int = 3,
|
||||||
profile_by_stage: bool = False,
|
profile_by_stage: bool = False,
|
||||||
|
profile_filename_prefix: str = None,
|
||||||
dataset_path: str = "",
|
dataset_path: str = "",
|
||||||
parallel_batch: bool = False,
|
parallel_batch: bool = False,
|
||||||
):
|
):
|
||||||
requests.post(url + "/flush_cache")
|
requests.post(url + "/flush_cache")
|
||||||
input_requests = sample_random_requests(
|
# TODO: reuse bench_serving.get_dataset ?
|
||||||
input_len=input_len,
|
if dataset_name == "mmmu":
|
||||||
output_len=output_len,
|
input_requests = sample_mmmu_requests(
|
||||||
num_prompts=batch_size,
|
num_requests=batch_size,
|
||||||
range_ratio=1.0,
|
tokenizer=tokenizer,
|
||||||
tokenizer=tokenizer,
|
fixed_output_len=output_len,
|
||||||
dataset_path=dataset_path,
|
apply_chat_template=True,
|
||||||
random_sample=True,
|
random_sample=False,
|
||||||
return_text=False,
|
)
|
||||||
)
|
elif dataset_name == "random":
|
||||||
|
input_requests = sample_random_requests(
|
||||||
|
input_len=input_len,
|
||||||
|
output_len=output_len,
|
||||||
|
num_prompts=batch_size,
|
||||||
|
range_ratio=1.0,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
dataset_path=dataset_path,
|
||||||
|
random_sample=True,
|
||||||
|
return_text=False,
|
||||||
|
)
|
||||||
|
|
||||||
use_structured_outputs = False
|
use_structured_outputs = False
|
||||||
if use_structured_outputs:
|
if use_structured_outputs:
|
||||||
@@ -181,26 +335,48 @@ def run_one_case(
|
|||||||
|
|
||||||
profile_link = None
|
profile_link = None
|
||||||
if profile:
|
if profile:
|
||||||
|
output_dir, profile_name = None, None
|
||||||
|
if profile_filename_prefix:
|
||||||
|
output_dir = os.path.dirname(profile_filename_prefix)
|
||||||
|
profile_name = os.path.basename(profile_filename_prefix)
|
||||||
profile_link: str = run_profile(
|
profile_link: str = run_profile(
|
||||||
url, profile_steps, ["CPU", "GPU"], None, None, profile_by_stage
|
url,
|
||||||
|
profile_steps,
|
||||||
|
["CPU", "GPU"],
|
||||||
|
output_dir,
|
||||||
|
profile_name,
|
||||||
|
profile_by_stage,
|
||||||
)
|
)
|
||||||
|
|
||||||
tic = time.perf_counter()
|
tic = time.perf_counter()
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"sampling_params": {
|
||||||
|
"temperature": temperature,
|
||||||
|
"max_new_tokens": output_len,
|
||||||
|
"ignore_eos": True,
|
||||||
|
"json_schema": json_schema,
|
||||||
|
"stream_interval": stream_interval,
|
||||||
|
},
|
||||||
|
"return_logprob": return_logprob,
|
||||||
|
"stream": True,
|
||||||
|
**({"parallel_batch": parallel_batch} if parallel_batch else {}),
|
||||||
|
}
|
||||||
|
if dataset_name == "mmmu":
|
||||||
|
# vlm
|
||||||
|
input_ids = []
|
||||||
|
for input_req in input_requests:
|
||||||
|
input_ids += [tokenizer.encode(input_req.prompt)]
|
||||||
|
payload["image_data"] = [req.image_data for req in input_requests]
|
||||||
|
|
||||||
|
else:
|
||||||
|
input_ids = [req.prompt for req in input_requests]
|
||||||
|
|
||||||
|
payload["input_ids"] = input_ids
|
||||||
|
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
url + "/generate",
|
url + "/generate",
|
||||||
json={
|
json=payload,
|
||||||
"input_ids": [req.prompt for req in input_requests],
|
|
||||||
"sampling_params": {
|
|
||||||
"temperature": temperature,
|
|
||||||
"max_new_tokens": output_len,
|
|
||||||
"ignore_eos": True,
|
|
||||||
"json_schema": json_schema,
|
|
||||||
"stream_interval": stream_interval,
|
|
||||||
},
|
|
||||||
"return_logprob": return_logprob,
|
|
||||||
"stream": True,
|
|
||||||
**({"parallel_batch": parallel_batch} if parallel_batch else {}),
|
|
||||||
},
|
|
||||||
stream=True,
|
stream=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -264,10 +440,100 @@ def run_one_case(
|
|||||||
overall_throughput,
|
overall_throughput,
|
||||||
last_gen_throughput,
|
last_gen_throughput,
|
||||||
acc_length,
|
acc_length,
|
||||||
profile_link if profile else None,
|
profile_link,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def save_results_as_json(result: List[Tuple], bench_args: BenchArgs, model: str):
|
||||||
|
"""Save benchmark results as JSON using Pydantic models."""
|
||||||
|
json_results = []
|
||||||
|
|
||||||
|
# Generate all parameter combinations to match with results
|
||||||
|
param_combinations = list(
|
||||||
|
itertools.product(
|
||||||
|
bench_args.batch_size, bench_args.input_len, bench_args.output_len
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
for i, (
|
||||||
|
batch_size,
|
||||||
|
latency,
|
||||||
|
ttft,
|
||||||
|
input_throughput,
|
||||||
|
output_throughput,
|
||||||
|
overall_throughput,
|
||||||
|
last_gen_throughput,
|
||||||
|
acc_length,
|
||||||
|
profile_link,
|
||||||
|
) in enumerate(result):
|
||||||
|
# Get the corresponding parameters for this result
|
||||||
|
bs, input_len, output_len = param_combinations[i]
|
||||||
|
|
||||||
|
# Parse profile links if available
|
||||||
|
profile_links = None
|
||||||
|
if profile_link:
|
||||||
|
profile_links = parse_profile_links(
|
||||||
|
profile_link, batch_size, input_len, output_len
|
||||||
|
)
|
||||||
|
|
||||||
|
benchmark_result = BenchmarkResult(
|
||||||
|
model_path=model,
|
||||||
|
run_name=bench_args.run_name,
|
||||||
|
batch_size=batch_size,
|
||||||
|
input_len=input_len,
|
||||||
|
output_len=output_len,
|
||||||
|
latency=latency,
|
||||||
|
ttft=ttft,
|
||||||
|
input_throughput=input_throughput,
|
||||||
|
output_throughput=output_throughput,
|
||||||
|
overall_throughput=overall_throughput,
|
||||||
|
last_gen_throughput=last_gen_throughput,
|
||||||
|
acc_length=acc_length,
|
||||||
|
profile_links=profile_links,
|
||||||
|
)
|
||||||
|
json_results.append(benchmark_result.model_dump())
|
||||||
|
|
||||||
|
# Save to JSON file
|
||||||
|
with open(bench_args.output_path, "w", encoding="utf-8") as f:
|
||||||
|
json.dump(json_results, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
|
print(f"Results saved as JSON to {bench_args.output_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def parse_profile_links(
|
||||||
|
profile_dir: str, batch_size: int, input_len: int, output_len: int
|
||||||
|
) -> Optional[ProfileLinks]:
|
||||||
|
"""Parse profile directory to extract extend and decode trace file links."""
|
||||||
|
if not profile_dir or not os.path.exists(profile_dir):
|
||||||
|
return None
|
||||||
|
|
||||||
|
extend_link = None
|
||||||
|
decode_link = None
|
||||||
|
|
||||||
|
# Look for extend/prefill trace files
|
||||||
|
for file in os.listdir(profile_dir):
|
||||||
|
if file.endswith(".trace.json.gz") or file.endswith(".trace.json"):
|
||||||
|
if "extend" in file.lower() or "prefill" in file.lower():
|
||||||
|
extend_link = os.path.join(profile_dir, file)
|
||||||
|
elif "decode" in file.lower():
|
||||||
|
decode_link = os.path.join(profile_dir, file)
|
||||||
|
|
||||||
|
# If no specific extend/decode files found, try to find files with batch/input/output info
|
||||||
|
if not extend_link or not decode_link:
|
||||||
|
for file in os.listdir(profile_dir):
|
||||||
|
if file.endswith(".trace.json.gz") or file.endswith(".trace.json"):
|
||||||
|
if f"_batch{batch_size}_input{input_len}_output{output_len}_" in file:
|
||||||
|
if "prefill" in file.lower() or "extend" in file.lower():
|
||||||
|
extend_link = os.path.join(profile_dir, file)
|
||||||
|
elif "decode" in file.lower():
|
||||||
|
decode_link = os.path.join(profile_dir, file)
|
||||||
|
|
||||||
|
if extend_link or decode_link:
|
||||||
|
return ProfileLinks(extend=extend_link, decode=decode_link)
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def get_report_summary(
|
def get_report_summary(
|
||||||
result: List[Tuple], server_args: ServerArgs, bench_args: BenchArgs
|
result: List[Tuple], server_args: ServerArgs, bench_args: BenchArgs
|
||||||
):
|
):
|
||||||
@@ -358,6 +624,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
|
|||||||
return_logprob=bench_args.return_logprob,
|
return_logprob=bench_args.return_logprob,
|
||||||
stream_interval=bench_args.client_stream_interval,
|
stream_interval=bench_args.client_stream_interval,
|
||||||
input_len_step_percentage=bench_args.input_len_step_percentage,
|
input_len_step_percentage=bench_args.input_len_step_percentage,
|
||||||
|
dataset_name=bench_args.dataset_name,
|
||||||
run_name="",
|
run_name="",
|
||||||
result_filename="",
|
result_filename="",
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
@@ -384,10 +651,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
|
|||||||
stream_interval=bench_args.client_stream_interval,
|
stream_interval=bench_args.client_stream_interval,
|
||||||
input_len_step_percentage=bench_args.input_len_step_percentage,
|
input_len_step_percentage=bench_args.input_len_step_percentage,
|
||||||
run_name=bench_args.run_name,
|
run_name=bench_args.run_name,
|
||||||
|
dataset_name=bench_args.dataset_name,
|
||||||
result_filename=bench_args.result_filename,
|
result_filename=bench_args.result_filename,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
dataset_path=bench_args.dataset_path,
|
dataset_path=bench_args.dataset_path,
|
||||||
parallel_batch=bench_args.parallel_batch,
|
parallel_batch=bench_args.parallel_batch,
|
||||||
|
profile_filename_prefix=bench_args.profile_filename_prefix,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -410,11 +679,13 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
|
|||||||
run_name=bench_args.run_name,
|
run_name=bench_args.run_name,
|
||||||
result_filename=bench_args.result_filename,
|
result_filename=bench_args.result_filename,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
|
dataset_name=bench_args.dataset_name,
|
||||||
profile=bench_args.profile,
|
profile=bench_args.profile,
|
||||||
profile_steps=bench_args.profile_steps,
|
profile_steps=bench_args.profile_steps,
|
||||||
profile_by_stage=bench_args.profile_by_stage,
|
profile_by_stage=bench_args.profile_by_stage,
|
||||||
dataset_path=bench_args.dataset_path,
|
dataset_path=bench_args.dataset_path,
|
||||||
parallel_batch=bench_args.parallel_batch,
|
parallel_batch=bench_args.parallel_batch,
|
||||||
|
profile_filename_prefix=bench_args.profile_filename_prefix,
|
||||||
)[-1],
|
)[-1],
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@@ -427,13 +698,16 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
|
|||||||
|
|
||||||
print(f"\nResults are saved to {bench_args.result_filename}")
|
print(f"\nResults are saved to {bench_args.result_filename}")
|
||||||
|
|
||||||
|
# Save results as JSON if output_path is specified
|
||||||
|
if bench_args.output_path:
|
||||||
|
save_results_as_json(result, bench_args, model=server_args.model_path)
|
||||||
|
|
||||||
if not bench_args.show_report:
|
if not bench_args.show_report:
|
||||||
return
|
return
|
||||||
|
|
||||||
summary = get_report_summary(result, server_args, bench_args)
|
summary = get_report_summary(result, server_args, bench_args)
|
||||||
print(summary)
|
|
||||||
|
|
||||||
if is_in_ci():
|
if is_in_ci() and bench_args.append_to_github_summary:
|
||||||
write_github_step_summary(summary)
|
write_github_step_summary(summary)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -208,6 +208,10 @@ async def async_request_openai_completions(
|
|||||||
"ignore_eos": not args.disable_ignore_eos,
|
"ignore_eos": not args.disable_ignore_eos,
|
||||||
**request_func_input.extra_request_body,
|
**request_func_input.extra_request_body,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if request_func_input.image_data:
|
||||||
|
payload.update({"image_data": request_func_input.image_data})
|
||||||
|
|
||||||
headers = get_auth_headers()
|
headers = get_auth_headers()
|
||||||
|
|
||||||
output = RequestFuncOutput.init_new(request_func_input)
|
output = RequestFuncOutput.init_new(request_func_input)
|
||||||
@@ -664,7 +668,7 @@ def get_dataset(args, tokenizer):
|
|||||||
num_prompts=args.num_prompts,
|
num_prompts=args.num_prompts,
|
||||||
range_ratio=args.random_range_ratio,
|
range_ratio=args.random_range_ratio,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
dataset_path=args.dataset_path,
|
dataset_path=args.dataset_name,
|
||||||
random_sample=args.dataset_name == "random",
|
random_sample=args.dataset_name == "random",
|
||||||
return_text=not tokenize_prompt,
|
return_text=not tokenize_prompt,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -97,7 +97,7 @@ class SchedulerProfilerMixin:
|
|||||||
def start_profile(
|
def start_profile(
|
||||||
self, stage: Optional[ForwardMode] = None
|
self, stage: Optional[ForwardMode] = None
|
||||||
) -> ProfileReqOutput | None:
|
) -> ProfileReqOutput | None:
|
||||||
stage_str = f" for {stage.__str__()}" if stage else ""
|
stage_str = f" for {stage.name}" if stage else ""
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Profiling starts{stage_str}. Traces will be saved to: {self.torch_profiler_output_dir} (with profile id: {self.profile_id})",
|
f"Profiling starts{stage_str}. Traces will be saved to: {self.torch_profiler_output_dir} (with profile id: {self.profile_id})",
|
||||||
)
|
)
|
||||||
@@ -181,7 +181,7 @@ class SchedulerProfilerMixin:
|
|||||||
if not Path(self.torch_profiler_output_dir).exists():
|
if not Path(self.torch_profiler_output_dir).exists():
|
||||||
Path(self.torch_profiler_output_dir).mkdir(parents=True, exist_ok=True)
|
Path(self.torch_profiler_output_dir).mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
stage_suffix = f"-{stage.__str__()}" if stage else ""
|
stage_suffix = f"-{stage.name}" if stage else ""
|
||||||
logger.info("Stop profiling" + stage_suffix + "...")
|
logger.info("Stop profiling" + stage_suffix + "...")
|
||||||
if self.torch_profiler is not None:
|
if self.torch_profiler is not None:
|
||||||
self.torch_profiler.stop()
|
self.torch_profiler.stop()
|
||||||
@@ -247,7 +247,7 @@ class SchedulerProfilerMixin:
|
|||||||
if self.profiler_decode_ct == 0:
|
if self.profiler_decode_ct == 0:
|
||||||
if self.profile_in_progress:
|
if self.profile_in_progress:
|
||||||
# force trace flush
|
# force trace flush
|
||||||
self.stop_profile(ForwardMode.EXTEND)
|
self.stop_profile(stage=ForwardMode.EXTEND)
|
||||||
self.start_profile(batch.forward_mode)
|
self.start_profile(batch.forward_mode)
|
||||||
self.profiler_decode_ct += 1
|
self.profiler_decode_ct += 1
|
||||||
if self.profiler_decode_ct > self.profiler_target_decode_ct:
|
if self.profiler_decode_ct > self.profiler_target_decode_ct:
|
||||||
|
|||||||
@@ -60,6 +60,11 @@ def run_eval(args):
|
|||||||
from sglang.test.simple_eval_humaneval import HumanEval
|
from sglang.test.simple_eval_humaneval import HumanEval
|
||||||
|
|
||||||
eval_obj = HumanEval(args.num_examples, args.num_threads)
|
eval_obj = HumanEval(args.num_examples, args.num_threads)
|
||||||
|
elif args.eval_name == "mmmu":
|
||||||
|
# VLM MMMU evaluation with fixed 100 examples by default
|
||||||
|
from sglang.test.simple_eval_mmmu_vlm import MMMUVLMEval
|
||||||
|
|
||||||
|
eval_obj = MMMUVLMEval(args.num_examples, args.num_threads)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid eval name: {args.eval_name}")
|
raise ValueError(f"Invalid eval name: {args.eval_name}")
|
||||||
|
|
||||||
@@ -94,6 +99,8 @@ def run_eval(args):
|
|||||||
print(f"Total latency: {latency:.3f} s")
|
print(f"Total latency: {latency:.3f} s")
|
||||||
print(f"Score: {metrics['score']:.3f}")
|
print(f"Score: {metrics['score']:.3f}")
|
||||||
|
|
||||||
|
if getattr(args, "return_latency", False):
|
||||||
|
return metrics, latency
|
||||||
return metrics
|
return metrics
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
441
python/sglang/test/simple_eval_mmmu_vlm.py
Normal file
441
python/sglang/test/simple_eval_mmmu_vlm.py
Normal file
@@ -0,0 +1,441 @@
|
|||||||
|
"""
|
||||||
|
MMMU evaluation for VLMs using the run_eval simple-evals interface.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import io
|
||||||
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
|
from datasets import concatenate_datasets, load_dataset
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from sglang.test import simple_eval_common as common
|
||||||
|
from sglang.test.simple_eval_common import (
|
||||||
|
HTML_JINJA,
|
||||||
|
Eval,
|
||||||
|
EvalResult,
|
||||||
|
SamplerBase,
|
||||||
|
SingleEvalResult,
|
||||||
|
map_with_progress,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class MMMUVLMEval(Eval):
    """MMMU evaluation for VLMs using the run_eval simple-evals interface.

    Draws a deterministic sample (sorted by example id) from the validation
    split of every MMMU subject and scores multiple-choice answers by letter
    match and open-ended answers with the MMMU reference parsing rules.
    """

    # Domain -> subject mapping copied from the MMMU data utils; used to
    # aggregate per-domain accuracy in the final metrics table.
    DOMAIN_CAT2SUB_CAT = {
        "Art and Design": ["Art", "Art_Theory", "Design", "Music"],
        "Business": ["Accounting", "Economics", "Finance", "Manage", "Marketing"],
        "Science": ["Biology", "Chemistry", "Geography", "Math", "Physics"],
        "Health and Medicine": [
            "Basic_Medical_Science",
            "Clinical_Medicine",
            "Diagnostics_and_Laboratory_Medicine",
            "Pharmacy",
            "Public_Health",
        ],
        "Humanities and Social Science": [
            "History",
            "Literature",
            "Sociology",
            "Psychology",
        ],
        "Tech and Engineering": [
            "Agriculture",
            "Architecture_and_Engineering",
            "Computer_Science",
            "Electronics",
            "Energy_and_Power",
            "Materials",
            "Mechanical_Engineering",
        ],
    }

    def __init__(
        self, num_examples: Optional[int] = 100, num_threads: int = 32, seed: int = 42
    ):
        """Create MMMU VLM eval (100 fixed samples by default).

        Bug fix (docs): the previous docstring claimed a "Math subset", but
        sampling actually spans all MMMU subjects (see _prepare_mmmu_samples).
        """
        self.num_examples = num_examples
        self.num_threads = num_threads
        # NOTE(review): `seed` is stored but never used — sample selection is
        # deterministic via sorting by example id, not via this seed.
        self.seed = seed
        # Prepare samples deterministically across all MMMU subjects (validation split)
        self.samples = self._prepare_mmmu_samples(self.num_examples)

    @staticmethod
    def _to_data_uri(image: Image.Image) -> str:
        """Encode a PIL image as a base64 PNG data URI."""
        if image.mode == "RGBA":
            image = image.convert("RGB")
        buf = io.BytesIO()
        image.save(buf, format="PNG")
        b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
        return f"data:image/png;base64,{b64}"

    @staticmethod
    def _build_mc_mapping(options: List[str]) -> Tuple[dict, List[str]]:
        """Label options with letters: returns ({"A": opt0, ...}, ["A", "B", ...])."""
        index2ans = {}
        all_choices = []
        ch = ord("A")
        for opt in options:
            letter = chr(ch)
            index2ans[letter] = opt
            all_choices.append(letter)
            ch += 1
        return index2ans, all_choices

    def _prepare_mmmu_samples(self, k: int) -> List[dict]:
        """Load MMMU validation data and pick the first *k* samples by sorted id."""
        # Subjects and domains copied from MMMU data_utils to categorize results
        subjects: List[str] = []
        for subs in self.DOMAIN_CAT2SUB_CAT.values():
            subjects.extend(subs)

        # Load validation split of each subject; subjects that fail to load
        # are skipped (best-effort).
        datasets = []
        for subj in subjects:
            try:
                d = load_dataset("MMMU/MMMU", subj, split="validation")
                # attach subject info via an extra column
                d = d.add_column("__subject__", [subj] * len(d))
                datasets.append(d)
            except Exception:
                continue
        if not datasets:
            raise RuntimeError("Failed to load MMMU datasets")

        merged = concatenate_datasets(datasets)

        # Deterministic selection: sort by id (fallback to subject+index)
        def _key(idx):
            ex = merged[idx]
            return str(ex.get("id", f"{ex['__subject__']}:{idx}"))

        order = sorted(range(len(merged)), key=_key)
        picked_indices = order[:k]

        samples: List[dict] = []
        for idx in picked_indices:
            ex = merged[idx]
            subject = ex["__subject__"]
            image = ex.get("image_1")
            # Skip examples without a usable PIL image.
            if image is None or not hasattr(image, "convert"):
                continue
            data_uri = self._to_data_uri(image)
            question = ex.get("question", "")
            answer = ex.get("answer")
            raw_options = ex.get("options")
            question_type = "open"
            index2ans = None
            all_choices = None
            options = None
            if raw_options:
                try:
                    # SECURITY NOTE: `eval` is used here on dataset-provided
                    # option strings (MMMU stores options as a stringified
                    # Python list). Consider ast.literal_eval for safety.
                    options = (
                        raw_options
                        if isinstance(raw_options, list)
                        else list(eval(raw_options))
                    )
                    if isinstance(options, list) and len(options) > 0:
                        index2ans, all_choices = self._build_mc_mapping(options)
                        question_type = "multiple-choice"
                except Exception:
                    options = None

            # Build final textual prompt; include choices if MC
            prompt_text = f"Question: {question}\n\n"
            if options:
                letters = [chr(ord("A") + i) for i in range(len(options))]
                for letter, opt in zip(letters, options):
                    prompt_text += f"{letter}) {opt}\n"
            prompt_text += "\nAnswer: "

            samples.append(
                {
                    "id": ex.get("id", f"{subject}:{idx}"),
                    "final_input_prompt": prompt_text,
                    "image_data": data_uri,
                    "answer": answer,
                    "question_type": question_type,
                    "index2ans": index2ans,
                    "all_choices": all_choices,
                    "category": subject,
                }
            )

        return samples

    @staticmethod
    def _split_prompt_for_image(prompt: str) -> tuple[str, str]:
        """Split a prompt containing an inline image tag into prefix and suffix.

        If no tag is present, treat the whole prompt as prefix and empty suffix.
        """
        if "<" in prompt and ">" in prompt:
            prefix = prompt.split("<")[0]
            suffix = prompt.split(">", 1)[1]
            return prefix, suffix
        return prompt, ""

    @staticmethod
    def build_chat_messages_from_prompt(prompt: str, image_data) -> List:
        """Build a single-user vision+text chat message list for an
        OpenAI-compatible API from *prompt* and an image data URI.

        Bug fix (docs): the previous docstring was a copy-paste of
        _split_prompt_for_image's and did not describe this method.
        """
        prefix, suffix = MMMUVLMEval._split_prompt_for_image(prompt)

        content: List[dict] = []
        if prefix:
            content.append({"type": "text", "text": prefix})
        content.append({"type": "image_url", "image_url": {"url": image_data}})
        if suffix:
            content.append({"type": "text", "text": suffix})
        prompt_messages = [{"role": "user", "content": content}]

        return prompt_messages

    def __call__(self, sampler: SamplerBase) -> EvalResult:
        """Run the eval with *sampler* and return aggregated EvalResult."""

        def fn(sample: dict):
            prompt = sample["final_input_prompt"]
            image_data = sample["image_data"]
            prompt_messages = MMMUVLMEval.build_chat_messages_from_prompt(
                prompt, image_data
            )

            # Sample
            response_text = sampler(prompt_messages)

            # Parse and score
            gold = sample["answer"]
            if (
                sample["question_type"] == "multiple-choice"
                and sample["all_choices"]
                and sample["index2ans"]
            ):
                pred = _parse_multi_choice_response(
                    response_text, sample["all_choices"], sample["index2ans"]
                )
                score = 1.0 if (gold is not None and pred == gold) else 0.0
                extracted_answer = pred
            else:
                parsed_list = _parse_open_response(response_text)
                score = (
                    1.0 if (gold is not None and _eval_open(gold, parsed_list)) else 0.0
                )
                extracted_answer = ", ".join(map(str, parsed_list))

            html_rendered = common.jinja_env.from_string(HTML_JINJA).render(
                prompt_messages=prompt_messages,
                next_message=dict(content=response_text, role="assistant"),
                score=score,
                correct_answer=gold,
                extracted_answer=extracted_answer,
            )

            convo = prompt_messages + [dict(content=response_text, role="assistant")]
            return SingleEvalResult(
                html=html_rendered,
                score=score,
                metrics={"__category__": sample["category"]},
                convo=convo,
            )

        results = map_with_progress(fn, self.samples, self.num_threads)

        # Build category table and overall accuracy.
        # Gather per-sample correctness and category.
        per_cat_total: dict[str, int] = {}
        per_cat_correct: dict[str, int] = {}
        htmls = []
        convos = []
        scores: List[float] = []
        for r in results:
            # __category__ stored under metrics
            cat = r.metrics.get("__category__") if r.metrics else None
            if cat is None:
                cat = "Unknown"
            per_cat_total[cat] = per_cat_total.get(cat, 0) + 1
            if r.score:
                per_cat_correct[cat] = per_cat_correct.get(cat, 0) + 1
            htmls.append(r.html)
            convos.append(r.convo)
            if r.score is not None:
                scores.append(r.score)

        evaluation_result = {}
        for cat, tot in per_cat_total.items():
            corr = per_cat_correct.get(cat, 0)
            acc = (corr / tot) if tot > 0 else 0.0
            evaluation_result[cat] = {"acc": round(acc, 3), "num_example": tot}

        printable_results = {}
        # Domains first
        for domain, cats in self.DOMAIN_CAT2SUB_CAT.items():
            acc_sum = 0.0
            num_sum = 0
            for cat in cats:
                if cat in evaluation_result:
                    acc_sum += (
                        evaluation_result[cat]["acc"]
                        * evaluation_result[cat]["num_example"]
                    )
                    num_sum += evaluation_result[cat]["num_example"]
            if num_sum > 0:
                printable_results[f"Overall-{domain}"] = {
                    "num": num_sum,
                    "acc": round(acc_sum / num_sum, 3),
                }
            # add each sub-category row if present
            for cat in cats:
                if cat in evaluation_result:
                    printable_results[cat] = {
                        "num": evaluation_result[cat]["num_example"],
                        "acc": evaluation_result[cat]["acc"],
                    }

        # Overall accuracy is the example-weighted mean of category accuracies.
        total_num = sum(v["num_example"] for v in evaluation_result.values())
        overall_acc = (
            sum(v["acc"] * v["num_example"] for v in evaluation_result.values())
            / total_num
            if total_num > 0
            else 0.0
        )
        printable_results["Overall"] = {"num": total_num, "acc": round(overall_acc, 3)}

        # Build EvalResult
        return EvalResult(
            score=overall_acc, metrics=printable_results, htmls=htmls, convos=convos
        )
|
|
||||||
|
def _parse_multi_choice_response(
|
||||||
|
response: str, all_choices: List[str], index2ans: dict
|
||||||
|
) -> str:
|
||||||
|
# loosely adapted from benchmark mmmu eval
|
||||||
|
for char in [",", ".", "!", "?", ";", ":", "'"]:
|
||||||
|
response = response.strip(char)
|
||||||
|
response = " " + response + " "
|
||||||
|
|
||||||
|
# Prefer explicit letter with bracket e.g. (A)
|
||||||
|
candidates: List[str] = []
|
||||||
|
for choice in all_choices:
|
||||||
|
if f"({choice})" in response:
|
||||||
|
candidates.append(choice)
|
||||||
|
if not candidates:
|
||||||
|
for choice in all_choices:
|
||||||
|
if f" {choice} " in response:
|
||||||
|
candidates.append(choice)
|
||||||
|
if not candidates and len(response.split()) > 5:
|
||||||
|
# try match by option text
|
||||||
|
for idx, ans in index2ans.items():
|
||||||
|
if ans and ans.lower() in response.lower():
|
||||||
|
candidates.append(idx)
|
||||||
|
if not candidates:
|
||||||
|
# fallback to first choice
|
||||||
|
return all_choices[0]
|
||||||
|
if len(candidates) == 1:
|
||||||
|
return candidates[0]
|
||||||
|
# choose the last occurrence
|
||||||
|
starts = []
|
||||||
|
for can in candidates:
|
||||||
|
pos = response.rfind(f"({can})")
|
||||||
|
if pos == -1:
|
||||||
|
pos = response.rfind(f" {can} ")
|
||||||
|
if pos == -1 and index2ans.get(can):
|
||||||
|
pos = response.lower().rfind(index2ans[can].lower())
|
||||||
|
starts.append(pos)
|
||||||
|
return candidates[int(max(range(len(starts)), key=lambda i: starts[i]))]
|
||||||
|
|
||||||
|
|
||||||
|
def _check_is_number(s: str) -> bool:
|
||||||
|
try:
|
||||||
|
float(s.replace(",", ""))
|
||||||
|
return True
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_str(s: str):
|
||||||
|
s = s.strip()
|
||||||
|
if _check_is_number(s):
|
||||||
|
s = s.replace(",", "")
|
||||||
|
try:
|
||||||
|
v = round(float(s), 2)
|
||||||
|
return [v]
|
||||||
|
except Exception:
|
||||||
|
return [s.lower()]
|
||||||
|
return [s.lower()] if len(s) > 1 else [" " + s, s + " "]
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_numbers(s: str) -> List[str]:
|
||||||
|
import re as _re
|
||||||
|
|
||||||
|
pattern_commas = r"-?\b\d{1,3}(?:,\d{3})+\b"
|
||||||
|
pattern_scientific = r"-?\d+(?:\.\d+)?[eE][+-]?\d+"
|
||||||
|
pattern_simple = r"-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d])"
|
||||||
|
return (
|
||||||
|
_re.findall(pattern_commas, s)
|
||||||
|
+ _re.findall(pattern_scientific, s)
|
||||||
|
+ _re.findall(pattern_simple, s)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_open_response(response: str) -> List[str]:
    """Extract candidate answers from a free-form model response.

    Returns the order-preserving deduplicated list of normalized key
    sub-responses plus any numbers found inside them (MMMU-style parsing).
    """
    import re as _re

    def _key_subresponses(text: str) -> List[str]:
        text = text.strip().strip(".").lower()
        # NOTE(review): the `(?=[A-Z])` lookahead is dead after lower() —
        # splitting effectively happens on newlines only. Kept as-is.
        pieces = _re.split(r"\.\s(?=[A-Z])|\n", text)
        markers = [
            "could be ",
            "so ",
            "is ",
            "thus ",
            "therefore ",
            "final ",
            "answer ",
            "result ",
        ]
        picked = []
        for pos, piece in enumerate(pieces):
            probes = [*markers]
            # Only the last segment may carry a trailing "= value".
            if pos == len(pieces) - 1:
                probes.append("=")
            best = None
            for marker in probes:
                if marker in piece:
                    tail = piece.split(marker)[-1].strip()
                    # Keep the shortest tail — closest to a bare answer.
                    if not best or len(tail) < len(best):
                        best = tail
            if best and best not in [":", ",", ".", "!", "?", ";", ":", "'"]:
                picked.append(best)
        return picked or [text]

    key_resps = _key_subresponses(response)
    pred_list = key_resps.copy()
    for resp in key_resps:
        pred_list.extend(_extract_numbers(resp))

    normalized = []
    for candidate in pred_list:
        normalized.extend(_normalize_str(candidate))
    # Deduplicate while preserving first-seen order.
    return list(dict.fromkeys(normalized))
|
||||||
|
|
||||||
|
def _eval_open(gold, preds: List[str]) -> bool:
    """Return True if any prediction matches any normalized gold answer.

    String predictions match by substring containment of a normalized gold
    form; non-string (numeric) predictions match by equality.
    """
    if isinstance(gold, list):
        norm_answers = []
        for answer in gold:
            norm_answers.extend(_normalize_str(answer))
    else:
        norm_answers = _normalize_str(gold)

    for pred in preds:
        if isinstance(pred, str):
            if any(isinstance(na, str) and na in pred for na in norm_answers):
                return True
        elif pred in norm_answers:
            return True
    return False
@@ -14,10 +14,12 @@ import time
|
|||||||
import unittest
|
import unittest
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from types import SimpleNamespace
|
from types import SimpleNamespace
|
||||||
from typing import Any, Awaitable, Callable, List, Optional, Tuple
|
from typing import Any, Awaitable, Callable, List, Optional, Tuple
|
||||||
|
from urllib.parse import quote
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -1467,3 +1469,137 @@ def dump_bench_raw_result(
|
|||||||
def _ensure_remove_suffix(text: str, suffix: str):
|
def _ensure_remove_suffix(text: str, suffix: str):
|
||||||
assert text.endswith(suffix)
|
assert text.endswith(suffix)
|
||||||
return text.removesuffix(suffix)
|
return text.removesuffix(suffix)
|
||||||
|
|
||||||
|
|
||||||
|
class ModelDeploySetup:
    """Launch configuration for one model: its path plus server CLI args.

    Ensures `--enable-multimodal` and `--trust-remote-code` are always present.

    Bug fix: the previous version used a mutable default (`extra_args=[]`)
    and appended to it, so flags accumulated in a list shared across all
    instances (and leaked into caller-provided lists). The args are now
    copied per instance.
    """

    def __init__(self, model_path: str, extra_args: Optional[List[str]] = None):
        self.model_path = model_path
        # Copy so neither the default nor the caller's list is mutated.
        args = list(extra_args) if extra_args is not None else []
        if "--enable-multimodal" not in args:
            args.append("--enable-multimodal")
        if "--trust-remote-code" not in args:
            args.append("--trust-remote-code")
        self.extra_args = args
|
class ModelEvalMetrics:
    """Container pairing a model's eval accuracy with how long the eval took."""

    def __init__(self, accuracy: float, eval_time: float):
        # Fraction of correct answers for the eval run.
        self.accuracy = accuracy
        # Wall-clock duration of the evaluation, in seconds.
        self.eval_time = eval_time
|
def extract_trace_link_from_bench_one_batch_server_output(output: str) -> str:
    """Return the URL inside a markdown "[Profile](<url>)" link, or None if absent."""
    match = re.search(r"\[Profile\]\((.*?)\)", output)
    return match.group(1) if match else None
||||||
|
def parse_models(model_string: str):
    """Split a comma-separated model list into names, dropping empty entries."""
    names = []
    for raw in model_string.split(","):
        name = raw.strip()
        if name:
            names.append(name)
    return names
||||||
|
def check_evaluation_test_results(
    results,
    test_name,
    model_accuracy_thresholds,
    model_latency_thresholds=None,
    model_count=None,
):
    """Validate per-model eval results against thresholds and print a summary.

    Args:
        results: list of (model_path, accuracy, latency) tuples.
        test_name: heading used for the CI step summary.
        model_accuracy_thresholds: model -> minimum acceptable accuracy.
        model_latency_thresholds: optional model -> maximum acceptable latency.
        model_count: expected number of results; defaults to the number of
            accuracy thresholds.

    Raises:
        AssertionError: if any model fails its threshold(s) or some model
            produced no result at all.
    """
    failed_models = []

    if model_latency_thresholds is not None:
        summary = " | model | status | score | score_threshold | latency | latency_threshold | \n"
        summary += "| ----- | ------ | ----- | --------------- | ------- | ----------------- | \n"
    else:
        summary = " | model | status | score | score_threshold | \n"
        summary += "| ----- | ------ | ----- | --------------- | \n"

    for model, accuracy, latency in results:
        accuracy_threshold = model_accuracy_thresholds.get(model)
        if accuracy_threshold is None:
            print(f"Warning: No threshold defined for model {model}")
            continue

        # Bug fix: the old code used .get(model, None), so a model missing
        # from the latency dict crashed on `latency <= None`. Default to an
        # effectively-infinite bound instead.
        latency_threshold = (
            model_latency_thresholds.get(model, 1e9)
            if model_latency_thresholds
            else 1e9
        )

        is_success = accuracy >= accuracy_threshold and latency <= latency_threshold
        status_emoji = "✅" if is_success else "❌"

        if not is_success:
            # Bug fix: the old message always blamed the score even when only
            # the latency bound was violated; report the actual reason(s).
            reasons = []
            if accuracy < accuracy_threshold:
                reasons.append(
                    f"score ({accuracy:.4f}) is below threshold ({accuracy_threshold:.4f})"
                )
            if latency > latency_threshold:
                reasons.append(
                    f"latency ({latency:.4f}) exceeds threshold ({latency_threshold:.4f})"
                )
            failed_models.append(
                f"\nCheck Failed: {model}\n" f"Model {model} " + "; ".join(reasons)
            )

        if model_latency_thresholds is not None:
            line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold} | {latency} | {latency_threshold}\n"
        else:
            line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold}\n"

        summary += line

    print(summary)

    if is_in_ci():
        write_github_step_summary(f"## {test_name}\n{summary}")

    # A result can be missing when a model failed to launch at all.
    some_model_failed_to_get_result = len(results) != (
        model_count or len(model_accuracy_thresholds)
    )
    if some_model_failed_to_get_result:
        print("Some model has failed to launch and be evaluated")

    if failed_models or some_model_failed_to_get_result:
        raise AssertionError("\n".join(failed_models))
||||||
|
# Bench knobs for bench_one_batch_server (override by env)
|
||||||
|
def _parse_int_list_env(name: str, default_val: str):
|
||||||
|
val = os.environ.get(name, default_val)
|
||||||
|
return [int(x) for x in val.split(",") if x]
|
||||||
|
|
||||||
|
|
||||||
|
# Return filenames
def find_traces_under_path(path: str) -> List[str]:
    """Recursively collect the basenames of '*.trace.json.gz' files under *path*.

    Note: only basenames are returned, not paths relative to *path*.
    """
    found: List[str] = []
    for _root, _dirs, filenames in os.walk(path):
        for filename in filenames:
            if filename.endswith(".trace.json.gz"):
                found.append(filename)
    return found
|
def write_results_to_json(model, metrics, mode="a"):
    """Record one model's eval metrics in ./results.json.

    Args:
        model: model identifier.
        metrics: dict with at least a "score" key; an optional "latency"
            key is copied to the top level of the record.
        mode: "a" appends to an existing results.json; anything else
            starts a fresh file.
    """
    result = {
        "timestamp": datetime.now().isoformat(),
        "model": model,
        "metrics": metrics,
        "score": metrics["score"],
    }

    # Bug fix: a stray trailing comma previously stored a 1-tuple
    # `(latency,)` instead of the latency value itself.
    if "latency" in metrics:
        result["latency"] = metrics.get("latency")

    existing_results = []
    if mode == "a" and os.path.exists("results.json"):
        try:
            with open("results.json", "r") as f:
                existing_results = json.load(f)
        except json.JSONDecodeError:
            # Corrupt file: start over rather than crash.
            existing_results = []

    if isinstance(existing_results, list):
        existing_results.append(result)
    else:
        existing_results = [result]

    with open("results.json", "w") as f:
        json.dump(existing_results, f, indent=2)
|
|||||||
263
scripts/ci/publish_traces.py
Normal file
263
scripts/ci/publish_traces.py
Normal file
@@ -0,0 +1,263 @@
|
|||||||
|
"""
|
||||||
|
Publish performance traces to GitHub repository
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from urllib.request import Request, urlopen
|
||||||
|
|
||||||
|
|
||||||
|
def make_github_request(url, token, method="GET", data=None):
    """Make an authenticated request to the GitHub API.

    Args:
        url: full API URL.
        token: bearer token for the Authorization header.
        method: HTTP method.
        data: optional JSON-serializable request body.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        Re-raises any request failure after printing diagnostics.
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {token}",
        # "User-Agent": "sglang-ci",
        "X-GitHub-Api-Version": "2022-11-28",
    }

    if data:
        headers["Content-Type"] = "application/json"
        data = json.dumps(data).encode("utf-8")

    req = Request(url, data=data, headers=headers, method=method)

    try:
        with urlopen(req) as response:
            return response.read().decode("utf-8")
    except Exception as e:
        print(f"GitHub API request failed: {e}")
        # HTTPError exposes the error body via .read(); dump it best-effort.
        if hasattr(e, "read"):
            try:
                error_body = e.read().decode("utf-8")
                print(f"Error response body: {error_body}")
            # Bug fix: was a bare `except:`, which also swallows
            # KeyboardInterrupt/SystemExit; narrow to Exception.
            except Exception:
                pass
        raise
|
def verify_token_permissions(repo_owner, repo_name, token):
    """Check that *token* can read the target repository and its contents.

    Returns True when both probes succeed, False otherwise.
    """
    print("Verifying token permissions...")

    repo_api = f"https://api.github.com/repos/{repo_owner}/{repo_name}"

    # Probe 1: basic repository metadata access.
    try:
        repo_data = json.loads(make_github_request(repo_api, token))
        print(f"Repository access verified: {repo_data['full_name']}")
    except Exception as e:
        print(f"Failed to access repository: {e}")
        return False

    # Probe 2: listing repository contents.
    try:
        make_github_request(f"{repo_api}/contents", token)
        print("Repository contents access verified")
    except Exception as e:
        print(f"Failed to access repository contents: {e}")
        return False

    return True
|
def get_branch_sha(repo_owner, repo_name, branch, token):
    """Return the commit SHA that *branch* currently points at."""
    ref_url = (
        f"https://api.github.com/repos/{repo_owner}/{repo_name}"
        f"/git/refs/heads/{branch}"
    )
    payload = json.loads(make_github_request(ref_url, token))
    return payload["object"]["sha"]
|
def get_tree_sha(repo_owner, repo_name, commit_sha, token):
    """Return the tree SHA referenced by the commit *commit_sha*."""
    commit_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/git/commits/{commit_sha}"
    commit_obj = json.loads(make_github_request(commit_url, token))
    return commit_obj["tree"]["sha"]
|
def create_blob(repo_owner, repo_name, content, token):
    """Upload *content* (bytes) as a git blob and return the new blob's SHA."""
    blob_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/git/blobs"

    # The GitHub API requires binary payloads to be base64-encoded.
    payload = {
        "content": base64.b64encode(content).decode("utf-8"),
        "encoding": "base64",
    }

    created = make_github_request(blob_url, token, method="POST", data=payload)
    return json.loads(created)["sha"]
|
def create_tree(repo_owner, repo_name, base_tree_sha, files, token):
    """Create a git tree on top of *base_tree_sha* containing *files*.

    Args:
        files: iterable of (path, content_bytes) pairs.

    Returns:
        SHA of the newly created tree.
    """
    tree_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/git/trees"

    entries = []
    for path, blob_bytes in files:
        # Each file must first be uploaded as a blob to obtain its SHA.
        sha = create_blob(repo_owner, repo_name, blob_bytes, token)
        entries.append(
            {
                "path": path,
                "mode": "100644",
                "type": "blob",
                "sha": sha,
            }
        )

    body = {"base_tree": base_tree_sha, "tree": entries}
    created = make_github_request(tree_url, token, method="POST", data=body)
    return json.loads(created)["sha"]
|
def create_commit(repo_owner, repo_name, tree_sha, parent_sha, message, token):
    """Create a commit pointing at *tree_sha* with a single parent; return its SHA."""
    commit_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/git/commits"
    body = {"tree": tree_sha, "parents": [parent_sha], "message": message}
    created = make_github_request(commit_url, token, method="POST", data=body)
    return json.loads(created)["sha"]
|
def update_branch_ref(repo_owner, repo_name, branch, commit_sha, token):
    """Move the branch ref to point at *commit_sha*."""
    ref_url = (
        f"https://api.github.com/repos/{repo_owner}/{repo_name}"
        f"/git/refs/heads/{branch}"
    )
    make_github_request(ref_url, token, method="PATCH", data={"sha": commit_sha})
|
def copy_trace_files(source_dir, target_base_path, is_vlm=False):
    """Gather gzipped trace files under *source_dir* for upload.

    Returns a list of (target_path, content_bytes) pairs where target_path
    is *target_base_path* plus the file's path relative to *source_dir*.
    NOTE(review): *is_vlm* is currently unused here — the caller selects
    the source directory instead; confirm before removing.
    """
    collected = []

    if not os.path.exists(source_dir):
        print(f"Warning: Traces directory {source_dir} does not exist")
        return collected

    # Recursively pick up every .json.gz trace file.
    for dirpath, _subdirs, filenames in os.walk(source_dir):
        for filename in filenames:
            if not filename.endswith(".json.gz"):
                continue
            full_path = os.path.join(dirpath, filename)
            relative = os.path.relpath(full_path, source_dir)
            with open(full_path, "rb") as fh:
                collected.append((f"{target_base_path}/{relative}", fh.read()))

    return collected
|
def publish_traces(traces_dir, run_id, run_number, is_vlm=False):
    """Publish traces to GitHub repository in a single commit.

    Collects *.json.gz traces under *traces_dir* and commits them to the
    sglang-bot/sglang-ci-data repo under traces/<run_id>/ on main.
    """
    # A token with write access to the data repo is mandatory.
    token = os.getenv("GITHUB_TOKEN")
    if not token:
        print("Error: GITHUB_TOKEN environment variable not set")
        sys.exit(1)

    # Repository configuration
    repo_owner = "sglang-bot"
    repo_name = "sglang-ci-data"
    branch = "main"
    target_base_path = f"traces/{run_id}"

    files_to_upload = copy_trace_files(traces_dir, target_base_path, is_vlm)
    if not files_to_upload:
        print("No trace files found to upload")
        return

    print(f"Found {len(files_to_upload)} files to upload")

    # Fail fast if the token cannot even read the repository.
    if not verify_token_permissions(repo_owner, repo_name, token):
        print(
            "Token permission verification failed. Please check the token permissions."
        )
        sys.exit(1)

    try:
        # head commit -> its tree -> new tree -> new commit -> move the ref.
        branch_sha = get_branch_sha(repo_owner, repo_name, branch, token)
        print(f"Current branch head: {branch_sha}")

        tree_sha = get_tree_sha(repo_owner, repo_name, branch_sha, token)
        print(f"Current tree SHA: {tree_sha}")

        new_tree_sha = create_tree(
            repo_owner, repo_name, tree_sha, files_to_upload, token
        )
        print(f"Created new tree: {new_tree_sha}")

        commit_message = f"Nightly traces for run {run_id} at {run_number} ({len(files_to_upload)} files)"
        commit_sha = create_commit(
            repo_owner, repo_name, new_tree_sha, branch_sha, commit_message, token
        )
        print(f"Created commit: {commit_sha}")

        update_branch_ref(repo_owner, repo_name, branch, commit_sha, token)
        print("Updated branch reference")

        print("Successfully published all traces in a single commit")
    except Exception as e:
        print(f"Failed to publish traces: {e}")
        raise
|
def main():
    """CLI entry point: publish text-model (or, with --vlm, VLM) traces."""
    parser = argparse.ArgumentParser(
        description="Publish performance traces to GitHub repository"
    )
    parser.add_argument("--vlm", action="store_true", help="Process VLM model traces")
    args = parser.parse_args()

    # Get environment variables (with local-testing fallbacks).
    run_id = os.getenv("GITHUB_RUN_ID", "test")
    run_number = os.getenv("GITHUB_RUN_NUMBER", "12345")

    # NOTE(review): because both getenv calls above supply defaults, this
    # guard can never trigger; kept as-is to preserve existing behavior.
    if not run_id or not run_number:
        print(
            "Error: GITHUB_RUN_ID and GITHUB_RUN_NUMBER environment variables must be set"
        )
        sys.exit(1)

    # Determine traces directory
    if args.vlm:
        traces_dir = "performance_profiles_vlms"
        print("Processing VLM model traces")
    else:
        traces_dir = "performance_profiles_text_models"
        print("Processing text model traces")

    # Publish traces
    publish_traces(traces_dir, run_id, run_number, args.vlm)
||||||
@@ -165,9 +165,6 @@ suites = {
|
|||||||
"per-commit-8-gpu-h20": [
|
"per-commit-8-gpu-h20": [
|
||||||
TestFile("quant/test_w4a8_deepseek_v3.py", 371),
|
TestFile("quant/test_w4a8_deepseek_v3.py", 371),
|
||||||
],
|
],
|
||||||
"nightly": [
|
|
||||||
TestFile("test_nightly_gsm8k_eval.py"),
|
|
||||||
],
|
|
||||||
"vllm_dependency_test": [
|
"vllm_dependency_test": [
|
||||||
TestFile("quant/test_awq.py", 163),
|
TestFile("quant/test_awq.py", 163),
|
||||||
TestFile("test_bnb.py", 5),
|
TestFile("test_bnb.py", 5),
|
||||||
|
|||||||
@@ -15,8 +15,10 @@ from sglang.test.test_utils import (
|
|||||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
DEFAULT_URL_FOR_TEST,
|
DEFAULT_URL_FOR_TEST,
|
||||||
is_in_ci,
|
is_in_ci,
|
||||||
|
parse_models,
|
||||||
popen_launch_server,
|
popen_launch_server,
|
||||||
write_github_step_summary,
|
write_github_step_summary,
|
||||||
|
write_results_to_json,
|
||||||
)
|
)
|
||||||
|
|
||||||
MODEL_SCORE_THRESHOLDS = {
|
MODEL_SCORE_THRESHOLDS = {
|
||||||
@@ -73,10 +75,6 @@ TRITON_MOE_MODELS = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def parse_models(model_string):
|
|
||||||
return [model.strip() for model in model_string.split(",") if model.strip()]
|
|
||||||
|
|
||||||
|
|
||||||
def popen_launch_server_wrapper(base_url, model, is_tp2):
|
def popen_launch_server_wrapper(base_url, model, is_tp2):
|
||||||
other_args = ["--log-level-http", "warning", "--trust-remote-code"]
|
other_args = ["--log-level-http", "warning", "--trust-remote-code"]
|
||||||
if is_tp2:
|
if is_tp2:
|
||||||
@@ -91,31 +89,6 @@ def popen_launch_server_wrapper(base_url, model, is_tp2):
|
|||||||
return process
|
return process
|
||||||
|
|
||||||
|
|
||||||
def write_results_to_json(model, metrics, mode="a"):
|
|
||||||
result = {
|
|
||||||
"timestamp": datetime.now().isoformat(),
|
|
||||||
"model": model,
|
|
||||||
"metrics": metrics,
|
|
||||||
"score": metrics["score"],
|
|
||||||
}
|
|
||||||
|
|
||||||
existing_results = []
|
|
||||||
if mode == "a" and os.path.exists("results.json"):
|
|
||||||
try:
|
|
||||||
with open("results.json", "r") as f:
|
|
||||||
existing_results = json.load(f)
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
existing_results = []
|
|
||||||
|
|
||||||
if isinstance(existing_results, list):
|
|
||||||
existing_results.append(result)
|
|
||||||
else:
|
|
||||||
existing_results = [result]
|
|
||||||
|
|
||||||
with open("results.json", "w") as f:
|
|
||||||
json.dump(existing_results, f, indent=2)
|
|
||||||
|
|
||||||
|
|
||||||
def check_model_scores(results):
|
def check_model_scores(results):
|
||||||
failed_models = []
|
failed_models = []
|
||||||
summary = " | model | score | threshold |\n"
|
summary = " | model | score | threshold |\n"
|
||||||
|
|||||||
@@ -1,8 +1,6 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
|
||||||
import unittest
|
import unittest
|
||||||
import warnings
|
import warnings
|
||||||
from datetime import datetime
|
|
||||||
from types import SimpleNamespace
|
from types import SimpleNamespace
|
||||||
|
|
||||||
from sglang.srt.utils import kill_process_tree
|
from sglang.srt.utils import kill_process_tree
|
||||||
@@ -14,9 +12,10 @@ from sglang.test.test_utils import (
|
|||||||
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
|
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
|
||||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
DEFAULT_URL_FOR_TEST,
|
DEFAULT_URL_FOR_TEST,
|
||||||
is_in_ci,
|
check_evaluation_test_results,
|
||||||
|
parse_models,
|
||||||
popen_launch_server,
|
popen_launch_server,
|
||||||
write_github_step_summary,
|
write_results_to_json,
|
||||||
)
|
)
|
||||||
|
|
||||||
MODEL_SCORE_THRESHOLDS = {
|
MODEL_SCORE_THRESHOLDS = {
|
||||||
@@ -25,11 +24,11 @@ MODEL_SCORE_THRESHOLDS = {
|
|||||||
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85,
|
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85,
|
||||||
"google/gemma-2-27b-it": 0.91,
|
"google/gemma-2-27b-it": 0.91,
|
||||||
"meta-llama/Llama-3.1-70B-Instruct": 0.95,
|
"meta-llama/Llama-3.1-70B-Instruct": 0.95,
|
||||||
"mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64,
|
"mistralai/Mixtral-8x7B-Instruct-v0.1": 0.62,
|
||||||
"Qwen/Qwen2-57B-A14B-Instruct": 0.86,
|
"Qwen/Qwen2-57B-A14B-Instruct": 0.86,
|
||||||
"neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83,
|
"neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83,
|
||||||
"neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
|
"neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
|
||||||
"neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.84,
|
"neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.835,
|
||||||
"zai-org/GLM-4.5-Air-FP8": 0.75,
|
"zai-org/GLM-4.5-Air-FP8": 0.75,
|
||||||
# The threshold of neuralmagic/gemma-2-2b-it-FP8 should be 0.6, but this model has some accuracy regression.
|
# The threshold of neuralmagic/gemma-2-2b-it-FP8 should be 0.6, but this model has some accuracy regression.
|
||||||
# The fix is tracked at https://github.com/sgl-project/sglang/issues/4324, we set it to 0.50, for now, to make CI green.
|
# The fix is tracked at https://github.com/sgl-project/sglang/issues/4324, we set it to 0.50, for now, to make CI green.
|
||||||
@@ -41,78 +40,6 @@ MODEL_SCORE_THRESHOLDS = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def parse_models(model_string):
|
|
||||||
return [model.strip() for model in model_string.split(",") if model.strip()]
|
|
||||||
|
|
||||||
|
|
||||||
def popen_launch_server_wrapper(base_url, model, is_tp2):
|
|
||||||
other_args = ["--log-level-http", "warning", "--trust-remote-code"]
|
|
||||||
if is_tp2:
|
|
||||||
other_args.extend(["--tp", "2"])
|
|
||||||
|
|
||||||
process = popen_launch_server(
|
|
||||||
model,
|
|
||||||
base_url,
|
|
||||||
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
|
||||||
other_args=other_args,
|
|
||||||
)
|
|
||||||
return process
|
|
||||||
|
|
||||||
|
|
||||||
def write_results_to_json(model, metrics, mode="a"):
|
|
||||||
result = {
|
|
||||||
"timestamp": datetime.now().isoformat(),
|
|
||||||
"model": model,
|
|
||||||
"metrics": metrics,
|
|
||||||
"score": metrics["score"],
|
|
||||||
}
|
|
||||||
|
|
||||||
existing_results = []
|
|
||||||
if mode == "a" and os.path.exists("results.json"):
|
|
||||||
try:
|
|
||||||
with open("results.json", "r") as f:
|
|
||||||
existing_results = json.load(f)
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
existing_results = []
|
|
||||||
|
|
||||||
if isinstance(existing_results, list):
|
|
||||||
existing_results.append(result)
|
|
||||||
else:
|
|
||||||
existing_results = [result]
|
|
||||||
|
|
||||||
with open("results.json", "w") as f:
|
|
||||||
json.dump(existing_results, f, indent=2)
|
|
||||||
|
|
||||||
|
|
||||||
def check_model_scores(results):
|
|
||||||
failed_models = []
|
|
||||||
summary = " | model | score | threshold |\n"
|
|
||||||
summary += "| ----- | ----- | --------- |\n"
|
|
||||||
|
|
||||||
for model, score in results:
|
|
||||||
threshold = MODEL_SCORE_THRESHOLDS.get(model)
|
|
||||||
if threshold is None:
|
|
||||||
print(f"Warning: No threshold defined for model {model}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
if score < threshold:
|
|
||||||
failed_models.append(
|
|
||||||
f"\nScore Check Failed: {model}\n"
|
|
||||||
f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})"
|
|
||||||
)
|
|
||||||
|
|
||||||
line = f"| {model} | {score} | {threshold} |\n"
|
|
||||||
summary += line
|
|
||||||
|
|
||||||
print(summary)
|
|
||||||
|
|
||||||
if is_in_ci():
|
|
||||||
write_github_step_summary(f"### TestNightlyGsm8KEval\n{summary}")
|
|
||||||
|
|
||||||
if failed_models:
|
|
||||||
raise AssertionError("\n".join(failed_models))
|
|
||||||
|
|
||||||
|
|
||||||
# Do not use `CustomTestCase` since `test_mgsm_en_all_models` does not want retry
|
# Do not use `CustomTestCase` since `test_mgsm_en_all_models` does not want retry
|
||||||
class TestNightlyGsm8KEval(unittest.TestCase):
|
class TestNightlyGsm8KEval(unittest.TestCase):
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -131,11 +58,17 @@ class TestNightlyGsm8KEval(unittest.TestCase):
|
|||||||
)
|
)
|
||||||
is_first = True
|
is_first = True
|
||||||
all_results = []
|
all_results = []
|
||||||
|
model_count = 0
|
||||||
for model_group, is_fp8, is_tp2 in self.model_groups:
|
for model_group, is_fp8, is_tp2 in self.model_groups:
|
||||||
for model in model_group:
|
for model in model_group:
|
||||||
|
model_count += 1
|
||||||
with self.subTest(model=model):
|
with self.subTest(model=model):
|
||||||
process = popen_launch_server_wrapper(self.base_url, model, is_tp2)
|
process = popen_launch_server(
|
||||||
|
model=model,
|
||||||
|
base_url=self.base_url,
|
||||||
|
other_args=["--tp", "2"] if is_tp2 else [],
|
||||||
|
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
|
)
|
||||||
|
|
||||||
args = SimpleNamespace(
|
args = SimpleNamespace(
|
||||||
base_url=self.base_url,
|
base_url=self.base_url,
|
||||||
@@ -153,7 +86,8 @@ class TestNightlyGsm8KEval(unittest.TestCase):
|
|||||||
write_results_to_json(model, metrics, "w" if is_first else "a")
|
write_results_to_json(model, metrics, "w" if is_first else "a")
|
||||||
is_first = False
|
is_first = False
|
||||||
|
|
||||||
all_results.append((model, metrics["score"]))
|
# 0.0 for empty latency
|
||||||
|
all_results.append((model, metrics["score"], 0.0))
|
||||||
kill_process_tree(process.pid)
|
kill_process_tree(process.pid)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -164,7 +98,12 @@ class TestNightlyGsm8KEval(unittest.TestCase):
|
|||||||
print(f"Error reading results.json: {e}")
|
print(f"Error reading results.json: {e}")
|
||||||
|
|
||||||
# Check all scores after collecting all results
|
# Check all scores after collecting all results
|
||||||
check_model_scores(all_results)
|
check_evaluation_test_results(
|
||||||
|
all_results,
|
||||||
|
self.__class__.__name__,
|
||||||
|
model_accuracy_thresholds=MODEL_SCORE_THRESHOLDS,
|
||||||
|
model_count=model_count,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
135
test/srt/test_nightly_text_models_perf.py
Normal file
135
test/srt/test_nightly_text_models_perf.py
Normal file
@@ -0,0 +1,135 @@
|
|||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from sglang.bench_one_batch_server import BenchmarkResult
|
||||||
|
from sglang.srt.utils import kill_process_tree
|
||||||
|
from sglang.test.test_utils import (
|
||||||
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
|
DEFAULT_URL_FOR_TEST,
|
||||||
|
_parse_int_list_env,
|
||||||
|
is_in_ci,
|
||||||
|
parse_models,
|
||||||
|
popen_launch_server,
|
||||||
|
write_github_step_summary,
|
||||||
|
)
|
||||||
|
|
||||||
|
PROFILE_DIR = "performance_profiles_text_models"
|
||||||
|
|
||||||
|
|
||||||
|
class TestNightlyTextModelsPerformance(unittest.TestCase):
|
||||||
|
@classmethod
|
||||||
|
def setUpClass(cls):
|
||||||
|
cls.model_groups = [
|
||||||
|
(parse_models("meta-llama/Llama-3.1-8B-Instruct"), False, False),
|
||||||
|
(parse_models("Qwen/Qwen2-57B-A14B-Instruct"), False, True),
|
||||||
|
# (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
|
||||||
|
# (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
|
||||||
|
# (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
|
||||||
|
# (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
|
||||||
|
]
|
||||||
|
cls.base_url = DEFAULT_URL_FOR_TEST
|
||||||
|
cls.batch_sizes = [1, 1, 8, 16, 64]
|
||||||
|
cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096"))
|
||||||
|
cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512"))
|
||||||
|
os.makedirs(PROFILE_DIR, exist_ok=True)
|
||||||
|
cls.full_report = f"## {cls.__name__}\n" + BenchmarkResult.help_str()
|
||||||
|
|
||||||
|
def test_bench_one_batch(self):
|
||||||
|
all_benchmark_results = []
|
||||||
|
|
||||||
|
for model_group, is_fp8, is_tp2 in self.model_groups:
|
||||||
|
for model in model_group:
|
||||||
|
benchmark_results = []
|
||||||
|
with self.subTest(model=model):
|
||||||
|
process = popen_launch_server(
|
||||||
|
model=model,
|
||||||
|
base_url=self.base_url,
|
||||||
|
other_args=["--tp", "2"] if is_tp2 else [],
|
||||||
|
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
|
||||||
|
profile_filename = (
|
||||||
|
f"{model.replace('/', '_')}_{int(time.time())}"
|
||||||
|
)
|
||||||
|
profile_path_prefix = os.path.join(
|
||||||
|
PROFILE_DIR, profile_filename
|
||||||
|
)
|
||||||
|
json_output_file = (
|
||||||
|
f"results_{model.replace('/', '_')}_{int(time.time())}.json"
|
||||||
|
)
|
||||||
|
|
||||||
|
command = [
|
||||||
|
"python3",
|
||||||
|
"-m",
|
||||||
|
"sglang.bench_one_batch_server",
|
||||||
|
"--model",
|
||||||
|
model,
|
||||||
|
"--base-url",
|
||||||
|
self.base_url,
|
||||||
|
"--batch-size",
|
||||||
|
*[str(x) for x in self.batch_sizes],
|
||||||
|
"--input-len",
|
||||||
|
*[str(x) for x in self.input_lens],
|
||||||
|
"--output-len",
|
||||||
|
*[str(x) for x in self.output_lens],
|
||||||
|
"--show-report",
|
||||||
|
"--profile",
|
||||||
|
"--profile-by-stage",
|
||||||
|
"--profile-filename-prefix",
|
||||||
|
profile_path_prefix,
|
||||||
|
f"--output-path={json_output_file}",
|
||||||
|
"--no-append-to-github-summary",
|
||||||
|
]
|
||||||
|
|
||||||
|
print(f"Running command: {' '.join(command)}")
|
||||||
|
result = subprocess.run(command, capture_output=True, text=True)
|
||||||
|
|
||||||
|
if result.returncode != 0:
|
||||||
|
print(
|
||||||
|
f"Error running benchmark for {model} with batch size:"
|
||||||
|
)
|
||||||
|
print(result.stderr)
|
||||||
|
# Continue to next batch size even if one fails
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Load and deserialize JSON results
|
||||||
|
if os.path.exists(json_output_file):
|
||||||
|
import json
|
||||||
|
|
||||||
|
with open(json_output_file, "r") as f:
|
||||||
|
json_data = json.load(f)
|
||||||
|
|
||||||
|
# Convert JSON data to BenchmarkResult objects
|
||||||
|
for data in json_data:
|
||||||
|
benchmark_result = BenchmarkResult(**data)
|
||||||
|
all_benchmark_results.append(benchmark_result)
|
||||||
|
benchmark_results.append(benchmark_result)
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"Loaded {len(benchmark_results)} benchmark results from {json_output_file}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Clean up JSON file
|
||||||
|
os.remove(json_output_file)
|
||||||
|
else:
|
||||||
|
print(
|
||||||
|
f"Warning: JSON output file {json_output_file} not found"
|
||||||
|
)
|
||||||
|
|
||||||
|
finally:
|
||||||
|
kill_process_tree(process.pid)
|
||||||
|
|
||||||
|
report_part = BenchmarkResult.generate_markdown_report(
|
||||||
|
PROFILE_DIR, benchmark_results
|
||||||
|
)
|
||||||
|
self.full_report += report_part + "\n"
|
||||||
|
|
||||||
|
if is_in_ci():
|
||||||
|
write_github_step_summary(self.full_report)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
117
test/srt/test_nightly_vlms_mmmu_eval.py
Normal file
117
test/srt/test_nightly_vlms_mmmu_eval.py
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
import json
|
||||||
|
import unittest
|
||||||
|
import warnings
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
from sglang.srt.utils import kill_process_tree
|
||||||
|
from sglang.test.run_eval import run_eval
|
||||||
|
from sglang.test.test_utils import (
|
||||||
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
|
DEFAULT_URL_FOR_TEST,
|
||||||
|
ModelDeploySetup,
|
||||||
|
ModelEvalMetrics,
|
||||||
|
check_evaluation_test_results,
|
||||||
|
popen_launch_server,
|
||||||
|
write_results_to_json,
|
||||||
|
)
|
||||||
|
|
||||||
|
MODEL_THRESHOLDS = {
|
||||||
|
# Conservative thresholds on 100 MMMU samples, especially for latency thresholds
|
||||||
|
ModelDeploySetup("deepseek-ai/deepseek-vl2-small"): ModelEvalMetrics(0.330, 56.1),
|
||||||
|
ModelDeploySetup("deepseek-ai/Janus-Pro-7B"): ModelEvalMetrics(0.285, 39.9),
|
||||||
|
ModelDeploySetup("Efficient-Large-Model/NVILA-Lite-2B-hf-0626"): ModelEvalMetrics(
|
||||||
|
0.305, 23.8
|
||||||
|
),
|
||||||
|
ModelDeploySetup("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
|
||||||
|
ModelDeploySetup("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
|
||||||
|
ModelDeploySetup("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 14.5),
|
||||||
|
ModelDeploySetup("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(0.330, 22.3),
|
||||||
|
ModelDeploySetup("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
|
||||||
|
ModelDeploySetup("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
|
||||||
|
ModelDeploySetup("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 14.0),
|
||||||
|
ModelDeploySetup("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3),
|
||||||
|
ModelDeploySetup("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
|
||||||
|
ModelDeploySetup("unsloth/Mistral-Small-3.1-24B-Instruct-2503"): ModelEvalMetrics(
|
||||||
|
0.310, 16.7
|
||||||
|
),
|
||||||
|
ModelDeploySetup("XiaomiMiMo/MiMo-VL-7B-RL"): ModelEvalMetrics(0.28, 32.0),
|
||||||
|
ModelDeploySetup("zai-org/GLM-4.1V-9B-Thinking"): ModelEvalMetrics(0.280, 30.4),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class TestNightlyVLMMmmuEval(unittest.TestCase):
|
||||||
|
@classmethod
|
||||||
|
def setUpClass(cls):
|
||||||
|
cls.models = list(MODEL_THRESHOLDS.keys())
|
||||||
|
cls.base_url = DEFAULT_URL_FOR_TEST
|
||||||
|
|
||||||
|
def test_mmmu_vlm_models(self):
|
||||||
|
warnings.filterwarnings(
|
||||||
|
"ignore", category=ResourceWarning, message="unclosed.*socket"
|
||||||
|
)
|
||||||
|
is_first = True
|
||||||
|
all_results = []
|
||||||
|
|
||||||
|
for model in self.models:
|
||||||
|
model_path = model.model_path
|
||||||
|
with self.subTest(model=model_path):
|
||||||
|
process = popen_launch_server(
|
||||||
|
model=model_path,
|
||||||
|
base_url=self.base_url,
|
||||||
|
other_args=model.extra_args,
|
||||||
|
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
args = SimpleNamespace(
|
||||||
|
base_url=self.base_url,
|
||||||
|
model=model_path,
|
||||||
|
eval_name="mmmu",
|
||||||
|
num_examples=100,
|
||||||
|
num_threads=64,
|
||||||
|
max_tokens=30,
|
||||||
|
)
|
||||||
|
|
||||||
|
args.return_latency = True
|
||||||
|
|
||||||
|
metrics, latency = run_eval(args)
|
||||||
|
|
||||||
|
metrics["score"] = round(metrics["score"], 4)
|
||||||
|
metrics["latency"] = round(latency, 4)
|
||||||
|
print(
|
||||||
|
f"{'=' * 42}\n{model_path} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
write_results_to_json(model_path, metrics, "w" if is_first else "a")
|
||||||
|
is_first = False
|
||||||
|
|
||||||
|
all_results.append(
|
||||||
|
(model_path, metrics["score"], metrics["latency"])
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
kill_process_tree(process.pid)
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open("results.json", "r") as f:
|
||||||
|
print("\nFinal Results from results.json:")
|
||||||
|
print(json.dumps(json.load(f), indent=2))
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error reading results: {e}")
|
||||||
|
|
||||||
|
model_accuracy_thresholds = {
|
||||||
|
model.model_path: threshold.accuracy
|
||||||
|
for model, threshold in MODEL_THRESHOLDS.items()
|
||||||
|
}
|
||||||
|
model_latency_thresholds = {
|
||||||
|
model.model_path: threshold.eval_time
|
||||||
|
for model, threshold in MODEL_THRESHOLDS.items()
|
||||||
|
}
|
||||||
|
check_evaluation_test_results(
|
||||||
|
all_results,
|
||||||
|
self.__class__.__name__,
|
||||||
|
model_accuracy_thresholds=model_accuracy_thresholds,
|
||||||
|
model_latency_thresholds=model_latency_thresholds,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
135
test/srt/test_nightly_vlms_perf.py
Normal file
135
test/srt/test_nightly_vlms_perf.py
Normal file
@@ -0,0 +1,135 @@
|
|||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import unittest
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
from sglang.bench_one_batch_server import BenchmarkResult
|
||||||
|
from sglang.srt.utils import kill_process_tree
|
||||||
|
from sglang.test.test_utils import (
|
||||||
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
|
DEFAULT_URL_FOR_TEST,
|
||||||
|
_parse_int_list_env,
|
||||||
|
is_in_ci,
|
||||||
|
parse_models,
|
||||||
|
popen_launch_server,
|
||||||
|
write_github_step_summary,
|
||||||
|
)
|
||||||
|
|
||||||
|
PROFILE_DIR = "performance_profiles_vlms"
|
||||||
|
|
||||||
|
MODEL_DEFAULTS = [
|
||||||
|
# Keep conservative defaults. Can be overridden by env NIGHTLY_VLM_MODELS
|
||||||
|
"Qwen/Qwen2.5-VL-7B-Instruct",
|
||||||
|
"google/gemma-3-27b-it",
|
||||||
|
# "OpenGVLab/InternVL2_5-2B",
|
||||||
|
# buggy in official transformers impl
|
||||||
|
# "openbmb/MiniCPM-V-2_6",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class TestNightlyVLMModelsPerformance(unittest.TestCase):
|
||||||
|
@classmethod
|
||||||
|
def setUpClass(cls):
|
||||||
|
warnings.filterwarnings(
|
||||||
|
"ignore", category=ResourceWarning, message="unclosed.*socket"
|
||||||
|
)
|
||||||
|
cls.models = parse_models(
|
||||||
|
os.environ.get("NIGHTLY_VLM_MODELS", ",".join(MODEL_DEFAULTS))
|
||||||
|
)
|
||||||
|
cls.base_url = DEFAULT_URL_FOR_TEST
|
||||||
|
|
||||||
|
cls.batch_sizes = _parse_int_list_env("NIGHTLY_VLM_BATCH_SIZES", "1,1,2,8,16")
|
||||||
|
cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_VLM_INPUT_LENS", "4096"))
|
||||||
|
cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_VLM_OUTPUT_LENS", "512"))
|
||||||
|
cls.full_report = f"## {cls.__name__}\n" + BenchmarkResult.help_str()
|
||||||
|
|
||||||
|
def test_bench_one_batch(self):
|
||||||
|
all_benchmark_results = []
|
||||||
|
|
||||||
|
for model in self.models:
|
||||||
|
benchmark_results = []
|
||||||
|
with self.subTest(model=model):
|
||||||
|
process = popen_launch_server(
|
||||||
|
model=model,
|
||||||
|
base_url=self.base_url,
|
||||||
|
other_args=["--mem-fraction-static=0.7"],
|
||||||
|
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
# Run bench_one_batch_server against the launched server
|
||||||
|
profile_filename = f"{model.replace('/', '_')}"
|
||||||
|
# path for this run
|
||||||
|
profile_path_prefix = os.path.join(PROFILE_DIR, profile_filename)
|
||||||
|
|
||||||
|
# JSON output file for this model
|
||||||
|
json_output_file = f"results_{model.replace('/', '_')}.json"
|
||||||
|
|
||||||
|
command = [
|
||||||
|
"python3",
|
||||||
|
"-m",
|
||||||
|
"sglang.bench_one_batch_server",
|
||||||
|
f"--model={model}",
|
||||||
|
"--base-url",
|
||||||
|
self.base_url,
|
||||||
|
"--batch-size",
|
||||||
|
*[str(x) for x in self.batch_sizes],
|
||||||
|
"--input-len",
|
||||||
|
*[str(x) for x in self.input_lens],
|
||||||
|
"--output-len",
|
||||||
|
*[str(x) for x in self.output_lens],
|
||||||
|
"--trust-remote-code",
|
||||||
|
"--dataset-name=mmmu",
|
||||||
|
"--profile",
|
||||||
|
"--profile-by-stage",
|
||||||
|
f"--profile-filename-prefix={profile_path_prefix}",
|
||||||
|
"--show-report",
|
||||||
|
f"--output-path={json_output_file}",
|
||||||
|
"--no-append-to-github-summary",
|
||||||
|
]
|
||||||
|
|
||||||
|
print(f"Running command: {' '.join(command)}")
|
||||||
|
result = subprocess.run(command, capture_output=True, text=True)
|
||||||
|
|
||||||
|
if result.returncode != 0:
|
||||||
|
print(f"Error running benchmark for {model} with batch size:")
|
||||||
|
print(result.stderr)
|
||||||
|
# Continue to next batch size even if one fails
|
||||||
|
continue
|
||||||
|
|
||||||
|
print(f"Output for {model} with batch size:")
|
||||||
|
print(result.stdout)
|
||||||
|
|
||||||
|
# Load and deserialize JSON results
|
||||||
|
if os.path.exists(json_output_file):
|
||||||
|
import json
|
||||||
|
|
||||||
|
with open(json_output_file, "r") as f:
|
||||||
|
json_data = json.load(f)
|
||||||
|
|
||||||
|
# Convert JSON data to BenchmarkResult objects
|
||||||
|
for data in json_data:
|
||||||
|
benchmark_result = BenchmarkResult(**data)
|
||||||
|
all_benchmark_results.append(benchmark_result)
|
||||||
|
benchmark_results.append(benchmark_result)
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"Loaded {len(benchmark_results)} benchmark results from {json_output_file}"
|
||||||
|
)
|
||||||
|
|
||||||
|
else:
|
||||||
|
print(f"Warning: JSON output file {json_output_file} not found")
|
||||||
|
|
||||||
|
finally:
|
||||||
|
kill_process_tree(process.pid)
|
||||||
|
|
||||||
|
report_part = BenchmarkResult.generate_markdown_report(
|
||||||
|
PROFILE_DIR, benchmark_results
|
||||||
|
)
|
||||||
|
self.full_report += report_part + "\n"
|
||||||
|
|
||||||
|
if is_in_ci():
|
||||||
|
write_github_step_summary(self.full_report)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -14,6 +14,7 @@ from sglang.test.test_utils import (
|
|||||||
is_in_ci,
|
is_in_ci,
|
||||||
popen_launch_server,
|
popen_launch_server,
|
||||||
write_github_step_summary,
|
write_github_step_summary,
|
||||||
|
write_results_to_json,
|
||||||
)
|
)
|
||||||
|
|
||||||
MODEL_SCORE_THRESHOLDS = {
|
MODEL_SCORE_THRESHOLDS = {
|
||||||
@@ -52,31 +53,6 @@ def popen_launch_server_wrapper(base_url, model, is_fp8, is_tp2):
|
|||||||
return process
|
return process
|
||||||
|
|
||||||
|
|
||||||
def write_results_to_json(model, metrics, mode="a"):
|
|
||||||
result = {
|
|
||||||
"timestamp": datetime.now().isoformat(),
|
|
||||||
"model": model,
|
|
||||||
"metrics": metrics,
|
|
||||||
"score": metrics["score"],
|
|
||||||
}
|
|
||||||
|
|
||||||
existing_results = []
|
|
||||||
if mode == "a" and os.path.exists("results.json"):
|
|
||||||
try:
|
|
||||||
with open("results.json", "r") as f:
|
|
||||||
existing_results = json.load(f)
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
existing_results = []
|
|
||||||
|
|
||||||
if isinstance(existing_results, list):
|
|
||||||
existing_results.append(result)
|
|
||||||
else:
|
|
||||||
existing_results = [result]
|
|
||||||
|
|
||||||
with open("results.json", "w") as f:
|
|
||||||
json.dump(existing_results, f, indent=2)
|
|
||||||
|
|
||||||
|
|
||||||
def check_model_scores(results):
|
def check_model_scores(results):
|
||||||
failed_models = []
|
failed_models = []
|
||||||
summary = " | model | score | threshold |\n"
|
summary = " | model | score | threshold |\n"
|
||||||
|
|||||||
Reference in New Issue
Block a user