ci: refactor nightly test (#10495)
@@ -9,6 +9,7 @@ python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --
 
 python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
 python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --show-report --profile --profile-by-stage
+python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --output-path results.json --profile
 """
 
 import argparse
@@ -19,12 +20,17 @@ import multiprocessing
 import os
 import random
 import time
-from typing import List, Tuple
+from typing import List, Optional, Tuple
 
 import numpy as np
 import requests
+from pydantic import BaseModel
 
-from sglang.bench_serving import get_tokenizer, sample_random_requests
+from sglang.bench_serving import (
+    get_tokenizer,
+    sample_mmmu_requests,
+    sample_random_requests,
+)
 from sglang.profiler import run_profile
 from sglang.srt.entrypoints.http_server import launch_server
 from sglang.srt.server_args import ServerArgs
@@ -32,6 +38,109 @@ from sglang.srt.utils import is_blackwell, kill_process_tree
 from sglang.test.test_utils import is_in_ci, write_github_step_summary
 
 
+class ProfileLinks(BaseModel):
+    """Pydantic model for profile trace links."""
+
+    extend: Optional[str] = None
+    decode: Optional[str] = None
+
+
+class BenchmarkResult(BaseModel):
+    """Pydantic model for benchmark results table data, for a single isl and osl"""
+
+    model_path: str
+    run_name: str
+    batch_size: int
+    input_len: int
+    output_len: int
+    latency: float
+    ttft: float
+    input_throughput: float
+    output_throughput: float
+    overall_throughput: float
+    last_gen_throughput: float
+    acc_length: Optional[float] = None
+    profile_links: Optional[ProfileLinks] = None
+
+    @staticmethod
+    def help_str() -> str:
+        return f"""
+Note: To view the traces through perfetto-ui, please:
+1. use Google Chrome
+2. enable popup
+"""
+
+    def to_markdown_row(
+        self, trace_dir, base_url: str = "", relay_base: str = ""
+    ) -> str:
+        """Convert this benchmark result to a markdown table row."""
+        # Calculate costs (assuming H100 pricing for now)
+        hourly_cost_per_gpu = 2  # $2/hour for one H100
+        hourly_cost = hourly_cost_per_gpu * 1  # Assuming tp_size = 1 for simplicity
+        input_util = 0.7
+        accept_length = (
+            round(self.acc_length, 2) if self.acc_length is not None else "n/a"
+        )
+        itl = 1 / (self.output_throughput / self.batch_size) * 1000
+        input_cost = 1e6 / (self.input_throughput * input_util) / 3600 * hourly_cost
+        output_cost = 1e6 / self.output_throughput / 3600 * hourly_cost
+
+        def get_perfetto_relay_link_from_trace_file(trace_file: str):
+            import os
+            from urllib.parse import quote
+
+            rel_path = os.path.relpath(trace_file, trace_dir)
+            raw_file_link = f"{base_url}/{rel_path}"
+            relay_link = (
+                f"{relay_base}?src={quote(raw_file_link, safe='')}"
+                if relay_base and quote
+                else raw_file_link
+            )
+            return relay_link
+
+        # Handle profile links
+        profile_link = "NA | NA"
+        if self.profile_links:
+            if self.profile_links.extend or self.profile_links.decode:
+                # Create a combined link or use the first available one
+                trace_files = [self.profile_links.extend, self.profile_links.decode]
+                trace_files_relay_links = [
+                    f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
+                    for trace_file in trace_files
+                ]
+
+                profile_link = " | ".join(trace_files_relay_links)
+
+        # Build the row
+        return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"
+
+    @classmethod
+    def generate_markdown_report(
+        cls, trace_dir, results: List["BenchmarkResult"]
+    ) -> str:
+        """Generate a markdown report from a list of BenchmarkResult object from a single run."""
+        import os
+
+        summary = f"### {results[0].model_path}\n"
+
+        # summary += (
+        #     f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
+        # )
+        summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
+        summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
+
+        # all results should share the same isl & osl
+        for result in results:
+            base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
+            relay_base = os.getenv("PERFETTO_RELAY_URL", "").rstrip("/")
+            relay_base = "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html"
+            # base_url = "https://github.com/sgl-project/ci-data/traces"
+            summary += result.to_markdown_row(trace_dir, base_url, relay_base)
+
+        return summary
+
+
 @dataclasses.dataclass
 class BenchArgs:
     run_name: str = "default"
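
For orientation, a minimal usage sketch of the two models added above (not part of the commit; the import path and all numeric values are illustrative):

from sglang.bench_one_batch_server import BenchmarkResult  # assumed import path

row = BenchmarkResult(
    model_path="meta-llama/Meta-Llama-3.1-8B",
    run_name="nightly",
    batch_size=16,
    input_len=1024,
    output_len=8,
    latency=1.92,               # seconds
    ttft=0.31,                  # seconds
    input_throughput=8532.1,    # tok/s
    output_throughput=1377.4,   # tok/s
    overall_throughput=8601.0,
    last_gen_throughput=1402.8,
)
# acc_length and profile_links default to None, so the rendered row shows
# "n/a" for accept length and "NA | NA" for the profile column.
print(BenchmarkResult.generate_markdown_report("/tmp/traces", [row]))
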
@@ -50,8 +159,12 @@ class BenchArgs:
     profile: bool = False
     profile_steps: int = 3
     profile_by_stage: bool = False
+    profile_filename_prefix: str = None
+    append_to_github_summary: bool = True
     dataset_path: str = ""
     parallel_batch: bool = False
+    dataset_name: str = "random"
+    output_path: Optional[str] = None
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -67,6 +180,13 @@ class BenchArgs:
             "--output-len", type=int, nargs="+", default=BenchArgs.output_len
         )
         parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
+        parser.add_argument(
+            "--dataset-name",
+            type=str,
+            default=BenchArgs.dataset_name,
+            choices=["mmmu", "random"],
+            help="Name of the dataset to benchmark on.",
+        )
         parser.add_argument("--return-logprob", action="store_true")
         parser.add_argument(
             "--client-stream-interval",
@@ -96,14 +216,36 @@ class BenchArgs:
             help="Path to the dataset.",
         )
         parser.add_argument("--parallel-batch", action="store_true")
+        parser.add_argument(
+            "--profile-filename-prefix",
+            type=str,
+            default=BenchArgs.profile_filename_prefix,
+        )
+        parser.add_argument(
+            "--no-append-to-github-summary",
+            action="store_false",
+            dest="append_to_github_summary",
+            help="Disable appending the output of this run to github ci summary",
+        )
+        parser.add_argument(
+            "--output-path",
+            type=str,
+            default=BenchArgs.output_path,
+            help="Path to save benchmark results as JSON format. If not specified, results will only be saved to result-filename.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         # use the default value's type to cast the args into correct types.
         attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
-        return cls(
-            **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
-        )
+        kwargs = {}
+        for attr, attr_type in attrs:
+            val = getattr(args, attr)
+            if attr_type is type(None):
+                kwargs[attr] = val
+            else:
+                kwargs[attr] = attr_type(val)
+        return cls(**kwargs)
 
 
 def launch_server_internal(server_args):
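
The rewritten from_cli_args exists because the old one-liner cast every CLI value with the type of the field's default; for fields whose default is None (such as profile_filename_prefix and output_path above) that type is NoneType, and NoneType(value) raises a TypeError. A standalone sketch of the failure and the fix (not from the commit):

# Illustration of the NoneType-cast bug that the kwargs loop above avoids.
attrs = [("profile_steps", int), ("output_path", type(None))]
args = {"profile_steps": "3", "output_path": "results.json"}

# Old approach: blind cast. type(None)("results.json") raises
# "TypeError: NoneType takes no arguments".
try:
    {attr: attr_type(args[attr]) for attr, attr_type in attrs}
except TypeError as e:
    print("old approach fails:", e)

# New approach: skip the cast when the default's type is NoneType.
kwargs = {}
for attr, attr_type in attrs:
    val = args[attr]
    kwargs[attr] = val if attr_type is type(None) else attr_type(val)
print(kwargs)  # {'profile_steps': 3, 'output_path': 'results.json'}
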
@@ -148,23 +290,35 @@ def run_one_case(
     run_name: str,
     result_filename: str,
     tokenizer,
+    dataset_name="",
     profile: bool = False,
     profile_steps: int = 3,
     profile_by_stage: bool = False,
+    profile_filename_prefix: str = None,
     dataset_path: str = "",
     parallel_batch: bool = False,
 ):
     requests.post(url + "/flush_cache")
-    input_requests = sample_random_requests(
-        input_len=input_len,
-        output_len=output_len,
-        num_prompts=batch_size,
-        range_ratio=1.0,
-        tokenizer=tokenizer,
-        dataset_path=dataset_path,
-        random_sample=True,
-        return_text=False,
-    )
+    # TODO: reuse bench_serving.get_dataset ?
+    if dataset_name == "mmmu":
+        input_requests = sample_mmmu_requests(
+            num_requests=batch_size,
+            tokenizer=tokenizer,
+            fixed_output_len=output_len,
+            apply_chat_template=True,
+            random_sample=False,
+        )
+    elif dataset_name == "random":
+        input_requests = sample_random_requests(
+            input_len=input_len,
+            output_len=output_len,
+            num_prompts=batch_size,
+            range_ratio=1.0,
+            tokenizer=tokenizer,
+            dataset_path=dataset_path,
+            random_sample=True,
+            return_text=False,
+        )
 
     use_structured_outputs = False
     if use_structured_outputs:
@@ -181,26 +335,48 @@ def run_one_case(
     profile_link = None
     if profile:
+        output_dir, profile_name = None, None
+        if profile_filename_prefix:
+            output_dir = os.path.dirname(profile_filename_prefix)
+            profile_name = os.path.basename(profile_filename_prefix)
         profile_link: str = run_profile(
-            url, profile_steps, ["CPU", "GPU"], None, None, profile_by_stage
+            url,
+            profile_steps,
+            ["CPU", "GPU"],
+            output_dir,
+            profile_name,
+            profile_by_stage,
         )
 
     tic = time.perf_counter()
 
+    payload = {
+        "sampling_params": {
+            "temperature": temperature,
+            "max_new_tokens": output_len,
+            "ignore_eos": True,
+            "json_schema": json_schema,
+            "stream_interval": stream_interval,
+        },
+        "return_logprob": return_logprob,
+        "stream": True,
+        **({"parallel_batch": parallel_batch} if parallel_batch else {}),
+    }
+    if dataset_name == "mmmu":
+        # vlm
+        input_ids = []
+        for input_req in input_requests:
+            input_ids += [tokenizer.encode(input_req.prompt)]
+        payload["image_data"] = [req.image_data for req in input_requests]
+    else:
+        input_ids = [req.prompt for req in input_requests]
+
+    payload["input_ids"] = input_ids
+
     response = requests.post(
         url + "/generate",
-        json={
-            "input_ids": [req.prompt for req in input_requests],
-            "sampling_params": {
-                "temperature": temperature,
-                "max_new_tokens": output_len,
-                "ignore_eos": True,
-                "json_schema": json_schema,
-                "stream_interval": stream_interval,
-            },
-            "return_logprob": return_logprob,
-            "stream": True,
-            **({"parallel_batch": parallel_batch} if parallel_batch else {}),
-        },
+        json=payload,
         stream=True,
     )
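
With the request body now built up front, the same payload serves both datasets; only the prompt representation and the extra image field differ. A sketch of the two shapes (field names from the diff above, values illustrative):

# Illustrative /generate request bodies produced by run_one_case.
# random dataset: prompts are already token-id lists.
random_payload = {
    "input_ids": [[101, 2054, 2003, 1996, 3007]],  # one list per request
    "sampling_params": {
        "temperature": 0.0,
        "max_new_tokens": 8,
        "ignore_eos": True,
        "json_schema": None,
        "stream_interval": 1,
    },
    "return_logprob": False,
    "stream": True,
}

# mmmu dataset: prompts are chat-templated text re-encoded to token ids,
# and an image_data list (one entry per request) is attached.
mmmu_payload = dict(random_payload)
mmmu_payload["image_data"] = ["<base64-encoded image>"]  # placeholder value
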
@@ -264,10 +440,100 @@ def run_one_case(
         overall_throughput,
         last_gen_throughput,
         acc_length,
-        profile_link if profile else None,
+        profile_link,
     )
 
 
+def save_results_as_json(result: List[Tuple], bench_args: BenchArgs, model: str):
+    """Save benchmark results as JSON using Pydantic models."""
+    json_results = []
+
+    # Generate all parameter combinations to match with results
+    param_combinations = list(
+        itertools.product(
+            bench_args.batch_size, bench_args.input_len, bench_args.output_len
+        )
+    )
+
+    for i, (
+        batch_size,
+        latency,
+        ttft,
+        input_throughput,
+        output_throughput,
+        overall_throughput,
+        last_gen_throughput,
+        acc_length,
+        profile_link,
+    ) in enumerate(result):
+        # Get the corresponding parameters for this result
+        bs, input_len, output_len = param_combinations[i]
+
+        # Parse profile links if available
+        profile_links = None
+        if profile_link:
+            profile_links = parse_profile_links(
+                profile_link, batch_size, input_len, output_len
+            )
+
+        benchmark_result = BenchmarkResult(
+            model_path=model,
+            run_name=bench_args.run_name,
+            batch_size=batch_size,
+            input_len=input_len,
+            output_len=output_len,
+            latency=latency,
+            ttft=ttft,
+            input_throughput=input_throughput,
+            output_throughput=output_throughput,
+            overall_throughput=overall_throughput,
+            last_gen_throughput=last_gen_throughput,
+            acc_length=acc_length,
+            profile_links=profile_links,
+        )
+        json_results.append(benchmark_result.model_dump())
+
+    # Save to JSON file
+    with open(bench_args.output_path, "w", encoding="utf-8") as f:
+        json.dump(json_results, f, indent=2, ensure_ascii=False)
+
+    print(f"Results saved as JSON to {bench_args.output_path}")
+
+
+def parse_profile_links(
+    profile_dir: str, batch_size: int, input_len: int, output_len: int
+) -> Optional[ProfileLinks]:
+    """Parse profile directory to extract extend and decode trace file links."""
+    if not profile_dir or not os.path.exists(profile_dir):
+        return None
+
+    extend_link = None
+    decode_link = None
+
+    # Look for extend/prefill trace files
+    for file in os.listdir(profile_dir):
+        if file.endswith(".trace.json.gz") or file.endswith(".trace.json"):
+            if "extend" in file.lower() or "prefill" in file.lower():
+                extend_link = os.path.join(profile_dir, file)
+            elif "decode" in file.lower():
+                decode_link = os.path.join(profile_dir, file)
+
+    # If no specific extend/decode files found, try to find files with batch/input/output info
+    if not extend_link or not decode_link:
+        for file in os.listdir(profile_dir):
+            if file.endswith(".trace.json.gz") or file.endswith(".trace.json"):
+                if f"_batch{batch_size}_input{input_len}_output{output_len}_" in file:
+                    if "prefill" in file.lower() or "extend" in file.lower():
+                        extend_link = os.path.join(profile_dir, file)
+                    elif "decode" in file.lower():
+                        decode_link = os.path.join(profile_dir, file)
+
+    if extend_link or decode_link:
+        return ProfileLinks(extend=extend_link, decode=decode_link)
+
+    return None
+
+
 def get_report_summary(
     result: List[Tuple], server_args: ServerArgs, bench_args: BenchArgs
 ):
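
Since save_results_as_json writes model_dump() output for each BenchmarkResult, the file is a plain JSON list that can be validated back into the model. A round-trip sketch (assumes pydantic v2, which model_dump above implies, and the assumed import path of this module):

import json

from sglang.bench_one_batch_server import BenchmarkResult  # assumed import path

with open("results.json", encoding="utf-8") as f:
    rows = [BenchmarkResult.model_validate(obj) for obj in json.load(f)]

for row in rows:
    links = row.profile_links  # may be None when profiling was off
    print(row.batch_size, row.input_len, row.latency,
          links.extend if links else None)
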
@@ -358,6 +624,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
             return_logprob=bench_args.return_logprob,
             stream_interval=bench_args.client_stream_interval,
             input_len_step_percentage=bench_args.input_len_step_percentage,
+            dataset_name=bench_args.dataset_name,
             run_name="",
             result_filename="",
             tokenizer=tokenizer,
@@ -384,10 +651,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
                     stream_interval=bench_args.client_stream_interval,
                     input_len_step_percentage=bench_args.input_len_step_percentage,
                     run_name=bench_args.run_name,
+                    dataset_name=bench_args.dataset_name,
                    result_filename=bench_args.result_filename,
                     tokenizer=tokenizer,
                     dataset_path=bench_args.dataset_path,
                     parallel_batch=bench_args.parallel_batch,
+                    profile_filename_prefix=bench_args.profile_filename_prefix,
                 )
             )
@@ -410,11 +679,13 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
                         run_name=bench_args.run_name,
                         result_filename=bench_args.result_filename,
                         tokenizer=tokenizer,
+                        dataset_name=bench_args.dataset_name,
                         profile=bench_args.profile,
                         profile_steps=bench_args.profile_steps,
                         profile_by_stage=bench_args.profile_by_stage,
                         dataset_path=bench_args.dataset_path,
                         parallel_batch=bench_args.parallel_batch,
+                        profile_filename_prefix=bench_args.profile_filename_prefix,
                     )[-1],
                 )
             )
@@ -427,13 +698,16 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
 
     print(f"\nResults are saved to {bench_args.result_filename}")
 
+    # Save results as JSON if output_path is specified
+    if bench_args.output_path:
+        save_results_as_json(result, bench_args, model=server_args.model_path)
+
     if not bench_args.show_report:
         return
 
     summary = get_report_summary(result, server_args, bench_args)
     print(summary)
 
-    if is_in_ci():
+    if is_in_ci() and bench_args.append_to_github_summary:
         write_github_step_summary(summary)
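
Putting the new flags together, a nightly-style invocation might look like the following sketch (not from the commit; it assumes a server is already listening at --base-url, and the trace prefix path is illustrative). All flags shown appear in this diff:

import subprocess

subprocess.run(
    [
        "python3", "-m", "sglang.bench_one_batch_server",
        "--model", "None",
        "--base-url", "http://localhost:30000",
        "--batch-size", "16",
        "--input-len", "1024",
        "--output-len", "8",
        "--dataset-name", "random",
        "--output-path", "results.json",          # new: JSON dump of BenchmarkResult rows
        "--profile",
        "--profile-filename-prefix", "traces/run1",  # new: controls trace dir/name
        "--no-append-to-github-summary",             # new: skip the CI step summary
    ],
    check=True,
)
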