ci: unify the model launch method of nightly ci (#11230)

2025-10-08 09:13:14 +08:00
parent f3764c26a3
commit 64d1505c0a
5 changed files with 192 additions and 153 deletions
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -20,7 +20,6 @@ from functools import partial
 from pathlib import Path
 from types import SimpleNamespace
 from typing import Any, Awaitable, Callable, List, Optional, Tuple
 from urllib.parse import quote
 import aiohttp
 import numpy as np
@@ -1652,15 +1651,26 @@ def _ensure_remove_suffix(text: str, suffix: str):
    return text.removesuffix(suffix)
-class ModelDeploySetup:
+class ModelLaunchSettings:
-    def __init__(self, model_path: str, extra_args: List[str] = []):
+    def __init__(
        self,
        model_path: str,
        tp_size: int = 1,
        extra_args: Optional[List[str]] = None,
        env: Optional[dict] = None,
    ):
        self.model_path = model_path
-        if "--enable-multimodal" not in extra_args:
+        self.tp_size = tp_size
-            extra_args.append("--enable-multimodal")
+        self.extra_args = list(extra_args) if extra_args else []
-        if "--trust-remote-code" not in extra_args:
+        self.env = env
            extra_args.append("--trust-remote-code")
-        self.extra_args = extra_args
+        if self.tp_size > 1 and "--tp" not in self.extra_args:
            self.extra_args.extend(["--tp", str(self.tp_size)])
        fixed_args = ["--enable-multimodal", "--trust-remote-code"]
        for fixed_arg in fixed_args:
            if fixed_arg not in self.extra_args:
                self.extra_args.append(fixed_arg)
 class ModelEvalMetrics:
--- a/test/srt/test_nightly_text_models_gsm8k_eval.py
+++ b/test/srt/test_nightly_text_models_gsm8k_eval.py
@@ -12,6 +12,7 @@ from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    ModelLaunchSettings,
    check_evaluation_test_results,
    parse_models,
    popen_launch_server,
@@ -44,12 +45,19 @@ MODEL_SCORE_THRESHOLDS = {
 class TestNightlyGsm8KEval(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
-        cls.model_groups = [
+        cls.models = []
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
+        models_tp1 = parse_models(
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
+            DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
+        ) + parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1)
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
+        for model_path in models_tp1:
-        ]
+            cls.models.append(ModelLaunchSettings(model_path, tp_size=1))
        models_tp2 = parse_models(
            DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2
        ) + parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2)
        for model_path in models_tp2:
            cls.models.append(ModelLaunchSettings(model_path, tp_size=2))
        cls.base_url = DEFAULT_URL_FOR_TEST
    def test_mgsm_en_all_models(self):
@@ -58,26 +66,24 @@ class TestNightlyGsm8KEval(unittest.TestCase):
        )
        is_first = True
        all_results = []
-        model_count = 0
+        for model_setup in self.models:
-        for model_group, is_fp8, is_tp2 in self.model_groups:
+            with self.subTest(model=model_setup.model_path):
-            for model in model_group:
+                other_args = list(model_setup.extra_args)
                model_count += 1
                with self.subTest(model=model):
                    other_args = ["--tp", "2"] if is_tp2 else []
-                    if model == "meta-llama/Llama-3.1-70B-Instruct":
+                if model_setup.model_path == "meta-llama/Llama-3.1-70B-Instruct":
-                        other_args.extend(["--mem-fraction-static", "0.9"])
+                    other_args.extend(["--mem-fraction-static", "0.9"])
-                    process = popen_launch_server(
+                process = popen_launch_server(
-                        model=model,
+                    model=model_setup.model_path,
-                        other_args=other_args,
+                    other_args=other_args,
-                        base_url=self.base_url,
+                    base_url=self.base_url,
-                        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-                    )
+                )
                try:
                    args = SimpleNamespace(
                        base_url=self.base_url,
-                        model=model,
+                        model=model_setup.model_path,
                        eval_name="mgsm_en",
                        num_examples=None,
                        num_threads=1024,
@@ -85,14 +91,17 @@ class TestNightlyGsm8KEval(unittest.TestCase):
                    metrics = run_eval(args)
                    print(
-                        f"{'=' * 42}\n{model} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
+                        f"{'=' * 42}\n{model_setup.model_path} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
                    )
-                    write_results_to_json(model, metrics, "w" if is_first else "a")
+                    write_results_to_json(
                        model_setup.model_path, metrics, "w" if is_first else "a"
                    )
                    is_first = False
                    # 0.0 for empty latency
-                    all_results.append((model, metrics["score"], 0.0))
+                    all_results.append((model_setup.model_path, metrics["score"], 0.0))
                finally:
                    kill_process_tree(process.pid)
        try:
@@ -107,7 +116,7 @@ class TestNightlyGsm8KEval(unittest.TestCase):
            all_results,
            self.__class__.__name__,
            model_accuracy_thresholds=MODEL_SCORE_THRESHOLDS,
-            model_count=model_count,
+            model_count=len(self.models),
        )
--- a/test/srt/test_nightly_text_models_perf.py
+++ b/test/srt/test_nightly_text_models_perf.py
@@ -8,6 +8,7 @@ from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    ModelLaunchSettings,
    _parse_int_list_env,
    is_in_ci,
    parse_models,
@@ -21,14 +22,16 @@ PROFILE_DIR = "performance_profiles_text_models"
 class TestNightlyTextModelsPerformance(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
-        cls.model_groups = [
+        cls.models = []
-            (parse_models("meta-llama/Llama-3.1-8B-Instruct"), False, False),
+        # TODO: replace with DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 or other model lists
-            (parse_models("Qwen/Qwen2-57B-A14B-Instruct"), False, True),
+        for model_path in parse_models("meta-llama/Llama-3.1-8B-Instruct"):
-            # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
+            cls.models.append(ModelLaunchSettings(model_path, tp_size=1))
-            # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
+        for model_path in parse_models("Qwen/Qwen2-57B-A14B-Instruct"):
-            # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
+            cls.models.append(ModelLaunchSettings(model_path, tp_size=2))
-            # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
+        # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
-        ]
+        # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
        # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
        # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.batch_sizes = [1, 1, 8, 16, 64]
        cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096"))
@@ -39,93 +42,86 @@ class TestNightlyTextModelsPerformance(unittest.TestCase):
    def test_bench_one_batch(self):
        all_benchmark_results = []
-        for model_group, is_fp8, is_tp2 in self.model_groups:
+        for model_setup in self.models:
-            for model in model_group:
+            benchmark_results = []
-                benchmark_results = []
+            with self.subTest(model=model_setup.model_path):
-                with self.subTest(model=model):
+                process = popen_launch_server(
-                    process = popen_launch_server(
+                    model=model_setup.model_path,
-                        model=model,
+                    base_url=self.base_url,
-                        base_url=self.base_url,
+                    other_args=model_setup.extra_args,
-                        other_args=["--tp", "2"] if is_tp2 else [],
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-                        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                )
                try:
                    profile_filename = (
                        f"{model_setup.model_path.replace('/', '_')}_{int(time.time())}"
                    )
-                    try:
+                    profile_path_prefix = os.path.join(PROFILE_DIR, profile_filename)
                    json_output_file = f"results_{model_setup.model_path.replace('/', '_')}_{int(time.time())}.json"
-                        profile_filename = (
+                    command = [
-                            f"{model.replace('/', '_')}_{int(time.time())}"
+                        "python3",
                        "-m",
                        "sglang.bench_one_batch_server",
                        "--model",
                        model_setup.model_path,
                        "--base-url",
                        self.base_url,
                        "--batch-size",
                        *[str(x) for x in self.batch_sizes],
                        "--input-len",
                        *[str(x) for x in self.input_lens],
                        "--output-len",
                        *[str(x) for x in self.output_lens],
                        "--show-report",
                        "--profile",
                        "--profile-by-stage",
                        "--profile-filename-prefix",
                        profile_path_prefix,
                        f"--output-path={json_output_file}",
                        "--no-append-to-github-summary",
                    ]
                    print(f"Running command: {' '.join(command)}")
                    result = subprocess.run(command, capture_output=True, text=True)
                    if result.returncode != 0:
                        print(
                            f"Error running benchmark for {model_setup.model_path} with batch size:"
                        )
-                        profile_path_prefix = os.path.join(
+                        print(result.stderr)
-                            PROFILE_DIR, profile_filename
+                        # Continue to next batch size even if one fails
-                        )
+                        continue
-                        json_output_file = (
+
-                            f"results_{model.replace('/', '_')}_{int(time.time())}.json"
+                    # Load and deserialize JSON results
                    if os.path.exists(json_output_file):
                        import json
                        with open(json_output_file, "r") as f:
                            json_data = json.load(f)
                        # Convert JSON data to BenchmarkResult objects
                        for data in json_data:
                            benchmark_result = BenchmarkResult(**data)
                            all_benchmark_results.append(benchmark_result)
                            benchmark_results.append(benchmark_result)
                        print(
                            f"Loaded {len(benchmark_results)} benchmark results from {json_output_file}"
                        )
-                        command = [
+                        # Clean up JSON file
-                            "python3",
+                        os.remove(json_output_file)
-                            "-m",
+                    else:
-                            "sglang.bench_one_batch_server",
+                        print(f"Warning: JSON output file {json_output_file} not found")
                            "--model",
                            model,
                            "--base-url",
                            self.base_url,
                            "--batch-size",
                            *[str(x) for x in self.batch_sizes],
                            "--input-len",
                            *[str(x) for x in self.input_lens],
                            "--output-len",
                            *[str(x) for x in self.output_lens],
                            "--show-report",
                            "--profile",
                            "--profile-by-stage",
                            "--profile-filename-prefix",
                            profile_path_prefix,
                            f"--output-path={json_output_file}",
                            "--no-append-to-github-summary",
                        ]
-                        print(f"Running command: {' '.join(command)}")
+                finally:
-                        result = subprocess.run(command, capture_output=True, text=True)
+                    kill_process_tree(process.pid)
-                        if result.returncode != 0:
+                report_part = BenchmarkResult.generate_markdown_report(
-                            print(
+                    PROFILE_DIR, benchmark_results
-                                f"Error running benchmark for {model} with batch size:"
+                )
-                            )
+                self.full_report += report_part + "\n"
                            print(result.stderr)
                            # Continue to next batch size even if one fails
                            continue
                        # Load and deserialize JSON results
                        if os.path.exists(json_output_file):
                            import json
                            with open(json_output_file, "r") as f:
                                json_data = json.load(f)
                            # Convert JSON data to BenchmarkResult objects
                            for data in json_data:
                                benchmark_result = BenchmarkResult(**data)
                                all_benchmark_results.append(benchmark_result)
                                benchmark_results.append(benchmark_result)
                            print(
                                f"Loaded {len(benchmark_results)} benchmark results from {json_output_file}"
                            )
                            # Clean up JSON file
                            os.remove(json_output_file)
                        else:
                            print(
                                f"Warning: JSON output file {json_output_file} not found"
                            )
                    finally:
                        kill_process_tree(process.pid)
                    report_part = BenchmarkResult.generate_markdown_report(
                        PROFILE_DIR, benchmark_results
                    )
                    self.full_report += report_part + "\n"
        if is_in_ci():
            write_github_step_summary(self.full_report)
--- a/test/srt/test_nightly_vlms_mmmu_eval.py
+++ b/test/srt/test_nightly_vlms_mmmu_eval.py
@@ -1,6 +1,7 @@
 import json
 import unittest
 import warnings
 from functools import partial
 from types import SimpleNamespace
 from sglang.srt.utils import kill_process_tree
@@ -8,8 +9,8 @@ from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    ModelDeploySetup,
    ModelEvalMetrics,
    ModelLaunchSettings,
    check_evaluation_test_results,
    popen_launch_server,
    write_results_to_json,
@@ -17,25 +18,29 @@ from sglang.test.test_utils import (
 MODEL_THRESHOLDS = {
    # Conservative thresholds on 100 MMMU samples, especially for latency thresholds
-    ModelDeploySetup("deepseek-ai/deepseek-vl2-small"): ModelEvalMetrics(0.330, 56.1),
+    ModelLaunchSettings("deepseek-ai/deepseek-vl2-small"): ModelEvalMetrics(
-    ModelDeploySetup("deepseek-ai/Janus-Pro-7B"): ModelEvalMetrics(0.285, 39.9),
+        0.330, 56.1
    ModelDeploySetup("Efficient-Large-Model/NVILA-Lite-2B-hf-0626"): ModelEvalMetrics(
        0.305, 23.8
    ),
-    ModelDeploySetup("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
+    ModelLaunchSettings("deepseek-ai/Janus-Pro-7B"): ModelEvalMetrics(0.285, 40.3),
-    ModelDeploySetup("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
+    ModelLaunchSettings(
-    ModelDeploySetup("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
+        "Efficient-Large-Model/NVILA-Lite-2B-hf-0626"
-    ModelDeploySetup("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(0.330, 22.3),
+    ): ModelEvalMetrics(0.305, 23.8),
-    ModelDeploySetup("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
+    ModelLaunchSettings("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
-    ModelDeploySetup("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
+    ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
-    ModelDeploySetup("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 14.0),
+    ModelLaunchSettings("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
-    ModelDeploySetup("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3),
+    ModelLaunchSettings("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(
-    ModelDeploySetup("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
+        0.330, 22.3
    ModelDeploySetup("unsloth/Mistral-Small-3.1-24B-Instruct-2503"): ModelEvalMetrics(
        0.310, 16.7
    ),
-    ModelDeploySetup("XiaomiMiMo/MiMo-VL-7B-RL"): ModelEvalMetrics(0.28, 32.0),
+    ModelLaunchSettings("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
-    ModelDeploySetup("zai-org/GLM-4.1V-9B-Thinking"): ModelEvalMetrics(0.280, 30.4),
+    ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
    ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 14.0),
    ModelLaunchSettings("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3),
    ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
    ModelLaunchSettings(
        "unsloth/Mistral-Small-3.1-24B-Instruct-2503"
    ): ModelEvalMetrics(0.310, 16.7),
    ModelLaunchSettings("XiaomiMiMo/MiMo-VL-7B-RL"): ModelEvalMetrics(0.28, 32.0),
    ModelLaunchSettings("zai-org/GLM-4.1V-9B-Thinking"): ModelEvalMetrics(0.280, 30.4),
 }
--- a/test/srt/test_nightly_vlms_perf.py
+++ b/test/srt/test_nightly_vlms_perf.py
@@ -8,6 +8,7 @@ from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    ModelLaunchSettings,
    _parse_int_list_env,
    is_in_ci,
    parse_models,
@@ -19,8 +20,13 @@ PROFILE_DIR = "performance_profiles_vlms"
 MODEL_DEFAULTS = [
    # Keep conservative defaults. Can be overridden by env NIGHTLY_VLM_MODELS
-    "Qwen/Qwen2.5-VL-7B-Instruct",
+    ModelLaunchSettings(
-    "google/gemma-3-27b-it",
+        "Qwen/Qwen2.5-VL-7B-Instruct",
        extra_args=["--mem-fraction-static=0.7"],
    ),
    ModelLaunchSettings(
        "google/gemma-3-27b-it",
    ),
    # "OpenGVLab/InternVL2_5-2B",
    # buggy in official transformers impl
    # "openbmb/MiniCPM-V-2_6",
@@ -33,9 +39,18 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
        warnings.filterwarnings(
            "ignore", category=ResourceWarning, message="unclosed.*socket"
        )
-        cls.models = parse_models(
+
-            os.environ.get("NIGHTLY_VLM_MODELS", ",".join(MODEL_DEFAULTS))
+        nightly_vlm_models_str = os.environ.get("NIGHTLY_VLM_MODELS")
-        )
+        if nightly_vlm_models_str:
            cls.models = []
            model_paths = parse_models(nightly_vlm_models_str)
            for model_path in model_paths:
                cls.models.append(
                    ModelLaunchSettings(model_path, extra_args=VLM_EXTRA_ARGS)
                )
        else:
            cls.models = MODEL_DEFAULTS
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.batch_sizes = _parse_int_list_env("NIGHTLY_VLM_BATCH_SIZES", "1,1,2,8,16")
@@ -46,29 +61,31 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
    def test_bench_one_batch(self):
        all_benchmark_results = []
-        for model in self.models:
+        for model_setup in self.models:
            benchmark_results = []
-            with self.subTest(model=model):
+            with self.subTest(model=model_setup.model_path):
                process = popen_launch_server(
-                    model=model,
+                    model=model_setup.model_path,
                    base_url=self.base_url,
-                    other_args=["--mem-fraction-static=0.7"],
+                    other_args=model_setup.extra_args,
                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
                )
                try:
                    # Run bench_one_batch_server against the launched server
-                    profile_filename = f"{model.replace('/', '_')}"
+                    profile_filename = f"{model_setup.model_path.replace('/', '_')}"
                    # path for this run
                    profile_path_prefix = os.path.join(PROFILE_DIR, profile_filename)
                    # JSON output file for this model
-                    json_output_file = f"results_{model.replace('/', '_')}.json"
+                    json_output_file = (
                        f"results_{model_setup.model_path.replace('/', '_')}.json"
                    )
                    command = [
                        "python3",
                        "-m",
                        "sglang.bench_one_batch_server",
-                        f"--model={model}",
+                        f"--model={model_setup.model_path}",
                        "--base-url",
                        self.base_url,
                        "--batch-size",
@@ -91,12 +108,14 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
                    result = subprocess.run(command, capture_output=True, text=True)
                    if result.returncode != 0:
-                        print(f"Error running benchmark for {model} with batch size:")
+                        print(
                            f"Error running benchmark for {model_setup.model_path} with batch size:"
                        )
                        print(result.stderr)
                        # Continue to next batch size even if one fails
                        continue
-                    print(f"Output for {model} with batch size:")
+                    print(f"Output for {model_setup.model_path} with batch size:")
                    print(result.stdout)
                    # Load and deserialize JSON results