ci: improve nightly-ci (#11385)

2025-10-13 12:19:34 +08:00
parent a55cf5304a
commit 0c0779d667
6 changed files with 76 additions and 54 deletions
--- a/test/srt/test_nightly_text_models_perf.py
+++ b/test/srt/test_nightly_text_models_perf.py
@@ -3,7 +3,7 @@ import subprocess
 import time
 import unittest

-from sglang.bench_one_batch_server import BenchmarkResult
+from sglang.bench_one_batch_server import BenchmarkResult, generate_markdown_report
 from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -41,7 +41,7 @@ class TestNightlyTextModelsPerformance(unittest.TestCase):

    def test_bench_one_batch(self):
        all_benchmark_results = []
-
+        all_model_succeed = True
        for model_setup in self.models:
            benchmark_results = []
            with self.subTest(model=model_setup.model_path):
@@ -113,19 +113,21 @@ class TestNightlyTextModelsPerformance(unittest.TestCase):
                        # Clean up JSON file
                        os.remove(json_output_file)
                    else:
+                        all_model_succeed = False
                        print(f"Warning: JSON output file {json_output_file} not found")

                finally:
                    kill_process_tree(process.pid)

-                report_part = BenchmarkResult.generate_markdown_report(
-                    PROFILE_DIR, benchmark_results
-                )
+                report_part = generate_markdown_report(PROFILE_DIR, benchmark_results)
                self.full_report += report_part + "\n"

        if is_in_ci():
            write_github_step_summary(self.full_report)

+        if not all_model_succeed:
+            raise AssertionError("Some models failed the perf tests.")
+

 if __name__ == "__main__":
    unittest.main()
--- a/test/srt/test_nightly_vlms_mmmu_eval.py
+++ b/test/srt/test_nightly_vlms_mmmu_eval.py
@@ -1,7 +1,6 @@
 import json
 import unittest
 import warnings
-from functools import partial
 from types import SimpleNamespace

 from sglang.srt.utils import kill_process_tree
@@ -26,16 +25,19 @@ MODEL_THRESHOLDS = {
        "Efficient-Large-Model/NVILA-Lite-2B-hf-0626"
    ): ModelEvalMetrics(0.305, 23.8),
    ModelLaunchSettings("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
-    ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
+    ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 17.7),
    ModelLaunchSettings("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
    ModelLaunchSettings("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(
        0.330, 22.3
    ),
    ModelLaunchSettings("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
-    ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
-    ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 14.0),
+    ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.259, 36.3),
+    ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 17.0),
    ModelLaunchSettings("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3),
    ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
+    ModelLaunchSettings(
+        "Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]
+    ): ModelEvalMetrics(0.29, 29.1),
    ModelLaunchSettings(
        "unsloth/Mistral-Small-3.1-24B-Instruct-2503"
    ): ModelEvalMetrics(0.310, 16.7),
--- a/test/srt/test_nightly_vlms_perf.py
+++ b/test/srt/test_nightly_vlms_perf.py
@@ -3,7 +3,7 @@ import subprocess
 import unittest
 import warnings

-from sglang.bench_one_batch_server import BenchmarkResult
+from sglang.bench_one_batch_server import BenchmarkResult, generate_markdown_report
 from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -27,6 +27,7 @@ MODEL_DEFAULTS = [
    ModelLaunchSettings(
        "google/gemma-3-27b-it",
    ),
+    ModelLaunchSettings("Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]),
    # "OpenGVLab/InternVL2_5-2B",
    # buggy in official transformers impl
    # "openbmb/MiniCPM-V-2_6",
@@ -45,9 +46,7 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
            cls.models = []
            model_paths = parse_models(nightly_vlm_models_str)
            for model_path in model_paths:
-                cls.models.append(
-                    ModelLaunchSettings(model_path, extra_args=VLM_EXTRA_ARGS)
-                )
+                cls.models.append(ModelLaunchSettings(model_path))
        else:
            cls.models = MODEL_DEFAULTS

@@ -60,6 +59,7 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):

    def test_bench_one_batch(self):
        all_benchmark_results = []
+        all_model_succeed = True

        for model_setup in self.models:
            benchmark_results = []
@@ -112,7 +112,6 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
                            f"Error running benchmark for {model_setup.model_path} with batch size:"
                        )
                        print(result.stderr)
-                        # Continue to next batch size even if one fails
                        continue

                    print(f"Output for {model_setup.model_path} with batch size:")
@@ -136,19 +135,24 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
                        )

                    else:
+                        all_model_succeed = False
                        print(f"Warning: JSON output file {json_output_file} not found")

                finally:
                    kill_process_tree(process.pid)

-                report_part = BenchmarkResult.generate_markdown_report(
-                    PROFILE_DIR, benchmark_results
+                report_part = generate_markdown_report(
+                    PROFILE_DIR,
+                    benchmark_results,
                )
                self.full_report += report_part + "\n"

        if is_in_ci():
            write_github_step_summary(self.full_report)

+        if not all_model_succeed:
+            raise AssertionError("Some models failed the perf tests.")
+

 if __name__ == "__main__":
    unittest.main()