From 2e7633982ce51ffd360ca8932cd0f1254daa4e11 Mon Sep 17 00:00:00 2001
From: Mick <mickjagger19@icloud.com>
Date: Mon, 29 Sep 2025 03:38:29 +0800
Subject: [PATCH] fix: show failed models in nightly ci (#10986)

---
 python/sglang/bench_one_batch_server.py       |  5 +-
 python/sglang/srt/server_args.py              |  1 -
 python/sglang/test/test_utils.py              | 57 +++++++++++--------
 .../test_nightly_text_models_gsm8k_eval.py    |  2 +-
 test/srt/test_nightly_vlms_mmmu_eval.py       |  2 +-
 5 files changed, 37 insertions(+), 30 deletions(-)

diff --git a/python/sglang/bench_one_batch_server.py b/python/sglang/bench_one_batch_server.py
index 0f2b6bc26..711236b3c 100644
--- a/python/sglang/bench_one_batch_server.py
+++ b/python/sglang/bench_one_batch_server.py
@@ -66,9 +66,8 @@ class BenchmarkResult(BaseModel):
     def help_str() -> str:
         return f"""
 Note: To view the traces through perfetto-ui, please:
-1. use Google Chrome
-2. enable popup
-
+    1. open with Google Chrome
+    2. allow popup
 """
 
     def to_markdown_row(
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index e1b53e6f9..1535d3a2a 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -51,7 +51,6 @@ from sglang.utils import is_in_ci
 
 logger = logging.getLogger(__name__)
 
-
 # Define constants
 LOAD_FORMAT_CHOICES = [
     "auto",
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index 410f0aa99..1c5cd2fd1 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -1518,31 +1518,45 @@ def check_evaluation_test_results(
         summary = " | model | status | score | score_threshold | \n"
         summary += "| ----- | ------ | ----- | --------------- | \n"
 
-    for model, accuracy, latency in results:
-        accuracy_threshold = model_accuracy_thresholds.get(model)
-        if accuracy_threshold is None:
-            print(f"Warning: No threshold defined for model {model}")
-            continue
+    results_dict = {res[0]: (res[1], res[2]) for res in results}
 
+    for model, accuracy_threshold in sorted(model_accuracy_thresholds.items()):
         latency_threshold = (
-            model_latency_thresholds.get(model, None)
-            if model_latency_thresholds
+            model_latency_thresholds.get(model)
+            if model_latency_thresholds is not None
             else 1e9
         )
 
-        is_success = accuracy >= accuracy_threshold and latency <= latency_threshold
-        status_emoji = "✅" if is_success else "❌"
+        if model in results_dict:
+            accuracy, latency = results_dict[model]
+            is_success = accuracy >= accuracy_threshold and latency <= latency_threshold
+            status_emoji = "✅" if is_success else "❌"
 
-        if not is_success:
-            failed_models.append(
-                f"\nScore Check Failed: {model}\n"
-                f"Model {model} score ({accuracy:.4f}) is below threshold ({accuracy_threshold:.4f})"
-            )
+            if not is_success:
+                if accuracy < accuracy_threshold:
+                    failed_models.append(
+                        f"\nScore Check Failed: {model}\n"
+                        f"Model {model} score ({accuracy:.4f}) is below threshold ({accuracy_threshold:.4f})"
+                    )
+                if latency > latency_threshold:
+                    failed_models.append(
+                        f"\nLatency Check Failed: {model}\n"
+                        f"Model {model} latency ({latency:.4f}) is above threshold ({latency_threshold:.4f})"
+                    )
 
-        if model_latency_thresholds is not None:
-            line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold} | {latency} | {latency_threshold}\n"
+            if model_latency_thresholds is not None:
+                line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold} | {latency} | {latency_threshold}\n"
+            else:
+                line = (
+                    f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold}\n"
+                )
         else:
-            line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold}\n"
+            status_emoji = "❌"
+            failed_models.append(f"Model failed to launch or be evaluated: {model}")
+            if model_latency_thresholds is not None:
+                line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold} | N/A | {latency_threshold}\n"
+            else:
+                line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold}\n"
 
         summary += line
 
@@ -1551,13 +1565,8 @@ def check_evaluation_test_results(
     if is_in_ci():
         write_github_step_summary(f"## {test_name}\n{summary}")
 
-    some_model_failed_to_get_result = len(results) != (
-        model_count or len(model_accuracy_thresholds)
-    )
-    if some_model_failed_to_get_result:
-        print("Some model has failed to launch and be evaluated")
-
-    if failed_models or some_model_failed_to_get_result:
+    if failed_models:
+        print("Some models failed the evaluation.")
         raise AssertionError("\n".join(failed_models))
 
 
diff --git a/test/srt/test_nightly_text_models_gsm8k_eval.py b/test/srt/test_nightly_text_models_gsm8k_eval.py
index 07c95952e..a69019286 100644
--- a/test/srt/test_nightly_text_models_gsm8k_eval.py
+++ b/test/srt/test_nightly_text_models_gsm8k_eval.py
@@ -24,7 +24,7 @@ MODEL_SCORE_THRESHOLDS = {
     "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85,
     "google/gemma-2-27b-it": 0.91,
     "meta-llama/Llama-3.1-70B-Instruct": 0.95,
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.62,
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.616,
     "Qwen/Qwen2-57B-A14B-Instruct": 0.86,
     "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83,
     "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
diff --git a/test/srt/test_nightly_vlms_mmmu_eval.py b/test/srt/test_nightly_vlms_mmmu_eval.py
index dc12fa125..be3230a66 100644
--- a/test/srt/test_nightly_vlms_mmmu_eval.py
+++ b/test/srt/test_nightly_vlms_mmmu_eval.py
@@ -24,7 +24,7 @@ MODEL_THRESHOLDS = {
     ),
     ModelDeploySetup("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
     ModelDeploySetup("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
-    ModelDeploySetup("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 14.5),
+    ModelDeploySetup("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
     ModelDeploySetup("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(0.330, 22.3),
     ModelDeploySetup("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
     ModelDeploySetup("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),