From 808d00406f4f25a9196ee687b84d06634339b413 Mon Sep 17 00:00:00 2001
From: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com>
Date: Wed, 15 Apr 2026 17:16:27 +0800
Subject: [PATCH] [v0.18.0][CI] Add rank0 process count check for DeepSeek-R1-W8A8-HBM test (#8072)

### What this PR does / why we need it?

Adds a `check_rank0_process_count` validation step to the DeepSeek-R1-W8A8-HBM
nightly single-node test. The check verifies that after the server starts,
there is **exactly 1** `vllm serve` process running on rank0. This guards
against the regression fixed in #8041 (extra NPU context leaking on device 0),
ensuring it does not silently reappear in future releases.

#### Changes

- **`tests/e2e/nightly/single_node/models/scripts/test_single_node.py`**: Add a
  `run_check_rank0_process_count` async handler. It calls `npu-smi info` for
  diagnostics, then uses `psutil` to assert that exactly one `vllm serve`
  process exists on rank0; see the sketch after this list.
- **`tests/e2e/nightly/single_node/models/configs/DeepSeek-R1-W8A8-HBM.yaml`**:
  Register `check_rank0_process_count` in the `test_content` list for the HBM
  test case.
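The core of the check, as a minimal standalone sketch (the full handler in the
diff below additionally dumps `npu-smi info` for diagnostics before counting):

```python
import psutil

# Enumerate processes whose command line mentions both "vllm" and "serve".
# After a healthy single-node start, exactly one such process should exist;
# a second one would indicate a #8041-style leak of extra state onto rank0.
serve_procs = [
    p for p in psutil.process_iter(attrs=["pid", "cmdline"])
    if p.info["cmdline"]
    and any("vllm" in arg for arg in p.info["cmdline"])
    and any("serve" in arg for arg in p.info["cmdline"])
]
assert len(serve_procs) == 1, f"expected 1 vllm serve process, found {len(serve_procs)}"
```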
Signed-off-by: hfadzxy
---
 .../models/configs/DeepSeek-R1-W8A8-HBM.yaml  |  3 +++
 .../models/scripts/test_single_node.py        | 27 +++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/tests/e2e/nightly/single_node/models/configs/DeepSeek-R1-W8A8-HBM.yaml b/tests/e2e/nightly/single_node/models/configs/DeepSeek-R1-W8A8-HBM.yaml
index d1f7f93e..c6f855bc 100644
--- a/tests/e2e/nightly/single_node/models/configs/DeepSeek-R1-W8A8-HBM.yaml
+++ b/tests/e2e/nightly/single_node/models/configs/DeepSeek-R1-W8A8-HBM.yaml
@@ -39,4 +39,7 @@ test_cases:
       - "--enforce-eager"
       - "--additional-config"
       - '{"ascend_scheduler_config": {"enabled": false}, "torchair_graph_config": {"enabled": false, "enable_multistream_shared_expert": false}}'
+    test_content:
+      - completion
+      - check_rank0_process_count
     benchmarks:
diff --git a/tests/e2e/nightly/single_node/models/scripts/test_single_node.py b/tests/e2e/nightly/single_node/models/scripts/test_single_node.py
index 3b278e46..703b8da9 100644
--- a/tests/e2e/nightly/single_node/models/scripts/test_single_node.py
+++ b/tests/e2e/nightly/single_node/models/scripts/test_single_node.py
@@ -1,7 +1,9 @@
+import asyncio
 import logging
 from typing import Any
 
 import openai
+import psutil
 import pytest
 import subprocess
 import sys
@@ -107,11 +109,36 @@ def run_benchmark_comparisons(config: SingleNodeConfig, results: Any) -> None:
     print(f"✅ Comparison passed: {eval_str} [threshold: {expected_threshold}]")
 
 
+async def run_check_rank0_process_count(config: SingleNodeConfig, server: "RemoteOpenAIServer | DisaggEpdProxy") -> None:
+    proc = await asyncio.create_subprocess_exec(
+        "npu-smi", "info",
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.PIPE,
+    )
+    stdout_bytes, stderr_bytes = await proc.communicate()
+    if proc.returncode == 0:
+        logger.info("npu-smi info:\n%s", stdout_bytes.decode(errors='ignore'))
+    else:
+        logger.warning("npu-smi info failed: %s", stderr_bytes.decode(errors='ignore'))
+
+    vllm_serve_procs = [
+        p for p in psutil.process_iter(attrs=["pid", "cmdline"], ad_value=None)
+        if p.info["cmdline"]
+        and any("vllm" in arg for arg in p.info["cmdline"])
+        and any("serve" in arg for arg in p.info["cmdline"])
+    ]
+    count = len(vllm_serve_procs)
+    assert count == 1, (
+        f"rank0 process count check failed: expected exactly 1 vllm serve process on rank0, found {count}"
+    )
+
+
 # Extend this dictionary to add new test capabilities
 TEST_HANDLERS = {
     "completion": run_completion_test,
     "image": run_image_test,
     "chat_completion": run_chat_completion_test,
+    "check_rank0_process_count": run_check_rank0_process_count,
 }
 
 
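Note for reviewers (not part of the patch): entries in a config's
`test_content` list are resolved by name against `TEST_HANDLERS`. The dispatch
loop itself is not shown in this diff; the sketch below only illustrates the
assumed lookup, and the shared `(config, server)` handler signature is an
inference from the new handler's definition:

```python
# Hypothetical dispatch, inferred from TEST_HANDLERS and the handler signature;
# the real loop lives elsewhere in test_single_node.py and may differ.
for name in config.test_content:   # e.g. ["completion", "check_rank0_process_count"]
    handler = TEST_HANDLERS[name]  # unregistered names fail fast with KeyError
    await handler(config, server)  # each registered handler is awaited in turn
```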