From 808d00406f4f25a9196ee687b84d06634339b413 Mon Sep 17 00:00:00 2001
From: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com>
Date: Wed, 15 Apr 2026 17:16:27 +0800
Subject: [PATCH] [v0.18.0][CI] Add rank0 process count check for DeepSeek-R1-W8A8-HBM test (#8072)

### What this PR does / why we need it?

Adds a `check_rank0_process_count` validation step to the DeepSeek-R1-W8A8-HBM
nightly single-node test. The check verifies that after the server starts,
there is **exactly 1** `vllm serve` process running on rank0. This guards
against the regression fixed in #8041 (extra NPU context leaking on device 0),
ensuring it does not silently reappear in future releases.

#### Changes

- **`tests/e2e/nightly/single_node/models/scripts/test_single_node.py`**: Add a
  `run_check_rank0_process_count` async handler. It calls `npu-smi info` for
  diagnostics, then uses `psutil` to assert that exactly one `vllm serve`
  process exists on rank0; see the sketch after this list.
- **`tests/e2e/nightly/single_node/models/configs/DeepSeek-R1-W8A8-HBM.yaml`**:
  Register `check_rank0_process_count` in the `test_content` list for the HBM
  test case.
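The core of the check, as a minimal standalone sketch (the full handler in the
diff below additionally dumps `npu-smi info` for diagnostics before counting):

```python
import psutil

# Enumerate processes whose command line mentions both "vllm" and "serve".
# After a healthy single-node start, exactly one such process should exist;
# a second one would indicate a #8041-style leak of extra state onto rank0.
serve_procs = [
    p for p in psutil.process_iter(attrs=["pid", "cmdline"])
    if p.info["cmdline"]
    and any("vllm" in arg for arg in p.info["cmdline"])
    and any("serve" in arg for arg in p.info["cmdline"])
]
assert len(serve_procs) == 1, f"expected 1 vllm serve process, found {len(serve_procs)}"
```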
Signed-off-by: hfadzxy
---
 .../models/configs/DeepSeek-R1-W8A8-HBM.yaml  |  3 +++
 .../models/scripts/test_single_node.py        | 27 +++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/tests/e2e/nightly/single_node/models/configs/DeepSeek-R1-W8A8-HBM.yaml b/tests/e2e/nightly/single_node/models/configs/DeepSeek-R1-W8A8-HBM.yaml
index d1f7f93e..c6f855bc 100644
--- a/tests/e2e/nightly/single_node/models/configs/DeepSeek-R1-W8A8-HBM.yaml
+++ b/tests/e2e/nightly/single_node/models/configs/DeepSeek-R1-W8A8-HBM.yaml
@@ -39,4 +39,7 @@ test_cases:
       - "--enforce-eager"
       - "--additional-config"
       - '{"ascend_scheduler_config": {"enabled": false}, "torchair_graph_config": {"enabled": false, "enable_multistream_shared_expert": false}}'
+    test_content:
+      - completion
+      - check_rank0_process_count
     benchmarks:
diff --git a/tests/e2e/nightly/single_node/models/scripts/test_single_node.py b/tests/e2e/nightly/single_node/models/scripts/test_single_node.py
index 3b278e46..703b8da9 100644
--- a/tests/e2e/nightly/single_node/models/scripts/test_single_node.py
+++ b/tests/e2e/nightly/single_node/models/scripts/test_single_node.py
@@ -1,7 +1,9 @@
+import asyncio
 import logging
 from typing import Any
 
 import openai
+import psutil
 import pytest
 import subprocess
 import sys
@@ -107,11 +109,36 @@ def run_benchmark_comparisons(config: SingleNodeConfig, results: Any) -> None:
     print(f"✅ Comparison passed: {eval_str} [threshold: {expected_threshold}]")
 
 
+async def run_check_rank0_process_count(config: SingleNodeConfig, server: "RemoteOpenAIServer | DisaggEpdProxy") -> None:
+    proc = await asyncio.create_subprocess_exec(
+        "npu-smi", "info",
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.PIPE,
+    )
+    stdout_bytes, stderr_bytes = await proc.communicate()
+    if proc.returncode == 0:
+        logger.info("npu-smi info:\n%s", stdout_bytes.decode(errors='ignore'))
+    else:
+        logger.warning("npu-smi info failed: %s", stderr_bytes.decode(errors='ignore'))
+
+    vllm_serve_procs = [
+        p for p in psutil.process_iter(attrs=["pid", "cmdline"], ad_value=None)
+        if p.info["cmdline"]
+        and any("vllm" in arg for arg in p.info["cmdline"])
+        and any("serve" in arg for arg in p.info["cmdline"])
+    ]
+    count = len(vllm_serve_procs)
+    assert count == 1, (
+        f"rank0 process count check failed: expected exactly 1 vllm serve process on rank0, found {count}"
+    )
+
+
 # Extend this dictionary to add new test capabilities
 TEST_HANDLERS = {
     "completion": run_completion_test,
     "image": run_image_test,
     "chat_completion": run_chat_completion_test,
+    "check_rank0_process_count": run_check_rank0_process_count,
 }
 
 
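Note for reviewers (not part of the patch): entries in a config's
`test_content` list are resolved by name against `TEST_HANDLERS`. The dispatch
loop itself is not shown in this diff; the sketch below only illustrates the
assumed lookup, and the shared `(config, server)` handler signature is an
inference from the new handler's definition:

```python
# Hypothetical dispatch, inferred from TEST_HANDLERS and the handler signature;
# the real loop lives elsewhere in test_single_node.py and may differ.
for name in config.test_content:   # e.g. ["completion", "check_rank0_process_count"]
    handler = TEST_HANDLERS[name]  # unregistered names fail fast with KeyError
    await handler(config, server)  # each registered handler is awaited in turn
```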