[v0.18.0][CI]Add rank0 process count check for DeepSeek-R1-W8A8-HBM test (#8072)

### What this PR does / why we need it?
Adds a `check_rank0_process_count` validation step to the
DeepSeek-R1-W8A8-HBM nightly single-node test.

The check verifies that after the server starts, there is **exactly 1**
`vllm serve` process running on rank0. This guards against the
regression fixed in #8041 (extra NPU context leaking on device 0),
ensuring it does not silently reappear in future releases.

#### Changes

- **`tests/e2e/nightly/single_node/models/scripts/test_single_node.py`**:
  Add `run_check_rank0_process_count` async handler. It calls `npu-smi info`
  for diagnostics, then uses `psutil` to assert exactly one `vllm serve`
  process exists on rank0.
- **`tests/e2e/nightly/single_node/models/configs/DeepSeek-R1-W8A8-HBM.yaml`**:
  Register `check_rank0_process_count` in the `test_content` list for the
  HBM test case.

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
zhangxinyuehfad
2026-04-15 17:16:27 +08:00
committed by GitHub
parent 95726d20eb
commit 808d00406f
2 changed files with 30 additions and 0 deletions

View File

@@ -39,4 +39,7 @@ test_cases:
- "--enforce-eager" - "--enforce-eager"
- "--additional-config" - "--additional-config"
- '{"ascend_scheduler_config": {"enabled": false}, "torchair_graph_config": {"enabled": false, "enable_multistream_shared_expert": false}}' - '{"ascend_scheduler_config": {"enabled": false}, "torchair_graph_config": {"enabled": false, "enable_multistream_shared_expert": false}}'
test_content:
- completion
- check_rank0_process_count
benchmarks: benchmarks:

View File

@@ -1,7 +1,9 @@
import asyncio
import logging import logging
from typing import Any from typing import Any
import openai import openai
import psutil
import pytest import pytest
import subprocess import subprocess
import sys import sys
@@ -107,11 +109,36 @@ def run_benchmark_comparisons(config: SingleNodeConfig, results: Any) -> None:
print(f"✅ Comparison passed: {eval_str} [threshold: {expected_threshold}]") print(f"✅ Comparison passed: {eval_str} [threshold: {expected_threshold}]")
async def run_check_rank0_process_count(config: SingleNodeConfig, server: "RemoteOpenAIServer | DisaggEpdProxy") -> None:
    """Assert that exactly one ``vllm serve`` process is running on rank0.

    Guards against the regression where an extra NPU context leaking onto
    device 0 spawned a duplicate server process.  ``npu-smi info`` is run
    first purely for diagnostic logging; its failure is non-fatal.
    """
    # Diagnostic snapshot of the NPU state before counting processes.
    smi_proc = await asyncio.create_subprocess_exec(
        "npu-smi", "info",
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    out_bytes, err_bytes = await smi_proc.communicate()
    if smi_proc.returncode != 0:
        logger.warning("npu-smi info failed: %s", err_bytes.decode(errors='ignore'))
    else:
        logger.info("npu-smi info:\n%s", out_bytes.decode(errors='ignore'))

    def _looks_like_vllm_serve(cmdline):
        # NOTE(review): substring match — any cmdline with "vllm" in one arg
        # and "serve" in another (not necessarily adjacent) is counted.
        if not cmdline:
            return False
        has_vllm = any("vllm" in arg for arg in cmdline)
        has_serve = any("serve" in arg for arg in cmdline)
        return has_vllm and has_serve

    # ad_value=None fills in cmdline for processes that vanished mid-scan;
    # those are filtered out by the falsy-cmdline guard above.
    matching = [
        proc
        for proc in psutil.process_iter(attrs=["pid", "cmdline"], ad_value=None)
        if _looks_like_vllm_serve(proc.info["cmdline"])
    ]
    found = len(matching)
    assert found == 1, (
        f"rank0 process count check failed: expected exactly 1 vllm serve process on rank0, found {found}"
    )
# Extend this dictionary to add new test capabilities # Extend this dictionary to add new test capabilities
TEST_HANDLERS = { TEST_HANDLERS = {
"completion": run_completion_test, "completion": run_completion_test,
"image": run_image_test, "image": run_image_test,
"chat_completion": run_chat_completion_test, "chat_completion": run_chat_completion_test,
"check_rank0_process_count": run_check_rank0_process_count,
} }