[e2e] Fixed the issue that pyhccl e2e cannot run continuously with other tests (#1246)
### What this PR does / why we need it?
1. Fixed the issue that the pyhccl e2e test could not run continuously with other tests.
2. Cleaned up the resources occupied by the dynamic_npugraph_batchsize e2e test.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
This patch is covered by e2e tests; the multi-card e2e tests run successfully locally.
- vLLM version: v0.9.2
- vLLM main:
0df4d9b06b
Signed-off-by: leo-pony <nengjunma@outlook.com>
```diff
@@ -16,7 +16,9 @@
 #
 import pytest
 import torch
-from vllm import LLM, SamplingParams
+from vllm import SamplingParams
+
+from tests.e2e.conftest import VllmRunner
 
 MODELS = [
     "Qwen/Qwen2.5-0.5B-Instruct",
@@ -38,20 +40,20 @@ prompts = [
 def test_models(model: str, tp_size: int, max_tokens: int, temperature: int,
                 ignore_eos: bool) -> None:
-    # Create an LLM.
-    llm = LLM(
-        model=model,
-        tensor_parallel_size=tp_size,
-    )
-    # Prepare sampling_parames
-    sampling_params = SamplingParams(
-        max_tokens=max_tokens,
-        temperature=temperature,
-        ignore_eos=ignore_eos,
-    )
-
-    # Generate texts from the prompts.
-    # The output is a list of RequestOutput objects
-    outputs = llm.generate(prompts, sampling_params)
-    torch.npu.synchronize()
-    # The output length should be equal to prompts length.
-    assert len(outputs) == len(prompts)
+    with VllmRunner(
+            model_name=model,
+            tensor_parallel_size=tp_size,
+    ) as vllm_model:
+        # Prepare sampling_parames
+        sampling_params = SamplingParams(
+            max_tokens=max_tokens,
+            temperature=temperature,
+            ignore_eos=ignore_eos,
+        )
+
+        # Generate texts from the prompts.
+        # The output is a list of RequestOutput objects
+        outputs = vllm_model.generate(prompts, sampling_params)
+        torch.npu.synchronize()
+        # The output length should be equal to prompts length.
+        assert len(outputs) == len(prompts)
```
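For context, the fix works because `VllmRunner` is used as a context manager, so engine teardown runs even when an assertion fails mid-test, leaving the NPUs free for the next test in the same session. Below is a minimal sketch of that pattern, using a hypothetical `NpuRunnerSketch` class; the real `VllmRunner` lives in `tests/e2e/conftest.py` and its exact constructor arguments and cleanup steps may differ.

```python
# Minimal sketch of the context-manager cleanup pattern, assuming a
# hypothetical NpuRunnerSketch stand-in for tests.e2e.conftest.VllmRunner.
import gc

import torch
from vllm import LLM, SamplingParams


class NpuRunnerSketch:
    """Hypothetical stand-in for VllmRunner; not the actual implementation."""

    def __init__(self, model_name: str, tensor_parallel_size: int = 1):
        self.llm = LLM(model=model_name,
                       tensor_parallel_size=tensor_parallel_size)

    def generate(self, prompts, sampling_params: SamplingParams):
        return self.llm.generate(prompts, sampling_params)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Release the engine and reclaim device memory even if the test
        # body raised, so the next test starts from a clean NPU.
        del self.llm
        gc.collect()
        # Assumes torch_npu is loaded and torch.npu mirrors the torch.cuda
        # memory API, as in the torch.npu.synchronize() call in the diff.
        torch.npu.empty_cache()
        return False  # never swallow test failures
```

With this shape, `with NpuRunnerSketch(model, tp_size) as runner:` guarantees cleanup on block exit, whereas the old code kept `llm` alive after the test returned, which is what blocked the pyhccl test from running afterwards in the same session.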
||||