[CI] Fix oom in chunk prefill (#1622)
### What this PR does / why we need it? Add resource-clearing logic to fix an OOM issue when testing `tests/e2e/singlecard/core/ascend_scheduler`. ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing tests. --------- Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
@@ -1,8 +1,10 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import gc
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
import torch
|
||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
|
|
||||||
if os.getenv("VLLM_USE_V1", "0") != "1":
|
if os.getenv("VLLM_USE_V1", "0") != "1":
|
||||||
@@ -13,8 +15,8 @@ PROMPT = "Hello my name is Robert and I"
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def model() -> LLM:
|
def model():
|
||||||
return LLM(
|
llm = LLM(
|
||||||
MODEL,
|
MODEL,
|
||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
enable_prefix_caching=True,
|
enable_prefix_caching=True,
|
||||||
@@ -23,6 +25,10 @@ def model() -> LLM:
|
|||||||
additional_config={"ascend_scheduler_config": {
|
additional_config={"ascend_scheduler_config": {
|
||||||
"enabled": True,
|
"enabled": True,
|
||||||
}})
|
}})
|
||||||
|
yield llm
|
||||||
|
del llm
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
gc.collect()
|
||||||
|
|
||||||
|
|
||||||
def test_concurrent_partial_prefill(model):
|
def test_concurrent_partial_prefill(model):
|
||||||
@@ -37,4 +43,4 @@ def test_prefix_cache_stats_is_recorded(model):
|
|||||||
input_tokens = {"prompt_token_ids": [101] * 129}
|
input_tokens = {"prompt_token_ids": [101] * 129}
|
||||||
_ = model.generate([input_tokens])
|
_ = model.generate([input_tokens])
|
||||||
outputs = model.generate([input_tokens])
|
outputs = model.generate([input_tokens])
|
||||||
assert outputs[0].num_cached_tokens == 128
|
assert outputs[0].num_cached_tokens == 128
|
||||||
|
|||||||
@@ -17,7 +17,6 @@ MODELS = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(True, reason="oom in 910B4, fix me please")
|
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
@pytest.mark.parametrize("max_tokens",
|
@pytest.mark.parametrize("max_tokens",
|
||||||
[4]) # cannot align results when max_tokens > 4
|
[4]) # cannot align results when max_tokens > 4
|
||||||
|
|||||||
Reference in New Issue
Block a user