[CI] Fix oom in chunk prefill (#1622)

### What this PR does / why we need it?
Add resource cleanup logic to fix the OOM issue observed when running
`tests/e2e/singlecard/core/ascend_scheduler`.
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
CI passed with existing test.

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
Mengqing Cao
2025-07-07 10:14:40 +08:00
committed by GitHub
parent c58accc15e
commit 7efa4e92fe
2 changed files with 9 additions and 4 deletions

View File

@@ -1,8 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import os
import pytest
import torch
from vllm import LLM
if os.getenv("VLLM_USE_V1", "0") != "1":
@@ -13,8 +15,8 @@ PROMPT = "Hello my name is Robert and I"
@pytest.fixture(scope="module")
def model() -> LLM:
return LLM(
def model():
llm = LLM(
MODEL,
enforce_eager=True,
enable_prefix_caching=True,
@@ -23,6 +25,10 @@ def model() -> LLM:
additional_config={"ascend_scheduler_config": {
"enabled": True,
}})
yield llm
del llm
torch.npu.empty_cache()
gc.collect()
def test_concurrent_partial_prefill(model):
@@ -37,4 +43,4 @@ def test_prefix_cache_stats_is_recorded(model):
input_tokens = {"prompt_token_ids": [101] * 129}
_ = model.generate([input_tokens])
outputs = model.generate([input_tokens])
assert outputs[0].num_cached_tokens == 128
assert outputs[0].num_cached_tokens == 128

View File

@@ -17,7 +17,6 @@ MODELS = [
]
@pytest.mark.skipif(True, reason="oom in 910B4, fix me please")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens",
[4]) # cannot align results when max_tokens > 4