[CI] Fix oom in chunk prefill (#1622)
### What this PR does / why we need it?
Add resource-clearing logic to fix the OOM issue when running `tests/e2e/singlecard/core/ascend_scheduler`.

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with the existing tests.

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
@@ -1,8 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import gc
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from vllm import LLM
|
||||
|
||||
if os.getenv("VLLM_USE_V1", "0") != "1":
|
||||
@@ -13,8 +15,8 @@ PROMPT = "Hello my name is Robert and I"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def model() -> LLM:
|
||||
return LLM(
|
||||
def model():
|
||||
llm = LLM(
|
||||
MODEL,
|
||||
enforce_eager=True,
|
||||
enable_prefix_caching=True,
|
||||
@@ -23,6 +25,10 @@ def model() -> LLM:
|
||||
additional_config={"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
}})
|
||||
yield llm
|
||||
del llm
|
||||
torch.npu.empty_cache()
|
||||
gc.collect()
|
||||
|
||||
|
||||
def test_concurrent_partial_prefill(model):
|
||||
@@ -37,4 +43,4 @@ def test_prefix_cache_stats_is_recorded(model):
|
||||
input_tokens = {"prompt_token_ids": [101] * 129}
|
||||
_ = model.generate([input_tokens])
|
||||
outputs = model.generate([input_tokens])
|
||||
assert outputs[0].num_cached_tokens == 128
|
||||
assert outputs[0].num_cached_tokens == 128
|
||||
|
||||
@@ -17,7 +17,6 @@ MODELS = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.skipif(True, reason="oom in 910B4, fix me please")
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens",
|
||||
[4]) # cannot align results when max_tokens > 4
|
||||
|
||||
Reference in New Issue
Block a user