[CI] Fix oom in chunk prefill (#1622)
### What this PR does / why we need it? Add resource-clearing logic to fix an OOM issue when testing `tests/e2e/singlecard/core/ascend_scheduler`. ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing tests. --------- Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
@@ -1,8 +1,10 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import gc
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
import torch
|
||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
|
|
||||||
if os.getenv("VLLM_USE_V1", "0") != "1":
|
if os.getenv("VLLM_USE_V1", "0") != "1":
|
||||||
@@ -13,8 +15,8 @@ PROMPT = "Hello my name is Robert and I"
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def model() -> LLM:
|
def model():
|
||||||
return LLM(
|
llm = LLM(
|
||||||
MODEL,
|
MODEL,
|
||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
enable_prefix_caching=True,
|
enable_prefix_caching=True,
|
||||||
@@ -23,6 +25,10 @@ def model() -> LLM:
|
|||||||
additional_config={"ascend_scheduler_config": {
|
additional_config={"ascend_scheduler_config": {
|
||||||
"enabled": True,
|
"enabled": True,
|
||||||
}})
|
}})
|
||||||
|
yield llm
|
||||||
|
del llm
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
gc.collect()
|
||||||
|
|
||||||
|
|
||||||
def test_concurrent_partial_prefill(model):
|
def test_concurrent_partial_prefill(model):
|
||||||
@@ -37,4 +43,4 @@ def test_prefix_cache_stats_is_recorded(model):
|
|||||||
input_tokens = {"prompt_token_ids": [101] * 129}
|
input_tokens = {"prompt_token_ids": [101] * 129}
|
||||||
_ = model.generate([input_tokens])
|
_ = model.generate([input_tokens])
|
||||||
outputs = model.generate([input_tokens])
|
outputs = model.generate([input_tokens])
|
||||||
assert outputs[0].num_cached_tokens == 128
|
assert outputs[0].num_cached_tokens == 128
|
||||||
|
|||||||
@@ -17,7 +17,6 @@ MODELS = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(True, reason="oom in 910B4, fix me please")
|
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
@pytest.mark.parametrize("max_tokens",
|
@pytest.mark.parametrize("max_tokens",
|
||||||
[4]) # cannot align results when max_tokens > 4
|
[4]) # cannot align results when max_tokens > 4
|
||||||
|
|||||||
Reference in New Issue
Block a user