From 387ce1cc5b0eadff99a38c66743df7e994134d48 Mon Sep 17 00:00:00 2001
From: lilinsiman
Date: Fri, 31 Oct 2025 09:17:09 +0800
Subject: [PATCH] add new e2e test case for aclgraph memory to v0.11.0 (#3880)

### What this PR does / why we need it?
Add a new e2e test case for ACL graph capture memory use to v0.11.0.

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
ut: the new case `tests/e2e/singlecard/test_aclgraph_mem.py` is wired into
the e2e workflow below.

Signed-off-by: lilinsiman
---
 .github/workflows/_e2e_test.yaml          |   1 +
 tests/e2e/singlecard/test_aclgraph_mem.py | 100 ++++++++++++++++++++++
 2 files changed, 101 insertions(+)
 create mode 100644 tests/e2e/singlecard/test_aclgraph_mem.py

diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index ddbf4b3..9007a85 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -89,6 +89,7 @@ jobs:
           # the test separately.
           pytest -sv tests/e2e/singlecard/test_aclgraph.py
+          pytest -sv tests/e2e/singlecard/test_aclgraph_mem.py
           pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
           pytest -sv tests/e2e/singlecard/test_bge_model.py
           pytest -sv tests/e2e/singlecard/test_camem.py
diff --git a/tests/e2e/singlecard/test_aclgraph_mem.py b/tests/e2e/singlecard/test_aclgraph_mem.py
new file mode 100644
index 0000000..c7e5078
--- /dev/null
+++ b/tests/e2e/singlecard/test_aclgraph_mem.py
@@ -0,0 +1,100 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import multiprocessing
+import os
+from unittest.mock import patch
+
+import pytest
+import torch
+from modelscope import snapshot_download  # type: ignore
+from vllm import SamplingParams
+
+from tests.e2e.conftest import VllmRunner
+from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
+
+MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
+
+
+@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
+                    reason="aclgraph is only supported on v1")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("max_tokens", [4])
+@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
+def test_aclgraph_mem_use(model: str, max_tokens: int) -> None:
+    os.environ.pop("VLLM_WORKER_MULTIPROC_METHOD", None)  # fall back to fork
+    capture_called = multiprocessing.Value("i", 0)  # int, 0 or 1
+    capture_mem_before = multiprocessing.Value("q", -1)  # long long (64-bit)
+    capture_mem_after = multiprocessing.Value("q", -1)  # long long
+
+    def capture_model_wrapper(original_method):
+
+        def wrapped(self):
+            mem_before = torch.npu.mem_get_info()[0]  # free memory
+            result = original_method(self)
+            mem_after = torch.npu.mem_get_info()[0]
+            with capture_called.get_lock():
+                capture_called.value = 1
+                capture_mem_before.value = mem_before
+                capture_mem_after.value = mem_after
+            return result
+
+        return wrapped
+
+    original_capture = NPUModelRunner._capture_model
+
+    with patch.object(NPUModelRunner,
+                      '_capture_model',
+                      new=capture_model_wrapper(original_capture)):
+        prompts = [
+            "Hello, my name is", "The president of the United States is",
+            "The capital of France is", "The future of AI is"
+        ]
+        sampling_params = SamplingParams(max_tokens=max_tokens,
+                                         temperature=0.0)
+        if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
+            vllm_model = VllmRunner(snapshot_download(model),
+                                    max_model_len=1024,
+                                    quantization="ascend")
+        else:
+            vllm_model = VllmRunner(snapshot_download(model))
+        _ = vllm_model.generate(prompts, sampling_params)
+
+    assert capture_called.value == 1, "_capture_model was not called during test"
+    assert capture_mem_before.value != -1, "capture_mem_before not set"
+    assert capture_mem_after.value != -1, "capture_mem_after not set"
+
+    print("capture_mem_before =", capture_mem_before.value)
+    print("capture_mem_after =", capture_mem_after.value)
+
+    mem_used_by_capture = capture_mem_before.value - capture_mem_after.value
+    # Empirical observation: capturing ACL graphs uses ~0.20 GiB of NPU memory
+    # for Qwen3-0.6B and ~0.68 GiB for DeepSeek-V2-Lite-W8A8; a tolerance
+    # factor (1.3x and 1.5x respectively) accounts for runtime variance.
+    if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
+        baseline_capture_mem = 0.68
+        capture_mem_tolerance = 1.5
+    else:
+        baseline_capture_mem = 0.20
+        capture_mem_tolerance = 1.3
+    max_capture_mem_gib = baseline_capture_mem * capture_mem_tolerance
+    max_mem_expected = max_capture_mem_gib * (1024**3)
+    assert mem_used_by_capture < max_mem_expected, (
+        f"_capture_model used more memory than expected. "
+        f"Used: {mem_used_by_capture / (1024**3):.2f} GiB, "
+        f"Expected: < {max_capture_mem_gib:.2f} GiB")
+    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = 'spawn'
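
For anyone adapting this check elsewhere: the measurement boils down to a
before/after snapshot of free device memory around the call being profiled.
A minimal standalone sketch of that pattern follows; `measure_capture_mem`
and the budget numbers are illustrative only, and it assumes an Ascend
PyTorch build where `torch.npu.mem_get_info()` returns `(free, total)` in
bytes, which is what the test above relies on.

```python
import torch
import torch_npu  # noqa: F401  # registers the torch.npu backend


def measure_capture_mem(fn, *args, **kwargs):
    """Run fn and return (result, bytes of free NPU memory it consumed)."""
    free_before = torch.npu.mem_get_info()[0]  # (free, total) -> free bytes
    result = fn(*args, **kwargs)
    free_after = torch.npu.mem_get_info()[0]
    # Whatever is still held after the call, e.g. ACL graph capture buffers.
    return result, free_before - free_after


# Mirroring the test's assertion: fail when capture exceeds an empirical
# baseline (GiB) scaled by a tolerance factor.
# _, used = measure_capture_mem(runner._capture_model)
# assert used < 0.20 * 1.3 * 1024**3
```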