[aclgraph] implement NPUPiecewiseBackend to enable aclgraph (#836)
### What this PR does / why we need it?
1. Implement `NPUPiecewiseBackend` to enable aclgraph.
2. Enable aclgraph by default in V1, but raise an error when running DeepSeek and a warning when running models other than Qwen.

### How was this patch tested?
CI passes with the new unit test.

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
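For reviewers, the user-visible switch is vLLM's existing `enforce_eager` flag: on V1, leaving it unset lets the new `NPUPiecewiseBackend` capture aclgraphs, while `enforce_eager=True` keeps the eager path. A minimal sketch of the two paths, mirroring the new test (the model choice is illustrative):

```python
from vllm import LLM, SamplingParams

prompts = ["Hello, my name is"]
params = SamplingParams(max_tokens=32, temperature=0.0)

# Default on V1: aclgraph capture via NPUPiecewiseBackend.
graph_llm = LLM("Qwen/Qwen2.5-0.5B-Instruct")
graph_outputs = graph_llm.generate(prompts, params)

# Opt out: enforce_eager=True skips graph capture entirely.
eager_llm = LLM("Qwen/Qwen2.5-0.5B-Instruct", enforce_eager=True)
eager_outputs = eager_llm.generate(prompts, params)
```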
tests/compile/test_aclgraph.py (new file, 102 lines)
@@ -0,0 +1,102 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Compare the outputs of vLLM with and without aclgraph.

Run `pytest tests/compile/test_aclgraph.py`.
"""

import os

import pytest
import torch
from vllm import LLM, SamplingParams

from tests.conftest import VllmRunner
from tests.model_utils import check_outputs_equal
from vllm_ascend.utils import vllm_version_is

MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]


@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
                    reason="aclgraph only support on v1")
@pytest.mark.skipif(
    (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")),
    reason="aclgraph not supported in v0.8.5 and v0.8.5.post1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_models(
    model: str,
    max_tokens: int,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    with monkeypatch.context() as m:
        prompts = [
            "Hello, my name is", "The president of the United States is",
            "The capital of France is", "The future of AI is"
        ]

        # aclgraph only support on v1
        m.setenv("VLLM_USE_V1", "1")

        sampling_params = SamplingParams(max_tokens=max_tokens,
                                         temperature=0.0)
        # TODO: change to use vllmrunner when the registry of custom op is solved
        # while running pytest
        vllm_model = LLM(model)
        vllm_aclgraph_outputs = vllm_model.generate(prompts, sampling_params)
        del vllm_model
        torch.npu.empty_cache()

        vllm_model = LLM(model, enforce_eager=True)
        vllm_eager_outputs = vllm_model.generate(prompts, sampling_params)
        del vllm_model
        torch.npu.empty_cache()

        vllm_aclgraph_outputs_list = []
        for output in vllm_aclgraph_outputs:
            vllm_aclgraph_outputs_list.append(
                (output.outputs[0].index, output.outputs[0].text))

        vllm_eager_outputs_list = []
        for output in vllm_eager_outputs:
            vllm_eager_outputs_list.append(
                (output.outputs[0].index, output.outputs[0].text))

        check_outputs_equal(
            outputs_0_lst=vllm_eager_outputs_list,
            outputs_1_lst=vllm_aclgraph_outputs_list,
            name_0="vllm_eager_outputs",
            name_1="vllm_aclgraph_outputs",
        )


@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
                    reason="aclgraph only support on v1")
@pytest.mark.skipif(
    (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")),
    reason="aclgraph not supported in v0.8.5 and v0.8.5.post1")
def test_deepseek_raises_error(monkeypatch: pytest.MonkeyPatch) -> None:
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_MODELSCOPE", "True")
        m.setenv("VLLM_USE_V1", "1")
        with pytest.raises(NotImplementedError) as excinfo:
            VllmRunner("deepseek-ai/DeepSeek-V2-Lite-Chat",
                       max_model_len=1024,
                       enforce_eager=False)
        assert "ACL Graph does not support deepseek" in str(excinfo.value)
tests/conftest.py
@@ -77,7 +77,7 @@ class VllmRunner:
         block_size: int = 16,
         enable_chunked_prefill: bool = False,
         swap_space: int = 4,
-        enforce_eager: Optional[bool] = False,
+        enforce_eager: Optional[bool] = True,
         **kwargs,
     ) -> None:
         self.model = LLM(
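Flipping `VllmRunner`'s default to `enforce_eager=True` keeps the existing suite on the eager path now that aclgraph is on by default in V1; tests that want graph mode opt in explicitly. A sketch of opting back in (assumes an NPU V1 environment):

```python
# Opt a test back into aclgraph capture by overriding the new default.
with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct", enforce_eager=False) as runner:
    runner.generate_greedy(["Hello, my name is"], max_tokens=32)
```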
@@ -72,7 +72,7 @@ def test_ngram_correctness(
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")
 
-        ref_llm = LLM(model=model_name, max_model_len=1024)
+        ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True)
         ref_outputs = ref_llm.chat(test_prompts, sampling_config)
         del ref_llm
 
@@ -85,6 +85,7 @@ def test_ngram_correctness(
             "num_speculative_tokens": 3,
         },
         max_model_len=1024,
+        enforce_eager=True,
     )
     spec_outputs = spec_llm.chat(test_prompts, sampling_config)
     matches = 0
 
@@ -135,6 +136,7 @@ def test_eagle_correctness(
             "max_model_len": 2048,
         },
         max_model_len=2048,
+        enforce_eager=True,
     )
     spec_outputs = spec_llm.chat(test_prompts, sampling_config)
     matches = 0
@@ -18,8 +18,7 @@ import pytest
 import torch
 from vllm import LLM, SamplingParams
 
-# TODO: revert me when cuda hard code is fixed in 'VllmBackend'
-torch.cuda.CUDAGraph = torch.npu.NPUGraph
+from vllm_ascend.utils import vllm_version_is
 
 MODELS = [
     "Qwen/Qwen2.5-0.5B-Instruct",
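The two deleted lines above were a stopgap: vLLM's `VllmBackend` hard-codes CUDA graphs, so the suite aliased the class it instantiates onto the NPU equivalent. With `NPUPiecewiseBackend` presumably owning graph capture now, the monkeypatch can be dropped, as its own TODO anticipated. For reference, the pattern being removed (requires the `torch_npu` adapter):

```python
import torch
import torch_npu  # noqa: F401  # Ascend adapter that provides torch.npu

# Duck-typed substitute: code that constructs torch.cuda.CUDAGraph
# transparently gets an NPU graph instead.
torch.cuda.CUDAGraph = torch.npu.NPUGraph
```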
@@ -33,6 +32,9 @@ prompts = [
 ]
 
 
+@pytest.mark.skipif(
+    (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")),
+    reason="aclgraph not supported in v0.8.5 and v0.8.5.post1")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
 @pytest.mark.parametrize("max_tokens", [64])
 
@@ -52,7 +52,7 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
     with VllmRunner(model,
                     max_model_len=8192,
                     dtype=dtype,
-                    enforce_eager=False,
+                    enforce_eager=True,
                     gpu_memory_utilization=0.7) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)