[aclgraph] implement NPUPiecewiseBackend to enable aclgraph (#836)

### What this PR does / why we need it? 1. Implement `NPUPiecewiseBackend` to enable aclgraph 2. Enable aclgraph by default in V1, but raise an error when running DeepSeek models and a warning when running models other than Qwen ### How was this patch tested? CI passes with the new unit tests --------- Signed-off-by: MengqingCao <cmq0113@163.com>
2025-05-29 11:58:26 +08:00
parent cc74b97f74
commit a93bed4535
8 changed files with 380 additions and 33 deletions
--- a/tests/multicard/test_dynamic_npugraph_batchsize.py
+++ b/tests/multicard/test_dynamic_npugraph_batchsize.py
@@ -18,8 +18,7 @@ import pytest
 import torch
 from vllm import LLM, SamplingParams

-# TODO: revert me when cuda hard code is fixed in 'VllmBackend'
-torch.cuda.CUDAGraph = torch.npu.NPUGraph
+from vllm_ascend.utils import vllm_version_is

 MODELS = [
    "Qwen/Qwen2.5-0.5B-Instruct",
@@ -33,6 +32,9 @@ prompts = [
 ]


+@pytest.mark.skipif(
+    (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")),
+    reason="aclgraph not supported in v0.8.5 and v0.8.5.post1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("max_tokens", [64])