support FULL graph mode for GQA (#3970)

### What this PR does / why we need it?
The library currently supports only the FULL_DECODE_ONLY graph mode, which enables full-graph execution during the decode phase. This PR extends that support to full-graph execution in both the prefill and decode phases, referred to as FULL graph mode.
- vLLM version: v0.11.0
- vLLM main: 2918c1b49c

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
2025-11-17 10:50:35 +08:00
parent c334114f69
commit e38ef2c434
11 changed files with 328 additions and 296 deletions
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -233,7 +233,8 @@ class NPUPlatform(Platform):
                    "vllm.mla_forward"
                ])
                update_aclgraph_sizes(vllm_config)
-            elif compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY:
+            elif compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY or\
+                compilation_config.cudagraph_mode == CUDAGraphMode.FULL:
                logger.info(
                    "FULL_DECODE_ONLY compilation enabled on NPU. use_inductor not supported - "
                    "using only ACL Graph mode")
@@ -270,7 +271,8 @@ class NPUPlatform(Platform):
                compilation_config.use_inductor = False
                compilation_config.splitting_ops.extend(["vllm::mla_forward"])
                update_aclgraph_sizes(vllm_config)
-            elif compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY:
+            elif compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY or\
+                compilation_config.cudagraph_mode == CUDAGraphMode.FULL:
                logger.info(
                    "FULL_DECODE_ONLY compilation enabled on NPU. use_inductor not supported - "
                    "using only ACL Graph mode")