From 64669c4243080ac39207f3e4228d688976fdc066 Mon Sep 17 00:00:00 2001
From: Qiu
Date: Mon, 22 Dec 2025 03:33:32 -0300
Subject: [PATCH] [misc][FlashComm1][ACLGraph] Incompatibility between
 Flashcomm1 and FULL_DECODE_ONLY. (#5200)

### What this PR does / why we need it?
Currently, Flashcomm1 and FULL_DECODE_ONLY are incompatible. When both
features are enabled, graph capture errors occur without clear error
messages.
After discussion, it has been determined that enabling FULL_DECODE_ONLY
with Flashcomm1 in mixed deployment scenarios provides almost no TPOT
benefit. Additionally, a reconstruction of the decode phase for
flashcomm1 is currently underway. Therefore, related adaptation work is
temporarily postponed and will be addressed after the decode phase
reconstruction plan is finalized. For now, an assert will be added to
provide clear error messages and correct deployment recommendations.

### Does this PR introduce _any_ user-facing change?
NO

### How was this patch tested?
NO

- vLLM version: v0.12.0
- vLLM main:
  https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

---------

Signed-off-by: QiuChunshuo
---
 vllm_ascend/attention/sfa_v1.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py
index c2fd9dd5..32a66c0e 100644
--- a/vllm_ascend/attention/sfa_v1.py
+++ b/vllm_ascend/attention/sfa_v1.py
@@ -5,7 +5,7 @@ import torch
 import torch_npu
 from torch import nn
 from vllm.attention.backends.abstract import AttentionBackend, MLAAttentionImpl
-from vllm.config import VllmConfig, get_current_vllm_config
+from vllm.config import CUDAGraphMode, VllmConfig, get_current_vllm_config
 from vllm.distributed import get_tensor_model_parallel_world_size, get_tp_group
 from vllm.forward_context import get_forward_context
 from vllm.logger import logger
@@ -148,6 +148,12 @@ class AscendSFAMetadataBuilder:
         self.enable_sfa_cp = enable_sp() and \
             hasattr(self.model_config.hf_config, "index_topk")
+        assert not (
+            self.enable_sfa_cp
+            and self.vllm_config.compilation_config.cudagraph_mode
+            == CUDAGraphMode.FULL_DECODE_ONLY
+        ), "FlashComm1 is not compatible with FULL_DECODE_ONLY. Please set graph_mode to 'piecewise' or disable FlashComm1."
+
     def reorder_batch(self, input_batch: "NPUInputBatch",
                       scheduler_output: "SchedulerOutput") -> bool:
         # No need to reorder for Ascend SFA