From 64669c4243080ac39207f3e4228d688976fdc066 Mon Sep 17 00:00:00 2001
From: Qiu
Date: Mon, 22 Dec 2025 03:33:32 -0300
Subject: [PATCH] [misc][FlashComm1][ACLGraph] Incompatibility between
 Flashcomm1 and FULL_DECODE_ONLY. (#5200)

### What this PR does / why we need it?
Currently, Flashcomm1 and FULL_DECODE_ONLY are incompatible. When both
features are enabled, graph capture errors occur without clear error
messages.
After discussion, it has been determined that enabling FULL_DECODE_ONLY
with Flashcomm1 in mixed deployment scenarios provides almost no TPOT
benefit. Additionally, a reconstruction of the decode phase for
flashcomm1 is currently underway. Therefore, related adaptation work is
temporarily postponed and will be addressed after the decode phase
reconstruction plan is finalized. For now, an assert will be added to
provide clear error messages and correct deployment recommendations.

### Does this PR introduce _any_ user-facing change?
NO

### How was this patch tested?
NO

- vLLM version: v0.12.0
- vLLM main:
  https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

---------

Signed-off-by: QiuChunshuo
---
 vllm_ascend/attention/sfa_v1.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py
index c2fd9dd5..32a66c0e 100644
--- a/vllm_ascend/attention/sfa_v1.py
+++ b/vllm_ascend/attention/sfa_v1.py
@@ -5,7 +5,7 @@ import torch
 import torch_npu
 from torch import nn
 from vllm.attention.backends.abstract import AttentionBackend, MLAAttentionImpl
-from vllm.config import VllmConfig, get_current_vllm_config
+from vllm.config import CUDAGraphMode, VllmConfig, get_current_vllm_config
 from vllm.distributed import get_tensor_model_parallel_world_size, get_tp_group
 from vllm.forward_context import get_forward_context
 from vllm.logger import logger
@@ -148,6 +148,12 @@ class AscendSFAMetadataBuilder:
         self.enable_sfa_cp = enable_sp() and \
             hasattr(self.model_config.hf_config, "index_topk")
+        assert not (
+            self.enable_sfa_cp
+            and self.vllm_config.compilation_config.cudagraph_mode
+            == CUDAGraphMode.FULL_DECODE_ONLY
+        ), "FlashComm1 is not compatible with FULL_DECODE_ONLY. Please set graph_mode to 'piecewise' or disable FlashComm1."
+
     def reorder_batch(self, input_batch: "NPUInputBatch",
                       scheduler_output: "SchedulerOutput") -> bool:
         # No need to reorder for Ascend SFA