[Feat][Graph] Support DeepSeek with ACL Graph (#2707)

### What this PR does / why we need it? In memory of #677 , a long overdue milestone. Now DeepSeek V3/R1 should be OK with ACL Graph. ### Does this PR introduce _any_ user-facing change? None. ### How was this patch tested? Working on it. - vLLM version: v0.10.2 - vLLM main: 68dbde5dbb --------- Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-09-16 17:50:17 +08:00
parent 3e60aa5483
commit 88ca8a051c
7 changed files with 64 additions and 42 deletions
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -93,6 +93,7 @@ from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.compilation.acl_graph import ACLGraphWrapper
+from vllm_ascend.models.layers.mla import AscendMultiHeadLatentAttention
 from vllm_ascend.multistream.ms_split import compute_split_seq_index
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.sample.logits_processor import build_logitsprocs
@@ -412,7 +413,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
            self.is_kv_producer = vllm_config.kv_transfer_config.is_kv_producer
            self.is_kv_consumer = vllm_config.kv_transfer_config.is_kv_consumer

-        self.mc2_tokens_capacity = 512 * self.parallel_config.tensor_parallel_size
+        # NOTE: Technically, MC2 can have 512 tokens each rank, but this will consume too much memory. The formula is:
+        # ((maxBs * tokenNeedSizeDispatch * ep_worldsize * localMoeExpertNum) + (maxBs * tokenNeedSizeCombine * (k + sharedExpertNum))) * 2
+        # so we have to limit the MC2 tokens to save memory, should fix this in the future.
+        self.mc2_tokens_capacity = 512
        self.reserved_mc2_mask = torch.zeros(
            self.mc2_tokens_capacity,
            dtype=torch.bool,
@@ -2811,6 +2815,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                # or enable more requests to be processed simultaneously.
                self.shared_kv_cache_layers[layer_name] = kv_tgt_layer
                continue
+            if isinstance(attn_module, AscendMultiHeadLatentAttention):
+                continue

            # TODO: Support other attention modules, e.g., cross-attention
            # TODO(lucas): move the attention specs into the model layers like