[Feat][Graph] Support DeepSeek with ACL Graph (#2707)

### What this PR does / why we need it?
In memory of #677, a long-overdue milestone. DeepSeek V3/R1 should now work with ACL Graph.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
Working on it.

- vLLM version: v0.10.2
- vLLM main: 68dbde5dbb

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
Author: yiz-liu
Date: 2025-09-16 17:50:17 +08:00 (committed by GitHub)
Parent: 3e60aa5483
Commit: 88ca8a051c
7 changed files with 64 additions and 42 deletions


@@ -93,6 +93,7 @@ from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
from vllm_ascend.compilation.acl_graph import ACLGraphWrapper
+from vllm_ascend.models.layers.mla import AscendMultiHeadLatentAttention
from vllm_ascend.multistream.ms_split import compute_split_seq_index
from vllm_ascend.platform import NPUPlatform
from vllm_ascend.sample.logits_processor import build_logitsprocs
@@ -412,7 +413,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
self.is_kv_producer = vllm_config.kv_transfer_config.is_kv_producer
self.is_kv_consumer = vllm_config.kv_transfer_config.is_kv_consumer
-self.mc2_tokens_capacity = 512 * self.parallel_config.tensor_parallel_size
+# NOTE: Technically, MC2 can take 512 tokens per rank, but that would consume too much memory. The formula is:
+# ((maxBs * tokenNeedSizeDispatch * ep_worldsize * localMoeExpertNum) + (maxBs * tokenNeedSizeCombine * (k + sharedExpertNum))) * 2
+# so we have to limit the MC2 tokens to save memory; this should be fixed in the future.
+self.mc2_tokens_capacity = 512
self.reserved_mc2_mask = torch.zeros(
self.mc2_tokens_capacity,
dtype=torch.bool,
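
To see why the capacity is capped at a flat 512 tokens instead of being scaled by the tensor-parallel size (the removed line), here is a small self-contained sketch of the buffer-size formula quoted in the comment above. Every parameter value below is a hypothetical placeholder, not a number taken from this PR; the point is only that the buffer grows linearly with the token capacity.

```python
# Rough illustration of the MC2 buffer formula from the diff comment above.
# All values are hypothetical placeholders, not measurements from this PR.

def mc2_buffer_bytes(max_bs: int, token_size_dispatch: int, token_size_combine: int,
                     ep_world_size: int, local_moe_expert_num: int,
                     top_k: int, shared_expert_num: int) -> int:
    # ((maxBs * tokenNeedSizeDispatch * ep_worldsize * localMoeExpertNum)
    #  + (maxBs * tokenNeedSizeCombine * (k + sharedExpertNum))) * 2
    dispatch = max_bs * token_size_dispatch * ep_world_size * local_moe_expert_num
    combine = max_bs * token_size_combine * (top_k + shared_expert_num)
    return (dispatch + combine) * 2

# A flat cap of 512 tokens keeps the buffer fixed; multiplying the capacity by
# the tensor-parallel size scales the buffer by the same factor.
capped = mc2_buffer_bytes(512, 14336, 14336, 16, 16, 8, 1)
scaled_by_tp = mc2_buffer_bytes(512 * 8, 14336, 14336, 16, 16, 8, 1)
print(f"capped: {capped / 2**30:.2f} GiB, 512 * tp_size: {scaled_by_tp / 2**30:.2f} GiB")
```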
@@ -2811,6 +2815,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# or enable more requests to be processed simultaneously.
self.shared_kv_cache_layers[layer_name] = kv_tgt_layer
continue
+if isinstance(attn_module, AscendMultiHeadLatentAttention):
+    continue
# TODO: Support other attention modules, e.g., cross-attention
# TODO(lucas): move the attention specs into the model layers like