[Feat][UT] Support Deepseekv32 FULL_DECODE_ONLY mode and add a unit test for sfa_v1 (#3763)

### What this PR does / why we need it?
- Add support for DeepSeek v3.2 in FULL_DECODE_ONLY mode.
- Add unit test for sfa_v1.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?


- vLLM version: v0.11.0
- vLLM main: 83f478bb19

---------

Signed-off-by: 1Fire4 <wangdingyi2@huawei.com>
This commit is contained in:
1Fire4
2025-11-03 10:02:47 +08:00
committed by GitHub
parent d4c75088a0
commit 0b9b6d79fe
3 changed files with 216 additions and 6 deletions

View File

@@ -91,7 +91,7 @@ M = TypeVar("M", bound=AscendSFAMetadata)
class AscendSFAMetadataBuilder:
# Does this backend/builder support ACL Graphs for attention (default: no).
aclgraph_support: ClassVar[AttentionCGSupport] = \
AttentionCGSupport.NEVER
AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
"""
NOTE: Please read the comment at the top of the file before trying to
understand this class
@@ -189,6 +189,26 @@ class AscendSFAMetadataBuilder:
sin=sin,
cos=cos)
def build_for_graph_capture(
    self,
    common_attn_metadata: AscendCommonAttentionMetadata,
    attn_state: AscendAttentionState = AscendAttentionState.DecodeOnly,
    model: Optional[nn.Module] = None,
):
    """Build SFA attention metadata for a graph-capture (dummy) run.

    Delegates to ``self.build`` with ``common_prefix_len=0``, then stamps
    the requested ``attn_state`` onto the resulting metadata.

    Args:
        common_attn_metadata: Common attention metadata describing the
            capture-time batch.
        attn_state: The attention state to capture for. Only
            ``AscendAttentionState.DecodeOnly`` is currently supported.
        model: Optional model module forwarded to ``self.build``.

    Returns:
        The attention metadata produced by ``self.build``, with its
        ``attn_state`` field overwritten by ``attn_state``.

    Raises:
        NotImplementedError: If ``attn_state`` is anything other than
            ``DecodeOnly``.
    """
    if attn_state == AscendAttentionState.DecodeOnly:
        # common_prefix_len=0: no shared prefix exists for a dummy
        # capture batch.
        attn_metadata = self.build(
            common_prefix_len=0,
            common_attn_metadata=common_attn_metadata,
            model=model,
        )
    else:
        raise NotImplementedError(
            "Currently we only support building dummy metadata for DecodeOnly state"
        )
    # Overwrite the state so the captured graph is tagged with the
    # caller-requested attention state.
    attn_metadata.attn_state = attn_state
    return attn_metadata
class AscendSFAImpl(MLAAttentionImpl):
"""

View File

@@ -1894,7 +1894,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
)
forward_context = get_forward_context()
if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL:
if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL \
and not self.use_sparse:
# TODO: maybe_padded_num_tokens will be removed, use num_input_tokens instead
if self.vllm_config.model_config.use_mla:
if self.pcp_size * self.dcp_size > 1:
@@ -2687,11 +2688,15 @@ class NPUModelRunner(LoRAModelRunnerMixin):
[0] * dcp_world_size for _ in range(pcp_world_size)
] for _ in range(num_tokens)]
long_seq_metadata.num_computed_tokens_of_pcp_dcp = num_computed_tokens_of_pcp_dcp
common_attn_metadata = AscendCommonAttentionMetadata(
query_start_loc=torch.tensor(
if self.speculative_config:
query_start_loc = torch.tensor(
[0] + self.actual_seq_lengths_q[:num_reqs],
device=self.device,
dtype=torch.int32),
dtype=torch.int32)
else:
query_start_loc = self.query_start_loc[:num_reqs + 1]
common_attn_metadata = AscendCommonAttentionMetadata(
query_start_loc=query_start_loc,
query_start_loc_cpu=self.query_start_loc_cpu[:num_reqs +
1],
seq_lens_cpu=self.seq_lens_cpu,
@@ -2737,7 +2742,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
forward_context = get_forward_context()
assert forward_context is not None
if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL and \
not forward_context.capturing:
not forward_context.capturing and not self.use_sparse:
if self.vllm_config.model_config.use_mla:
# FIXME: Try using `auto_dispatch_capture=True`
if self.pcp_size * self.dcp_size > 1: