[Feat][UT] Support Deepseekv32 FULL_DECODE_ONLY mode and add unit test of sfa_v1 (#3763)

### What this PR does / why we need it? - Add support for DeepSeek v3.2 in FULL_DECODE_ONLY mode. - Add unit test for sfa_v1. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - vLLM version: v0.11.0 - vLLM main: 83f478bb19 --------- Signed-off-by: 1Fire4 <wangdingyi2@huawei.com>
2025-11-03 10:02:47 +08:00
parent d4c75088a0
commit 0b9b6d79fe
3 changed files with 216 additions and 6 deletions
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1894,7 +1894,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
        )

        forward_context = get_forward_context()
-        if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL:
+        if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL \
+            and not self.use_sparse:
            # TODO: maybe_padded_num_tokens will be removed, use num_input_tokens instead
            if self.vllm_config.model_config.use_mla:
                if self.pcp_size * self.dcp_size > 1:
@@ -2687,11 +2688,15 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                        [0] * dcp_world_size for _ in range(pcp_world_size)
                    ] for _ in range(num_tokens)]
                    long_seq_metadata.num_computed_tokens_of_pcp_dcp = num_computed_tokens_of_pcp_dcp
-                common_attn_metadata = AscendCommonAttentionMetadata(
-                    query_start_loc=torch.tensor(
+                if self.speculative_config:
+                    query_start_loc = torch.tensor(
                        [0] + self.actual_seq_lengths_q[:num_reqs],
                        device=self.device,
-                        dtype=torch.int32),
+                        dtype=torch.int32)
+                else:
+                    query_start_loc = self.query_start_loc[:num_reqs + 1]
+                common_attn_metadata = AscendCommonAttentionMetadata(
+                    query_start_loc=query_start_loc,
                    query_start_loc_cpu=self.query_start_loc_cpu[:num_reqs +
                                                                 1],
                    seq_lens_cpu=self.seq_lens_cpu,
@@ -2737,7 +2742,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
        forward_context = get_forward_context()
        assert forward_context is not None
        if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL and \
-            not forward_context.capturing:
+            not forward_context.capturing and not self.use_sparse:
            if self.vllm_config.model_config.use_mla:
                # FIXME: Try using `auto_dispatch_capture=True`
                if self.pcp_size * self.dcp_size > 1: