[310P][Bugfix]: fix ngram graph replay accuracy error (#7134)
### What this PR does / why we need it?
On the 310P device, when running ACLGraph together with the n-gram
speculative decoding algorithm, both graph capture and graph replay
require `uniform_decode_query_len` and do not depend on
`attention_state`. This causes an unexpected behavior inversion on 310P:
during decode-only, execution does **not** enter graph replay, while in
the split-fuse state (that is, the chunked prefill state), execution
enters graph replay directly.
The issue can be resolved by forcibly setting `uniform_decode_query_len`
to `1`, so that 310P captures only the decode-only graph, and replay is
then controlled through `attention_state`.
### Does this PR introduce _any_ user-facing change?
NO
- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: Tflowers-0129 <2906339855@qq.com>
This commit is contained in:
@@ -17,6 +17,8 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from contextlib import contextmanager, nullcontext
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch_npu
|
||||
@@ -24,14 +26,151 @@ from vllm.logger import logger
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig, MambaSpec
|
||||
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ
|
||||
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
||||
|
||||
# On 310P, ngram decode-only graphs are always captured with query_len=1;
# graph replay selection is then driven by attention_state instead.
_NGRAM_GRAPH_UNIFORM_DECODE_QUERY_LEN = 1
|
||||
|
||||
|
||||
class NPUModelRunner310(NPUModelRunner):
|
||||
# Inherited from parent runner; annotated here to satisfy strict type checks.
|
||||
uniform_decode_query_len: int
|
||||
|
||||
def __init__(self, *args, **kwargs):
    """Initialize the 310P runner and align graph shapes for ngram decoding.

    Sets the NPU tensor format to FRACTAL_NZ and, when ngram speculative
    decoding is configured, forces the dispatcher's uniform decode query
    length to 1 so only the decode-only graph is captured on 310P.
    """
    super().__init__(*args, **kwargs)
    self._acl_format = ACL_FORMAT_FRACTAL_NZ

    spec_cfg = self.speculative_config
    if spec_cfg is None or spec_cfg.method != "ngram":
        return

    # 310P ngram requires decode-only graph shapes to be built with
    # q_len=1; keep the dispatcher's internal query_len in sync to
    # avoid the key-init assert.
    self.cudagraph_dispatcher.uniform_decode_query_len = (
        _NGRAM_GRAPH_UNIFORM_DECODE_QUERY_LEN
    )
|
||||
|
||||
@contextmanager
def temporary_modify_uniform_decode_query_len(self):
    """Temporarily pin the runner's uniform_decode_query_len to 1.

    Only needed for the 310P ngram path, where the dispatcher uses
    q_len=1 while the runner's default uniform_decode_query_len remains
    1 + num_spec_tokens. A no-op for every other configuration.

    TODO: remove this temporary override once upstream supports an
    independent decode-capture query_len for backend-specific paths.
    """
    spec_cfg = self.speculative_config
    if spec_cfg is None or spec_cfg.method != "ngram":
        # Nothing to align outside the ngram path.
        yield
        return

    saved_query_len = self.uniform_decode_query_len
    self.uniform_decode_query_len = _NGRAM_GRAPH_UNIFORM_DECODE_QUERY_LEN
    try:
        yield
    finally:
        # Restore unconditionally, even if the wrapped code raises.
        self.uniform_decode_query_len = saved_query_len
|
||||
|
||||
def _determine_batch_execution_and_padding(
    self,
    num_tokens: int,
    num_reqs: int,
    num_scheduled_tokens_np: np.ndarray,
    max_num_scheduled_tokens: int,
    use_cascade_attn: bool,
    allow_microbatching: bool = False,
    force_eager: bool = False,
    force_uniform_decode: bool | None = None,
    force_has_lora: bool | None = None,
    force_num_active_loras: int | None = None,
    num_encoder_reqs: int = 0,
):
    """Decide execution mode and padding with 310P-specific overrides.

    Split-fuse (chunked prefill) and prefill-cache-hit batches are
    forced to eager execution so they never replay the captured decode
    graph; decode-only batches that are uniform at q_len=1 are flagged
    as uniform decode. Everything else is delegated to the parent.
    """
    eager_states = (
        AscendAttentionState.ChunkedPrefill,
        AscendAttentionState.PrefillCacheHit,
    )
    if self.attn_state in eager_states:
        force_eager = True

    # Respect an explicit caller override: only force when unset.
    if (force_uniform_decode is None
            and self.attn_state == AscendAttentionState.DecodeOnly):
        q_len = _NGRAM_GRAPH_UNIFORM_DECODE_QUERY_LEN
        is_uniform_q1_decode = (
            max_num_scheduled_tokens == q_len
            and num_tokens == max_num_scheduled_tokens * num_reqs
            and np.all(self.input_batch.num_computed_tokens_cpu[:num_reqs] > 0)
        )
        if is_uniform_q1_decode:
            force_uniform_decode = True

    return super()._determine_batch_execution_and_padding(
        num_tokens=num_tokens,
        num_reqs=num_reqs,
        num_scheduled_tokens_np=num_scheduled_tokens_np,
        max_num_scheduled_tokens=max_num_scheduled_tokens,
        use_cascade_attn=use_cascade_attn,
        allow_microbatching=allow_microbatching,
        force_eager=force_eager,
        force_uniform_decode=force_uniform_decode,
        force_has_lora=force_has_lora,
        force_num_active_loras=force_num_active_loras,
        num_encoder_reqs=num_encoder_reqs,
    )
|
||||
|
||||
def _pad_query_start_loc_for_fia(self, num_tokens_padded: int, num_reqs_padded: int, num_reqs: int) -> int:
    """Pad query_start_loc to the graph-captured batch shape and return the
    effective padded request count.

    Keep this aligned with the dispatcher because batch_desc.num_reqs is
    generated by dispatcher._create_padded_batch_descriptor().
    For 310P ngram we intentionally set dispatcher q_len=1, while runner's
    default uniform_decode_query_len may remain 1 + num_spec_tokens.

    NOTE(review): assumes self.query_start_loc is a paired CPU/NPU buffer
    (``.np`` view plus ``copy_to_gpu()``) with at least num_reqs_padded + 2
    entries — confirm against the parent runner's buffer allocation.
    """
    uniform_decode_query_len = self.cudagraph_dispatcher.uniform_decode_query_len

    if num_tokens_padded == num_reqs_padded * uniform_decode_query_len:
        # Uniform-batch case: num_reqs must be no greater than num_reqs_padded
        assert num_reqs <= num_reqs_padded

        # Extend the cumulative offsets linearly: each padded (dummy)
        # request contributes exactly uniform_decode_query_len tokens.
        last_loc = self.query_start_loc.np[num_reqs]
        self.query_start_loc.np[num_reqs + 1 : num_reqs_padded + 1] = (
            self.arange_np[1 : num_reqs_padded + 1 - num_reqs] * uniform_decode_query_len + last_loc
        )
    else:
        # Mixed-batch case: num_reqs must equal num_reqs_padded
        assert num_reqs == num_reqs_padded

        # Insert a dummy request instead of setting query_start_loc[num_reqs] = num_tokens_padded directly
        self.query_start_loc.np[num_reqs_padded + 1] = num_tokens_padded
        num_reqs_padded = num_reqs_padded + 1

    # Publish the padded offsets to the device before graph execution.
    self.query_start_loc.copy_to_gpu()
    return num_reqs_padded
|
||||
|
||||
@torch.inference_mode()
def _dummy_run(
    self,
    num_tokens: int,
    with_prefill: bool = False,
    cudagraph_runtime_mode=None,
    force_attention: bool = False,
    uniform_decode: bool = False,
    is_profile: bool = False,
    create_mixed_batch: bool = False,
    allow_microbatching: bool = True,
    skip_eplb: bool = False,
    remove_lora: bool = True,
    is_graph_capturing: bool = False,
    num_active_loras: int = 0,
):
    """Run the parent dummy pass for warmup/profiling/graph capture.

    For uniform-decode runs the runner's query_len is temporarily
    aligned with the dispatcher (q_len=1 on the 310P ngram path) so the
    captured graph shape matches what replay will dispatch.
    """
    if uniform_decode:
        align_ctx = self.temporary_modify_uniform_decode_query_len()
    else:
        align_ctx = nullcontext()

    with align_ctx:
        return super()._dummy_run(
            num_tokens=num_tokens,
            with_prefill=with_prefill,
            cudagraph_runtime_mode=cudagraph_runtime_mode,
            force_attention=force_attention,
            uniform_decode=uniform_decode,
            is_profile=is_profile,
            create_mixed_batch=create_mixed_batch,
            allow_microbatching=allow_microbatching,
            skip_eplb=skip_eplb,
            remove_lora=remove_lora,
            is_graph_capturing=is_graph_capturing,
            num_active_loras=num_active_loras,
        )
|
||||
|
||||
def _check_and_update_cudagraph_mode(
    self,
    attention_backends,
    kv_cache_groups,
) -> None:
    """Validate/update the graph mode with dispatcher-aligned query_len.

    910B does not need this alignment because runner and dispatcher
    query_len are naturally consistent there; the 310P ngram path needs
    the temporary override while the parent check runs.
    """
    align_ctx = self.temporary_modify_uniform_decode_query_len()
    with align_ctx:
        super()._check_and_update_cudagraph_mode(
            attention_backends, kv_cache_groups)
|
||||
|
||||
def initialize_kv_cache_tensors(self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user