[Feat] update op for mla (#4000)
### What this PR does / why we need it?
1. In the mla_v1 module, add the `torch_npu.npu_attention_update` op for the pcp and dcp paths (gated on `pcp_size * dcp_size > 1`).
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.11.0
- vLLM main: 83f478bb19
---------
Signed-off-by: LookAround <lixushi@huawei.com>
```diff
@@ -1716,10 +1716,15 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             # We will ignore the sampled tokens from the partial requests.
             # TODO: Support prompt logprobs.
             spec_decode_metadata = None
-            logits_indices = torch.from_numpy(
-                cu_num_tokens
-            ) * self.pcp_size - self.num_pcp_pads[:num_reqs] - 1
-            logits_indices = logits_indices.to(self.device, non_blocking=True)
+            if self.pcp_size * self.dcp_size > 1:
+                logits_indices = torch.from_numpy(
+                    cu_num_tokens
+                ) * self.pcp_size - self.num_pcp_pads[:num_reqs] - 1
+                logits_indices = logits_indices.to(self.device,
+                                                   non_blocking=True)
+            else:
+                logits_indices = torch.from_numpy(cu_num_tokens - 1).to(
+                    self.device, non_blocking=True)
         else:
             # pcp not supported now
             assert self.pcp_size == 1
```
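For context, a minimal runnable sketch (not code from the PR) of the index math in the hunk above, under my reading of the diff: with context parallelism, `cu_num_tokens` is a per-rank cumulative token count, so the last real token of each request is recovered by scaling back up by `pcp_size` and stepping over the per-request padding in `num_pcp_pads`. All values below are assumed example inputs.

```python
import numpy as np
import torch

# Illustration only. With prefill context parallelism (pcp), each rank
# holds 1/pcp_size of every request's tokens plus some padding, so the
# last real token of request i sits at global position
#   cu_num_tokens[i] * pcp_size - num_pcp_pads[i] - 1.
pcp_size = 2                               # assumed example value
dcp_size = 1                               # assumed example value
cu_num_tokens = np.array([4, 9, 12])       # local cumulative token counts
num_pcp_pads = torch.tensor([1, 0, 1])     # assumed per-request padding
num_reqs = len(cu_num_tokens)

if pcp_size * dcp_size > 1:
    # Rescale local cumulative counts to global positions, then step
    # back over the padding to land on each request's last real token.
    logits_indices = (torch.from_numpy(cu_num_tokens) * pcp_size
                      - num_pcp_pads[:num_reqs] - 1)
else:
    # No context parallelism: the last token is simply cu_num_tokens - 1.
    logits_indices = torch.from_numpy(cu_num_tokens - 1)

print(logits_indices)  # tensor([ 6, 17, 22])
```

Without pcp/dcp the expression reduces to `cu_num_tokens - 1`, which is the branch the diff keeps for the common single-rank case.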