From 695e5c9ebca4d2010ff07357429635d5a19ffc9d Mon Sep 17 00:00:00 2001 From: linfeng-yuan <1102311262@qq.com> Date: Tue, 9 Dec 2025 15:45:40 +0800 Subject: [PATCH] [0.11.0][ops] npu_top_k_top_p supports k and p only (#4153) ### What this PR does / why we need it? With CANN 8.3 and corresponding PTA 2.7.1, `npu_top_k_top_p` supports passing only k (1<=k<=1024) and p separately. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? E2E performance test with only `top_k` and `p` separately. This PR gains a 0.2 ms improvement in TPOT with `batch_size=16`. Signed-off-by: linfeng-yuan <1102311262@qq.com> --- vllm_ascend/sample/sampler.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py index 37abdd4..7d015f1 100644 --- a/vllm_ascend/sample/sampler.py +++ b/vllm_ascend/sample/sampler.py @@ -24,14 +24,14 @@ class AscendTopKTopPSampler(TopKTopPSampler): k: torch.Tensor, p: torch.Tensor, ) -> torch.Tensor: - # npu_top_k_top_p uses the operator aclnnApplyTopKTopP, but aclnnApplyTopKTopP currently does not support 310P - if not is_310p() and p is not None and k is not None and 1 <= int( - k.max()) <= 1024: - # npu_top_k_top_p's parameter order is (logits, p, k), not (logits, k, p) - return torch_npu.npu_top_k_top_p(logits, p, k) - if p is None and k is None: return logits + # npu_top_k_top_p uses the operator aclnnApplyTopKTopP, but aclnnApplyTopKTopP currently does not support 310P + if not is_310p(): + # npu_top_k_top_p requires parameter k ranged from 1 to 1024 + if k is None or 1 <= int(k.max()) <= 1024: + # npu_top_k_top_p's parameter order is (logits, p, k), not (logits, k, p) + return torch_npu.npu_top_k_top_p(logits, p, k) probs = logits.softmax(dim=-1) probs_sort, _ = probs.sort(dim=-1, descending=False)