[ops] support advanced apply_top_k_top_p without top_k constraint (#6098)
### What this PR does / why we need it?
Implement `apply_top_k_top_p` via AscendC to eliminate the constraint of
k ∈ [1, 1024]. It enables high-performance TopKTopP calculation and avoids
the D2H synchronization introduced by k validation.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
E2E serving with `k=4096` and `p=0.95`
- vLLM version: v0.13.0
- vLLM main:
d68209402d
---------
Signed-off-by: linfeng-yuan <1102311262@qq.com>
Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
Co-authored-by: SlightwindSec <slightwindsec@gmail.com>
This commit is contained in:
@@ -1234,6 +1234,26 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> npu_moe_init_routing_
|
||||
return std::tie(expanded_x, expanded_row_idx, expert_tokens_count_or_cumsum, expanded_scale);
|
||||
}
|
||||
|
||||
// Apply top-k and/or top-p (nucleus) filtering to `logits` using the AscendC
// custom kernel, with no host-side constraint on the value of k.
//
// Parameters:
//   logits - input logits tensor; the output has the same shape/dtype/device.
//   p      - optional per-request top-p thresholds; std::nullopt disables top-p.
//   k      - optional per-request top-k values; std::nullopt disables top-k.
// Returns: a new tensor holding the filtered logits.
at::Tensor npu_apply_top_k_top_p(
    const at::Tensor& logits,
    const c10::optional<at::Tensor>& p,
    const c10::optional<at::Tensor>& k)
{
    // At least one filter must be supplied, otherwise there is nothing to do.
    TORCH_CHECK(p.has_value() || k.has_value(),
                "apply_top_k_top_p: p and k cannot be None at the same time.");

    // Output buffer matching the input logits.
    at::Tensor result = at::empty_like(logits);

    // Dispatch the AscendC custom op. The optionals are forwarded as-is so the
    // kernel itself can skip whichever filter (top-k / top-p) was not given —
    // no D2H sync is needed to validate k on the host.
    EXEC_NPU_CMD(
        aclnnApplyTopKTopPCustom,
        logits,
        p,
        k,
        result);

    return result;
}
|
||||
|
||||
std::tuple<at::Tensor, at::Tensor, at::Tensor> moe_gating_top_k(
|
||||
const at::Tensor& x,
|
||||
int64_t k,
|
||||
@@ -1495,4 +1515,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(_C, _ascend), ops)
|
||||
"-> (Tensor y ,Tensor rstd, Tensor x)"
|
||||
);
|
||||
ops.impl("npu_add_rms_norm_bias", torch::kPrivateUse1, &vllm_ascend::npu_add_rms_norm_bias);
|
||||
|
||||
ops.def("npu_apply_top_k_top_p(Tensor logits, Tensor? p=None, Tensor? k=None) -> Tensor");
|
||||
ops.impl("npu_apply_top_k_top_p", torch::kPrivateUse1, &vllm_ascend::npu_apply_top_k_top_p);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user