[ops] support advanced apply_top_k_top_p without top_k constraint (#6098)
### What this PR does / why we need it?
Implement `apply_top_k_top_p` via AscendC to eliminate the constraint of
k ∈ [1, 1024]. It enables high-performance TopKTopP calculation and avoids
the D2H synchronization introduced by k validation.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
E2E serving with `k=4096` and `p=0.95`
- vLLM version: v0.13.0
- vLLM main:
d68209402d
---------
Signed-off-by: linfeng-yuan <1102311262@qq.com>
Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
Co-authored-by: SlightwindSec <slightwindsec@gmail.com>
This commit is contained in:
@@ -1234,6 +1234,26 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> npu_moe_init_routing_
|
||||
return std::tie(expanded_x, expanded_row_idx, expert_tokens_count_or_cumsum, expanded_scale);
|
||||
}
|
||||
|
||||
// Apply top-k and/or top-p (nucleus) filtering to `logits` using the AscendC
// custom kernel, with no host-side constraint on the value of k.
//
// Parameters:
//   logits - input logits tensor; the output has the same shape/dtype/device.
//   p      - optional per-request top-p thresholds; std::nullopt disables top-p.
//   k      - optional per-request top-k values; std::nullopt disables top-k.
// Returns: a new tensor holding the filtered logits.
at::Tensor npu_apply_top_k_top_p(
    const at::Tensor& logits,
    const c10::optional<at::Tensor>& p,
    const c10::optional<at::Tensor>& k)
{
    // At least one filter must be supplied, otherwise there is nothing to do.
    TORCH_CHECK(p.has_value() || k.has_value(),
                "apply_top_k_top_p: p and k cannot be None at the same time.");

    // Output buffer matching the input logits.
    at::Tensor result = at::empty_like(logits);

    // Dispatch the AscendC custom op. The optionals are forwarded as-is so the
    // kernel itself can skip whichever filter (top-k / top-p) was not given —
    // no D2H sync is needed to validate k on the host.
    EXEC_NPU_CMD(
        aclnnApplyTopKTopPCustom,
        logits,
        p,
        k,
        result);

    return result;
}
|
||||
|
||||
std::tuple<at::Tensor, at::Tensor, at::Tensor> moe_gating_top_k(
|
||||
const at::Tensor& x,
|
||||
int64_t k,
|
||||
@@ -1495,4 +1515,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(_C, _ascend), ops)
|
||||
"-> (Tensor y ,Tensor rstd, Tensor x)"
|
||||
);
|
||||
ops.impl("npu_add_rms_norm_bias", torch::kPrivateUse1, &vllm_ascend::npu_add_rms_norm_bias);
|
||||
|
||||
ops.def("npu_apply_top_k_top_p(Tensor logits, Tensor? p=None, Tensor? k=None) -> Tensor");
|
||||
ops.impl("npu_apply_top_k_top_p", torch::kPrivateUse1, &vllm_ascend::npu_apply_top_k_top_p);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user