From 73e4b4f49685511bb0401e4cc0e3544c76dce7cc Mon Sep 17 00:00:00 2001
From: zhaomingyu13 <zhaomingyu13@h-partners.com>
Date: Thu, 18 Dec 2025 23:07:14 +0800
Subject: [PATCH] [BugFix] Fix top_p,top_k issue with EAGLE and add top_p,top_k
 in EAGLE e2e (#5131)

### What this PR does / why we need it?
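Fix the `top_p` dtype passed to `torch_npu.npu_top_k_top_p` in
`apply_sampling_constraints`: cast `top_p` to `logits.dtype` instead of
a hard-coded `torch.bfloat16`.

Add `top_p` and `top_k` (with a non-zero temperature) to the EAGLE e2e
correctness test so that top-k/top-p sampling is covered.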

- vLLM version: v0.12.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
---
 tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py | 4 +++-
 vllm_ascend/sample/rejection_sampler.py                    | 3 +--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
index 150fbeec..8b2a145a 100644
--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
@@ -145,7 +145,9 @@ def test_eagle_correctness(
 
     sampling_params = SamplingParams(
         max_tokens=300,
-        temperature=0.0,
+        temperature=0.8,
+        top_p=0.7,
+        top_k=4,
         ignore_eos=False,
     )
 
diff --git a/vllm_ascend/sample/rejection_sampler.py b/vllm_ascend/sample/rejection_sampler.py
index e6e9e791..44bf7264 100644
--- a/vllm_ascend/sample/rejection_sampler.py
+++ b/vllm_ascend/sample/rejection_sampler.py
@@ -83,8 +83,7 @@ def apply_sampling_constraints(
     if get_ascend_device_type(
     ) != AscendDeviceType._310P and top_p is not None and top_k is not None and 1 <= int(
             top_k.max()) <= 1024:
-        return torch_npu.npu_top_k_top_p(logits, top_p.to(torch.bfloat16),
-                                         top_k)
+        return torch_npu.npu_top_k_top_p(logits, top_p.to(logits.dtype), top_k)
     else:
         # NOTE(woosuk): `apply_top_k_top_p` uses sorting to calculate the mask,
         # which is slow for large vocab sizes. This may cause performance issues.