From 73e4b4f49685511bb0401e4cc0e3544c76dce7cc Mon Sep 17 00:00:00 2001 From: zhaomingyu13 Date: Thu, 18 Dec 2025 23:07:14 +0800 Subject: [PATCH] [BugFix] Fix top_p,top_k issue with EAGLE and add top_p,top_k in EAGLE e2e (#5131) ### What this PR does / why we need it? Fix the top_p/top_k path used with EAGLE: in `apply_sampling_constraints`, cast `top_p` to `logits.dtype` instead of a hard-coded `torch.bfloat16` before calling `torch_npu.npu_top_k_top_p`. Also add top_p and top_k to the EAGLE e2e test (temperature=0.8, top_p=0.7, top_k=4) so that top_p/top_k sampling is covered. - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 Signed-off-by: zhaomingyu --- tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py | 4 +++- vllm_ascend/sample/rejection_sampler.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py index 150fbeec..8b2a145a 100644 --- a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py +++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py @@ -145,7 +145,9 @@ def test_eagle_correctness( sampling_params = SamplingParams( max_tokens=300, - temperature=0.0, + temperature=0.8, + top_p=0.7, + top_k=4, ignore_eos=False, ) diff --git a/vllm_ascend/sample/rejection_sampler.py b/vllm_ascend/sample/rejection_sampler.py index e6e9e791..44bf7264 100644 --- a/vllm_ascend/sample/rejection_sampler.py +++ b/vllm_ascend/sample/rejection_sampler.py @@ -83,8 +83,7 @@ def apply_sampling_constraints( if get_ascend_device_type( ) != AscendDeviceType._310P and top_p is not None and top_k is not None and 1 <= int( top_k.max()) <= 1024: - return torch_npu.npu_top_k_top_p(logits, top_p.to(torch.bfloat16), - top_k) + return torch_npu.npu_top_k_top_p(logits, top_p.to(logits.dtype), top_k) else: # NOTE(woosuk): `apply_top_k_top_p` uses sorting to calculate the mask, # which is slow for large vocab sizes. This may cause performance issues.