[BugFix] Fix top_p,top_k issue with EAGLE and add top_p,top_k in EAGLE e2e (#5131)

### What this PR does / why we need it?
Fixes a dtype issue when applying top_p/top_k sampling constraints with EAGLE speculative decoding (the top_p threshold is now cast to the logits dtype instead of being hard-coded to bfloat16), and adds top_p/top_k sampling coverage to the EAGLE e2e test.

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
Author: zhaomingyu13
Date: 2025-12-18 23:07:14 +08:00
Committed by: GitHub
Parent: 073a3a6e6c
Commit: 73e4b4f496
2 changed files with 4 additions and 3 deletions


```diff
@@ -145,7 +145,9 @@ def test_eagle_correctness(
     sampling_params = SamplingParams(
         max_tokens=300,
-        temperature=0.0,
+        temperature=0.8,
+        top_p=0.7,
+        top_k=4,
         ignore_eos=False,
     )
```
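
The test previously ran greedy decoding (temperature=0.0), which never exercises the top_p/top_k path; switching to stochastic sampling does. For reference, a minimal sketch of how such parameters drive a vLLM run with EAGLE speculative decoding; the model names, speculative_config values, and prompt are illustrative placeholders, not the test's actual setup:

```python
# A minimal sketch, assuming a working vLLM install with EAGLE support.
# Model names, speculative_config values, and the prompt are illustrative
# placeholders, not the actual values used by the e2e test.
from vllm import LLM, SamplingParams

sampling_params = SamplingParams(
    max_tokens=300,
    temperature=0.8,  # non-zero temperature enables stochastic sampling
    top_p=0.7,        # nucleus sampling: smallest token set with cum. prob >= 0.7
    top_k=4,          # restrict sampling to the 4 most likely tokens
    ignore_eos=False,
)

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder target model
    speculative_config={
        "method": "eagle",
        "model": "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",  # placeholder draft model
        "num_speculative_tokens": 2,
    },
)
outputs = llm.generate(["The capital of France is"], sampling_params)
print(outputs[0].outputs[0].text)
```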


```diff
@@ -83,8 +83,7 @@ def apply_sampling_constraints(
     if get_ascend_device_type(
     ) != AscendDeviceType._310P and top_p is not None and top_k is not None and 1 <= int(
             top_k.max()) <= 1024:
-        return torch_npu.npu_top_k_top_p(logits, top_p.to(torch.bfloat16),
-                                         top_k)
+        return torch_npu.npu_top_k_top_p(logits, top_p.to(logits.dtype), top_k)
     else:
         # NOTE(woosuk): `apply_top_k_top_p` uses sorting to calculate the mask,
         # which is slow for large vocab sizes. This may cause performance issues.
```
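
The fix itself is the cast `top_p.to(logits.dtype)`: hard-coding bfloat16 can mismatch the logits dtype when logits arrive in float16 or float32, as they can under EAGLE. To illustrate why thresholds should follow the logits dtype, here is a hedged sketch of a sorting-based top-k/top-p filter in plain PyTorch, along the lines of the fallback the NOTE above refers to; the function name is hypothetical and this is not vLLM's exact implementation:

```python
# A minimal sketch of a sorting-based top-k/top-p filter in plain PyTorch,
# illustrating why the thresholds should follow the logits dtype. The
# function name is hypothetical; this is not vLLM's exact implementation.
import torch

def apply_top_k_top_p_sketch(logits: torch.Tensor, top_p: torch.Tensor,
                             top_k: torch.Tensor) -> torch.Tensor:
    # The fix in this commit: derive the threshold dtype from the logits
    # instead of hard-coding bfloat16, so float16/float32 logits also work.
    top_p = top_p.to(logits.dtype)

    sorted_logits, sorted_idx = logits.sort(dim=-1, descending=False)

    # Top-k: mask everything below the k-th largest logit in each row.
    vocab_size = logits.shape[-1]
    boundary = (vocab_size - top_k.to(torch.long)).clamp(min=0).unsqueeze(-1)
    kth_logit = sorted_logits.gather(-1, boundary)
    mask = sorted_logits < kth_logit

    # Top-p: mask tokens outside the smallest nucleus whose cumulative
    # probability reaches top_p (sorted ascending, so the nucleus is the tail).
    cum_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
    mask |= cum_probs <= (1.0 - top_p).unsqueeze(-1)
    mask[..., -1] = False  # always keep the most likely token

    sorted_logits = sorted_logits.masked_fill(mask, float("-inf"))
    # Scatter the filtered values back to their original vocabulary positions.
    return sorted_logits.scatter(-1, sorted_idx, sorted_logits)

# float16 logits work because the thresholds follow logits.dtype.
logits = torch.randn(2, 32000, dtype=torch.float16)
filtered = apply_top_k_top_p_sketch(logits,
                                    top_p=torch.tensor([0.7, 0.7]),
                                    top_k=torch.tensor([4, 4]))
```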