From 2a0a5883119732aeed22ace5759f55eb00c71e16 Mon Sep 17 00:00:00 2001
From: liuchenbing2026
Date: Tue, 31 Mar 2026 09:36:48 +0800
Subject: [PATCH] =?UTF-8?q?[0.18.0][BugFix]=20Disable=20block=20verify=20t?=
 =?UTF-8?q?o=20avoid=20incorrect=20verification=20on=20NPU=20=E2=80=A6=20(?=
 =?UTF-8?q?#7839)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…(#7603)

### What this PR does / why we need it?
Block verify uses cumprod(target_probs / draft_probs) for joint
acceptance. Suffix/ngram methods have draft_probs=None; the fallback
draft_token_probs=1.0 with cumprod is not equivalent to per-token
verification, causing incorrect accept/reject results.

Fix: using_block_verify = max_spec_len >= 3 and draft_probs is not None.
MTP/Eagle3 unaffected.

- vLLM version: v0.18.0
- vLLM main:
  https://github.com/vllm-project/vllm/commit/ed359c497a728f08b5b41456c07a688ccd510fbc

### What this PR does / why we need it?

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

Signed-off-by: liuchenbing
Co-authored-by: liuchenbing
---
 vllm_ascend/sample/rejection_sampler.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/sample/rejection_sampler.py b/vllm_ascend/sample/rejection_sampler.py
index 695ae649..a9f54b5e 100644
--- a/vllm_ascend/sample/rejection_sampler.py
+++ b/vllm_ascend/sample/rejection_sampler.py
@@ -111,7 +111,9 @@ def rejection_sample(
     assert target_logits.shape == (num_tokens, vocab_size)
 
     # When num_speculative_tokens>=3, using block verify.
-    using_block_verify = max_spec_len >= 3
+    # Skip block verify when draft_probs is None (suffix/ngram methods)
+    # to avoid incorrect verification results.
+    using_block_verify = max_spec_len >= 3 and draft_probs is not None
 
     # Create output buffer.
     output_token_ids = torch.empty(