[main][bugfix] Fix spec acceptance rate problem in vllm_0.15.0 (#6606)
### What this PR does / why we need it?
The speculative inference acceptance rate dropped after the vLLM version was upgraded to v0.15.0. This PR resolves that issue.
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
Unit tests and test cases.
- vLLM version: v0.15.0
- vLLM main: d7e17aaacd
---------
Signed-off-by: lilinsiman <lilinsiman@gmail.com>
```diff
@@ -19,6 +19,7 @@ from vllm_ascend.utils import (
    is_drafter_moe_model,
    is_moe_model,
    speculative_enable_dispatch_gmm_combine_decode,
    vllm_version_is,
)

@@ -152,6 +153,9 @@ def set_ascend_forward_context(
        mc2_mask[num_actual_tokens:] = False
        forward_context.mc2_mask = mc2_mask

    if is_draft_model and vllm_version_is("0.15.0"):
        forward_context.remaining_moe_layers = None

    try:
        yield
    finally:
```
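Below is a minimal, self-contained sketch of the pattern the second hunk applies: a context manager that sets up per-step forward state and, when running a draft model under vLLM 0.15.0, resets `remaining_moe_layers` to `None`. The `ForwardContext` dataclass, the `vllm_version_is` stub, and the `set_forward_context_sketch` signature here are simplified stand-ins for illustration only, not the actual vllm-ascend APIs.

```python
# Sketch of a version-gated forward-context setup, modeled on the diff above.
# All names below are simplified stand-ins, not the real vllm-ascend types.
from contextlib import contextmanager
from typing import Optional
from dataclasses import dataclass

import torch


@dataclass
class ForwardContext:
    mc2_mask: Optional[torch.Tensor] = None
    remaining_moe_layers: Optional[int] = None


def vllm_version_is(version: str) -> bool:
    # Stand-in for the real helper in vllm_ascend.utils; here it simply
    # compares against a hard-coded value for the sake of the example.
    return version == "0.15.0"


@contextmanager
def set_forward_context_sketch(forward_context: ForwardContext,
                               num_actual_tokens: int,
                               max_tokens: int,
                               is_draft_model: bool):
    # Build the MC2 mask: slots beyond num_actual_tokens are masked off,
    # mirroring `mc2_mask[num_actual_tokens:] = False` in the diff.
    mc2_mask = torch.ones(max_tokens, dtype=torch.bool)
    mc2_mask[num_actual_tokens:] = False
    forward_context.mc2_mask = mc2_mask

    # The fix from the diff: for a draft model under vLLM 0.15.0, reset
    # remaining_moe_layers so no stale MoE-layer state is carried over.
    if is_draft_model and vllm_version_is("0.15.0"):
        forward_context.remaining_moe_layers = None

    try:
        yield forward_context
    finally:
        # Per-step state is cleared once the forward pass finishes.
        forward_context.mc2_mask = None


if __name__ == "__main__":
    ctx = ForwardContext(remaining_moe_layers=4)
    with set_forward_context_sketch(ctx, num_actual_tokens=3, max_tokens=8,
                                    is_draft_model=True) as fc:
        print(fc.mc2_mask, fc.remaining_moe_layers)
```

The context-manager form keeps the setup and teardown of per-step state in one place, so any path that exits the forward pass (normal return or exception) still restores the context.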