From ff63626874fae037c1b7fd8cf1fb87e0493e838c Mon Sep 17 00:00:00 2001
From: zhaomingyu13
Date: Fri, 23 Jan 2026 16:12:56 +0800
Subject: [PATCH] [Bugfix] Fix acceptance rate decline for Qwen3-30B-A3B-EAGLE3
 (#6138)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What this PR does / why we need it?
Because this code had long been out of sync with upstream, the fix for bug #5967 introduced a regression that lowered the acceptance rate of the Qwen3-30B-A3B-EAGLE3 draft model. This PR re-synchronizes with upstream and fixes the regression.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
```python
from vllm import LLM, SamplingParams


def main():
    prompts = [
        "The future of AI is",
    ]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    # Create an LLM with an EAGLE3 draft model.
    llm = LLM(
        model="Qwen/Qwen3-30B-A3B",
        tensor_parallel_size=4,
        gpu_memory_utilization=0.9,
        enforce_eager=True,
        speculative_config={
            "method": "eagle3",
            "model": "AngelSlim/Qwen3-a3B_eagle3",
            "num_speculative_tokens": 3,
        },
    )
    # Generate texts from the prompts.
    outputs = llm.generate(prompts, sampling_params)
    print(f"Outputs: {outputs}")
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
    main()
```

- vLLM version: v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/d68209402ddab3f54a09bc1f4de9a9495a283b60

Signed-off-by: zhaomingyu
Co-authored-by: drslark
---
 vllm_ascend/spec_decode/eagle_proposer.py | 50 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 10 deletions(-)

diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index 789dd091..fb2928e6 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -226,19 +226,49 @@ class EagleProposer(VllmEagleProposer):
         )
         # If pp>1, the weights of mtp and the main model's embedding are not on the same device.
         # check if mtp model use main model's embedding and LMhead
-        if hasattr(model, "model") and hasattr(model.model, "embed_tokens") and \
-            torch.equal(self.model.model.embed_tokens.weight,
-                        model.model.embed_tokens.weight):
-            logger.info(
-                "The EAGLE head shares the same vocab embedding" \
-                " with the target model."
-            )
-            self.model.model.embed_tokens = target_embed_tokens
+        share_embeddings = False
+        if hasattr(self.model, "has_own_embed_tokens"):
+            # EAGLE model
+            if not self.model.has_own_embed_tokens:
+                share_embeddings = True
+                logger.info(
+                    "Detected EAGLE model without its own embed_tokens in the"
+                    " checkpoint. Sharing target model embedding weights with the"
+                    " draft model."
+                )
+            elif (
+                isinstance(target_embed_tokens.weight, torch.Tensor)
+                and isinstance(self.model.model.embed_tokens.weight, torch.Tensor)
+                # TODO: Offload to CPU for comparison to avoid extra NPU memory
+                # usage in CI testing environments with limited NPU memory
+                and torch.equal(
+                    target_embed_tokens.weight.cpu(),
+                    self.model.model.embed_tokens.weight.cpu(),
+                )
+            ):
+                share_embeddings = True
+                logger.info(
+                    "Detected EAGLE model with embed_tokens identical to the target"
+                    " model. Sharing target model embedding weights with the draft"
+                    " model."
+                )
+            else:
+                logger.info(
+                    "Detected EAGLE model with distinct embed_tokens weights. "
+                    "Keeping separate embedding weights from the target model."
+                )
         else:
+            # MTP model
+            share_embeddings = True
             logger.info(
-                " The EAGLE head loaded its own vocab embedding" \
-                " weights instead of sharing them with the target model."
+                "Detected MTP model. "
+                "Sharing target model embedding weights with the draft model."
             )
+
+        if share_embeddings:
+            if hasattr(self.model.model, "embed_tokens"):
+                del self.model.model.embed_tokens
+            self.model.model.embed_tokens = target_embed_tokens
         else:
             logger.info(
                 "Since PP > 1 or other reasons the model head loaded its own vocab embedding" \
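
For reviewers who want to trace the new control flow outside the diff, below is a minimal, self-contained sketch of the sharing decision this patch implements. It is an illustration only, not vllm-ascend code: `DraftModel` and `maybe_share_embeddings` are hypothetical stand-ins, and only the `has_own_embed_tokens` flag, the CPU-side `torch.equal` comparison, and the delete-then-rebind step are taken from the diff above.

```python
import torch
import torch.nn as nn


class DraftModel(nn.Module):
    """Hypothetical stand-in for an EAGLE/MTP draft model wrapper."""

    def __init__(self, vocab_size: int, hidden_size: int,
                 has_own_embed_tokens: bool | None = None):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
        # EAGLE models expose this flag; MTP models do not, so `None`
        # here means "leave the attribute unset" to simulate MTP.
        if has_own_embed_tokens is not None:
            self.has_own_embed_tokens = has_own_embed_tokens


def maybe_share_embeddings(draft: DraftModel,
                           target_embed: nn.Embedding) -> bool:
    """Decide whether the draft model should reuse the target embeddings."""
    if hasattr(draft, "has_own_embed_tokens"):
        # EAGLE: share when the checkpoint carries no embed_tokens of its
        # own, or when its weights are bit-identical to the target's.
        # The comparison runs on CPU, mirroring the patch's choice to
        # avoid extra device memory during the check.
        share = (not draft.has_own_embed_tokens) or torch.equal(
            target_embed.weight.cpu(), draft.embed_tokens.weight.cpu())
    else:
        # MTP: always reuse the target model's embeddings.
        share = True
    if share:
        # Drop the draft's own table before rebinding, as the patch does.
        del draft.embed_tokens
        draft.embed_tokens = target_embed
    return share


if __name__ == "__main__":
    target = nn.Embedding(8, 4)

    eagle = DraftModel(8, 4, has_own_embed_tokens=True)
    print(maybe_share_embeddings(eagle, target))   # False: distinct weights

    eagle.embed_tokens.weight.data.copy_(target.weight)
    print(maybe_share_embeddings(eagle, target))   # True: identical weights

    mtp = DraftModel(8, 4)  # no has_own_embed_tokens attribute
    print(maybe_share_embeddings(mtp, target))     # True: MTP always shares
```

Note the design choice the sketch mirrors: the patch rebinds the attribute rather than copying weights, so the draft and target models share a single embedding tensor and no duplicate vocab-size table is kept on the device.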