From ff63626874fae037c1b7fd8cf1fb87e0493e838c Mon Sep 17 00:00:00 2001
From: zhaomingyu13
Date: Fri, 23 Jan 2026 16:12:56 +0800
Subject: [PATCH] [Bugfix] Fix acceptance rate decline for Qwen3-30B-A3B-EAGLE3
 (#6138)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What this PR does / why we need it?
Because this code had long been out of sync with upstream, the fix for bug #5967 introduced a regression that lowered the acceptance rate of the Qwen3-30B-A3B-EAGLE3 draft model. This PR re-synchronizes with upstream and fixes the regression.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
```python
from vllm import LLM, SamplingParams


def main():
    prompts = [
        "The future of AI is",
    ]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    # Create an LLM with an EAGLE3 draft model.
    llm = LLM(
        model="Qwen/Qwen3-30B-A3B",
        tensor_parallel_size=4,
        gpu_memory_utilization=0.9,
        enforce_eager=True,
        speculative_config={
            "method": "eagle3",
            "model": "AngelSlim/Qwen3-a3B_eagle3",
            "num_speculative_tokens": 3,
        },
    )
    # Generate texts from the prompts.
    outputs = llm.generate(prompts, sampling_params)
    print(f"Outputs: {outputs}")
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
    main()
```

- vLLM version: v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/d68209402ddab3f54a09bc1f4de9a9495a283b60

Signed-off-by: zhaomingyu
Co-authored-by: drslark
---
 vllm_ascend/spec_decode/eagle_proposer.py | 50 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 10 deletions(-)

diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index 789dd091..fb2928e6 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -226,19 +226,49 @@ class EagleProposer(VllmEagleProposer):
         )
         # If pp>1, the weights of mtp and the main model's embedding are not on the same device.
         # check if mtp model use main model's embedding and LMhead
-        if hasattr(model, "model") and hasattr(model.model, "embed_tokens") and \
-            torch.equal(self.model.model.embed_tokens.weight,
-                        model.model.embed_tokens.weight):
-            logger.info(
-                "The EAGLE head shares the same vocab embedding" \
-                " with the target model."
-            )
-            self.model.model.embed_tokens = target_embed_tokens
+        share_embeddings = False
+        if hasattr(self.model, "has_own_embed_tokens"):
+            # EAGLE model
+            if not self.model.has_own_embed_tokens:
+                share_embeddings = True
+                logger.info(
+                    "Detected EAGLE model without its own embed_tokens in the"
+                    " checkpoint. Sharing target model embedding weights with the"
+                    " draft model."
+                )
+            elif (
+                isinstance(target_embed_tokens.weight, torch.Tensor)
+                and isinstance(self.model.model.embed_tokens.weight, torch.Tensor)
+                # TODO: Offload to CPU for comparison to avoid extra NPU memory
+                # usage in CI testing environments with limited NPU memory
+                and torch.equal(
+                    target_embed_tokens.weight.cpu(),
+                    self.model.model.embed_tokens.weight.cpu(),
+                )
+            ):
+                share_embeddings = True
+                logger.info(
+                    "Detected EAGLE model with embed_tokens identical to the target"
+                    " model. Sharing target model embedding weights with the draft"
+                    " model."
+                )
+            else:
+                logger.info(
+                    "Detected EAGLE model with distinct embed_tokens weights. "
+                    "Keeping separate embedding weights from the target model."
+                )
         else:
+            # MTP model
+            share_embeddings = True
             logger.info(
-                " The EAGLE head loaded its own vocab embedding" \
-                " weights instead of sharing them with the target model."
+                "Detected MTP model. "
+                "Sharing target model embedding weights with the draft model."
             )
+
+        if share_embeddings:
+            if hasattr(self.model.model, "embed_tokens"):
+                del self.model.model.embed_tokens
+            self.model.model.embed_tokens = target_embed_tokens
         else:
             logger.info(
                 "Since PP > 1 or other reasons the model head loaded its own vocab embedding" \
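
For reviewers who want to trace the new control flow outside the diff, below is a minimal, self-contained sketch of the sharing decision this patch implements. It is an illustration only, not vllm-ascend code: `DraftModel` and `maybe_share_embeddings` are hypothetical stand-ins, and only the `has_own_embed_tokens` flag, the CPU-side `torch.equal` comparison, and the delete-then-rebind step are taken from the diff above.

```python
import torch
import torch.nn as nn


class DraftModel(nn.Module):
    """Hypothetical stand-in for an EAGLE/MTP draft model wrapper."""

    def __init__(self, vocab_size: int, hidden_size: int,
                 has_own_embed_tokens: bool | None = None):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
        # EAGLE models expose this flag; MTP models do not, so `None`
        # here means "leave the attribute unset" to simulate MTP.
        if has_own_embed_tokens is not None:
            self.has_own_embed_tokens = has_own_embed_tokens


def maybe_share_embeddings(draft: DraftModel,
                           target_embed: nn.Embedding) -> bool:
    """Decide whether the draft model should reuse the target embeddings."""
    if hasattr(draft, "has_own_embed_tokens"):
        # EAGLE: share when the checkpoint carries no embed_tokens of its
        # own, or when its weights are bit-identical to the target's.
        # The comparison runs on CPU, mirroring the patch's choice to
        # avoid extra device memory during the check.
        share = (not draft.has_own_embed_tokens) or torch.equal(
            target_embed.weight.cpu(), draft.embed_tokens.weight.cpu())
    else:
        # MTP: always reuse the target model's embeddings.
        share = True
    if share:
        # Drop the draft's own table before rebinding, as the patch does.
        del draft.embed_tokens
        draft.embed_tokens = target_embed
    return share


if __name__ == "__main__":
    target = nn.Embedding(8, 4)

    eagle = DraftModel(8, 4, has_own_embed_tokens=True)
    print(maybe_share_embeddings(eagle, target))   # False: distinct weights

    eagle.embed_tokens.weight.data.copy_(target.weight)
    print(maybe_share_embeddings(eagle, target))   # True: identical weights

    mtp = DraftModel(8, 4)  # no has_own_embed_tokens attribute
    print(maybe_share_embeddings(mtp, target))     # True: MTP always shares
```

Note the design choice the sketch mirrors: the patch rebinds the attribute rather than copying weights, so the draft and target models share a single embedding tensor and no duplicate vocab-size table is kept on the device.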