[Bugfix] Fix the issue of the acceptance rate decline for Qwen3-30B-A3B-EAGLE3 (#6138)

### What this PR does / why we need it?

Because this code had not been synchronized with the upstream code for a long time, a regression that lowered the acceptance rate of the Qwen3-30B-A3B-EAGLE3 draft model was introduced while fixing the bug (#5967). This PR synchronizes the code with upstream and fixes that regression.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

```python
from vllm import LLM, SamplingParams


def main():
    prompts = [
        "The future of AI is",
    ]

    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    # Create an LLM.
    llm = LLM(
        model="Qwen/Qwen3-30B-A3B",
        tensor_parallel_size=4,
        gpu_memory_utilization=0.9,
        enforce_eager=True,
        speculative_config={
            "method": "eagle3",
            "model": "AngelSlim/Qwen3-a3B_eagle3",
            "num_speculative_tokens": 3,
        },
    )

    # Generate texts from the prompts.
    outputs = llm.generate(prompts, sampling_params)
    print(f"Outputs: {outputs}")
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

- vLLM version: v0.13.0
- vLLM main: d68209402d

Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
Co-authored-by: drslark <slarkblood@qq.com>
2026-01-23 16:12:56 +08:00
parent a3079cd253
commit ff63626874
1 changed files with 40 additions and 10 deletions
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -226,19 +226,49 @@ class EagleProposer(VllmEagleProposer):
                )
            # If pp>1, the weights of mtp and the main model's embedding are not on the same device.
            # check if mtp model use main model's embedding and LMhead
-            if hasattr(model, "model") and hasattr(model.model, "embed_tokens") and \
+            share_embeddings = False
-                    torch.equal(self.model.model.embed_tokens.weight,
+            if hasattr(self.model, "has_own_embed_tokens"):
-                                model.model.embed_tokens.weight):
+                # EAGLE model
-                logger.info(
+                if not self.model.has_own_embed_tokens:
-                    "The EAGLE head shares the same vocab embedding" \
+                    share_embeddings = True
-                    " with the target model."
+                    logger.info(
-                )
+                        "Detected EAGLE model without its own embed_tokens in the"
-                self.model.model.embed_tokens = target_embed_tokens
+                        " checkpoint. Sharing target model embedding weights with the"
                        " draft model."
                    )
                elif (
                    isinstance(target_embed_tokens.weight, torch.Tensor)
                    and isinstance(self.model.model.embed_tokens.weight, torch.Tensor)
                    # TODO: Offload to CPU for comparison to avoid extra NPU memory
                    # usage in CI testing environments with limited NPU memory
                    and torch.equal(
                        target_embed_tokens.weight.cpu(),
                        self.model.model.embed_tokens.weight.cpu(),
                    )
                ):
                    share_embeddings = True
                    logger.info(
                        "Detected EAGLE model with embed_tokens identical to the target"
                        " model. Sharing target model embedding weights with the draft"
                        " model."
                    )
                else:
                    logger.info(
                        "Detected EAGLE model with distinct embed_tokens weights. "
                        "Keeping separate embedding weights from the target model."
                    )
            else:
                # MTP model
                share_embeddings = True
                logger.info(
-                    " The EAGLE head loaded its own vocab embedding" \
+                    "Detected MTP model. "
-                    " weights instead of sharing them with the target model."
+                    "Sharing target model embedding weights with the draft model."
                )
            if share_embeddings:
                if hasattr(self.model.model, "embed_tokens"):
                    del self.model.model.embed_tokens
                self.model.model.embed_tokens = target_embed_tokens
        else:
            logger.info(
                "Since PP > 1 or other reasons the model head loaded its own vocab embedding" \