[Bugfix] Fix the issue of the acceptance rate decline for Qwen3-30B-A3B-EAGLE3 (#6138)
### What this PR does / why we need it?
Because this code had not been synchronized with the upstream code for a
long time, the fix for bug #5967 introduced a regression that lowered the
acceptance rate of the Qwen3-30B-A3B-EAGLE3 draft model. This PR
synchronizes with upstream and fixes that regression.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
```python
from vllm import LLM, SamplingParams
def main():
prompts = [
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(
model="Qwen/Qwen3-30B-A3B",
tensor_parallel_size=4,
gpu_memory_utilization=0.9,
enforce_eager=True,
speculative_config={
"method": "eagle3",
"model": "AngelSlim/Qwen3-a3B_eagle3",
"num_speculative_tokens": 3,
},
)
# Generate texts from the prompts.
outputs = llm.generate(prompts, sampling_params)
print(f"Outputs: {outputs}")
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.13.0
- vLLM main:
d68209402d
Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
Co-authored-by: drslark <slarkblood@qq.com>
This commit is contained in:
@@ -226,19 +226,49 @@ class EagleProposer(VllmEagleProposer):
|
|||||||
)
|
)
|
||||||
# If pp>1, the weights of mtp and the main model's embedding are not on the same device.
|
# If pp>1, the weights of mtp and the main model's embedding are not on the same device.
|
||||||
# check if mtp model use main model's embedding and LMhead
|
# check if mtp model use main model's embedding and LMhead
|
||||||
if hasattr(model, "model") and hasattr(model.model, "embed_tokens") and \
|
share_embeddings = False
|
||||||
torch.equal(self.model.model.embed_tokens.weight,
|
if hasattr(self.model, "has_own_embed_tokens"):
|
||||||
model.model.embed_tokens.weight):
|
# EAGLE model
|
||||||
logger.info(
|
if not self.model.has_own_embed_tokens:
|
||||||
"The EAGLE head shares the same vocab embedding" \
|
share_embeddings = True
|
||||||
" with the target model."
|
logger.info(
|
||||||
)
|
"Detected EAGLE model without its own embed_tokens in the"
|
||||||
self.model.model.embed_tokens = target_embed_tokens
|
" checkpoint. Sharing target model embedding weights with the"
|
||||||
|
" draft model."
|
||||||
|
)
|
||||||
|
elif (
|
||||||
|
isinstance(target_embed_tokens.weight, torch.Tensor)
|
||||||
|
and isinstance(self.model.model.embed_tokens.weight, torch.Tensor)
|
||||||
|
# TODO: Offload to CPU for comparison to avoid extra NPU memory
|
||||||
|
# usage in CI testing environments with limited NPU memory
|
||||||
|
and torch.equal(
|
||||||
|
target_embed_tokens.weight.cpu(),
|
||||||
|
self.model.model.embed_tokens.weight.cpu(),
|
||||||
|
)
|
||||||
|
):
|
||||||
|
share_embeddings = True
|
||||||
|
logger.info(
|
||||||
|
"Detected EAGLE model with embed_tokens identical to the target"
|
||||||
|
" model. Sharing target model embedding weights with the draft"
|
||||||
|
" model."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.info(
|
||||||
|
"Detected EAGLE model with distinct embed_tokens weights. "
|
||||||
|
"Keeping separate embedding weights from the target model."
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
|
# MTP model
|
||||||
|
share_embeddings = True
|
||||||
logger.info(
|
logger.info(
|
||||||
" The EAGLE head loaded its own vocab embedding" \
|
"Detected MTP model. "
|
||||||
" weights instead of sharing them with the target model."
|
"Sharing target model embedding weights with the draft model."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if share_embeddings:
|
||||||
|
if hasattr(self.model.model, "embed_tokens"):
|
||||||
|
del self.model.model.embed_tokens
|
||||||
|
self.model.model.embed_tokens = target_embed_tokens
|
||||||
else:
|
else:
|
||||||
logger.info(
|
logger.info(
|
||||||
"Since PP > 1 or other reasons the model head loaded its own vocab embedding" \
|
"Since PP > 1 or other reasons the model head loaded its own vocab embedding" \
|
||||||
|
|||||||
Reference in New Issue
Block a user