Eagle3 mm support, enablement on qwen3vl (#4848)
### What this PR does / why we need it?
Follows upstream PR [vllm-project/vllm#20788](https://github.com/vllm-project/vllm/pull/20788): adds Eagle3 multimodal support and enables it on Qwen3-VL.

Target model: [Qwen/Qwen3-VL-8B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct)
Eagle3 draft model: [MNN/Qwen3-VL-8B-Instruct-Eagle3](https://www.modelscope.cn/models/MNN/Qwen3-VL-8B-Instruct-Eagle3)
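For quick reference, the same model pairing can be exercised offline; a minimal sketch (model IDs as linked above, all other settings illustrative and not part of this PR):

```python
# Minimal offline sketch of Eagle3 speculative decoding on Qwen3-VL.
# Note: the ModelScope ID may need VLLM_USE_MODELSCOPE=1 or a local path.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen3-VL-8B-Instruct",
    speculative_config={
        "method": "eagle3",
        "model": "MNN/Qwen3-VL-8B-Instruct-Eagle3",
        "num_speculative_tokens": 3,
    },
)
out = llm.chat(
    [{"role": "user", "content": "Summarize EAGLE-3 speculative decoding in one sentence."}],
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(out[0].outputs[0].text)
```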
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
```bash
pytest ./tests/e2e/singlecard/test_completion_with_prompt_embeds.py -vv
```

vLLM with Eagle3:
```bash
vllm serve /model/Qwen3-VL-8B-Instruct --enforce-eager --port 9100 --max-model-len 32768 --max-num-seqs 32 --tensor-parallel-size 2 --allowed-local-media-path /model/gx/images --speculative-config '{
"method": "eagle3",
"model": "/model/hf/Qwen3-VL-8B-Instruct-Eagle3",
"num_speculative_tokens": 3
}'
```
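A smoke request against the server started above, via the standard OpenAI-compatible chat endpoint; `example.jpg` is a placeholder, and `file://` URLs are only accepted for paths under `--allowed-local-media-path` (`/model/gx/images` in the command above):

```python
# Sketch: one multimodal request to exercise the Eagle3 + image path.
import requests

resp = requests.post(
    "http://127.0.0.1:9100/v1/chat/completions",
    json={
        "model": "/model/Qwen3-VL-8B-Instruct",
        "messages": [{
            "role": "user",
            "content": [
                # "example.jpg" is a placeholder file name.
                {"type": "image_url",
                 "image_url": {"url": "file:///model/gx/images/example.jpg"}},
                {"type": "text", "text": "Describe this image briefly."},
            ],
        }],
        "temperature": 0,
        "max_tokens": 128,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```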
vLLM without Eagle3:
```bash
vllm serve /model/Qwen3-VL-8B-Instruct --enforce-eager --port 9100 --max-model-len 32768 --max-num-seqs 32 --tensor-parallel-size 2 --allowed-local-media-path /model/gx/images
```
Benchmark:
```bash
vllm bench serve --backend openai-chat --base-url http://127.0.0.1:9100 --tokenizer /model/Qwen3-VL-8B-Instruct --endpoint /v1/chat/completions --model /model/Qwen3-VL-8B-Instruct --dataset-name random --num-prompts 50 --max-concurrency 5 --temperature 0 --top-p 1.0 --seed 123
```
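To sanity-check that drafts are actually being accepted during the benchmark, one can scrape the server's Prometheus endpoint. A sketch, assuming the v1 spec-decode counter names (`vllm:spec_decode_num_draft_tokens_total`, `vllm:spec_decode_num_accepted_tokens_total`); these names may vary across vLLM versions:

```python
# Sketch: estimate the draft-token acceptance rate from /metrics.
import urllib.request

text = urllib.request.urlopen("http://127.0.0.1:9100/metrics").read().decode()

def counter_total(name: str) -> float:
    # Sum every sample of the counter, ignoring label sets.
    return sum(
        float(line.rsplit(" ", 1)[-1])
        for line in text.splitlines()
        if line.startswith(name)
    )

drafted = counter_total("vllm:spec_decode_num_draft_tokens_total")
accepted = counter_total("vllm:spec_decode_num_accepted_tokens_total")
if drafted:
    print(f"acceptance rate: {accepted / drafted:.2%}")
```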
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: jesse <szxfml@gmail.com>
E2E test additions (new fixtures and `test_qwen3_vl_eagle_correctness`):

```diff
@@ -85,6 +85,14 @@ def eagle3_model_name():
     return "vllm-ascend/EAGLE3-LLaMA3.1-Instruct-8B"
 
 
+@pytest.fixture
+def vl_model_name():
+    return "Qwen/Qwen3-VL-8B-Instruct"
+
+
+def vl_eagle3_model_name():
+    return "MNN/Qwen3-VL-8B-Instruct-Eagle3"
+
 
 def test_ngram_correctness(
     test_prompts: list[list[dict[str, Any]]],
     sampling_config: SamplingParams,
@@ -129,6 +137,48 @@ def test_ngram_correctness(
     assert matches > int(0.66 * len(ref_outputs))
 
 
+def test_qwen3_vl_eagle_correctness(
+    test_prompts: list[list[dict[str, Any]]],
+    sampling_config: SamplingParams,
+    vl_model_name: str,
+):
+    '''
+    Compare the outputs of a original LLM and a speculative LLM
+    should be the same when using eagle speculative decoding.
+    '''
+    with VllmRunner(
+            vl_model_name,
+            max_model_len=1024,
+            cudagraph_capture_sizes=[1, 2, 4, 8],
+    ) as ref_llm:
+        ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
+
+    spec_model_name = vl_eagle3_model_name()
+    with VllmRunner(
+            vl_model_name,
+            speculative_config={
+                "method": "eagle3",
+                "model": spec_model_name,
+                "num_speculative_tokens": 2,
+            },
+            max_model_len=1024,
+            cudagraph_capture_sizes=[1, 2, 4, 8],
+    ) as runner:
+        spec_outputs = runner.model.chat(test_prompts, sampling_config)
+
+    matches = 0
+    misses = 0
+    for ref_output, spec_output in zip(ref_outputs, spec_outputs):
+        if ref_output.outputs[0].text == spec_output.outputs[0].text:
+            matches += 1
+        else:
+            misses += 1
+            print(f"ref_output: {ref_output.outputs[0].text}")
+            print(f"spec_output: {spec_output.outputs[0].text}")
+
+    # Heuristic: expect at least 66% of the prompts to match exactly
+    # Upon failure, inspect the outputs to check for inaccuracy.
+    assert matches > int(0.66 * len(ref_outputs))
+
+
 def test_suffix_correctness(
     test_prompts: list[list[dict[str, Any]]],
     sampling_config: SamplingParams,
```
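Note on the threshold: `assert matches > int(0.66 * len(ref_outputs))` requires strictly more than 66% exact matches. For example, with 50 prompts, `int(0.66 * 50) == 33`, so at least 34 of the spec-decode outputs must match the reference verbatim for the test to pass.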
Unit-test updates for the Eagle proposer (multimodal-related mock attributes and the expected `get_language_model` call count):

```diff
@@ -50,6 +50,7 @@ class TestEagleProposerInitialization(TestBase):
         self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096
         self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
         self.vllm_config.model_config.enforce_eager = False
+        self.vllm_config.model_config.uses_mrope = False
         self.vllm_config.speculative_config.enforce_eager = False
         self.vllm_config.scheduler_config.async_scheduling = False
         init_ascend_config(self.vllm_config)
@@ -156,6 +157,7 @@ class TestEagleProposerLoadModel(TestBase):
         }]
 
         mock_model = MagicMock()
+        mock_model.supports_multimodal = False
         mock_model.model.embed_tokens = MagicMock()
         mock_model.lm_head = MagicMock()
         mock_model.multimodal_cpu_fields = None
@@ -226,7 +228,7 @@ class TestEagleProposerLoadModel(TestBase):
         self.proposer.name = SpecDcodeType.EAGLE
 
         self.proposer.load_model(mock_model)
-        mock_model.get_language_model.assert_called_once()
+        self.assertEqual(mock_model.get_language_model.call_count, 2)
         self.assertIs(self.proposer.model.lm_head,
                       mock_model.get_language_model.return_value.lm_head)
```
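A plausible reading of the last hunk: with the multimodal path in place, `load_model` now consults the target's `get_language_model()` twice rather than once; the `assertIs` pins the proposer's `lm_head` to the one returned by the language model, hence the updated expectation of `call_count == 2`.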