import unittest
from unittest.mock import MagicMock, patch

import numpy as np
import torch
from vllm.config import (CacheConfig, CompilationMode, CUDAGraphMode,
                         VllmConfig, set_current_vllm_config)

from tests.ut.base import TestBase
from vllm_ascend.ascend_config import init_ascend_config
from vllm_ascend.spec_decode.eagle_proposer import AscendEagleProposer
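# Unit tests for AscendEagleProposer. Everything below runs on CPU against
# heavily mocked vLLM configs, so no NPU device or real model weights are
# required; the mocks stub out the config attributes the proposer reads.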
class TestEagleProposerInitialization(TestBase):

    def setUp(self):
        self.vllm_config = MagicMock(spec=VllmConfig)
        self.vllm_config.speculative_config = MagicMock()
        self.vllm_config.cache_config = MagicMock(spec=CacheConfig)
        self.vllm_config.scheduler_config = MagicMock()
        self.vllm_config.model_config = MagicMock()
        self.vllm_config.model_config.hf_text_config = MagicMock(
            spec=[]
        )  # Empty spec to prevent hasattr from returning True
        self.vllm_config.model_config.hf_text_config.to_dict = MagicMock(return_value={})
        self.vllm_config.compilation_config = MagicMock()
        self.device = torch.device("cpu")
        self.runner = MagicMock()
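        # Plain values for the runner/config attributes the proposer reads
        # during construction.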
        self.runner.pin_memory = False
        self.runner.pcp_size = 1
        self.runner.dcp_size = 1

        self.vllm_config.cache_config.block_size = 16
        self.vllm_config.scheduler_config.max_num_batched_tokens = 1024
        self.vllm_config.scheduler_config.max_num_seqs = 32
        self.vllm_config.model_config.dtype = torch.float16
        self.vllm_config.model_config.max_model_len = 2048
        self.vllm_config.model_config.uses_mrope = False
        self.vllm_config.model_config.uses_xdrope_dim = 0
        self.vllm_config.parallel_config.tensor_parallel_size = 1
        self.vllm_config.parallel_config.data_parallel_rank = 0
        self.vllm_config.parallel_config.data_parallel_size = 1
        self.vllm_config.parallel_config.prefill_context_parallel_size = 1
        self.vllm_config.parallel_config.enable_expert_parallel = False
        self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
        self.vllm_config.speculative_config.num_speculative_tokens = 2
        self.vllm_config.speculative_config.speculative_token_tree = str([(i + 1) * (0,) for i in range(2)])
        self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
        self.vllm_config.speculative_config.draft_model_config.uses_mrope = False
        self.vllm_config.speculative_config.disable_padded_drafter_batch = False
        self.vllm_config.additional_config = None
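        # Patch CpuGpuBuffer and the multimodal-registry check so the base
        # EagleProposer can be constructed in this CPU-only test environment.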
        self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
        self.mock_cpugpubuffer.start()
        self.mock_supports_multimodal_inputs = patch(
            "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
        )
        self.mock_supports_multimodal_inputs.start()

        # Set the current vllm config
        set_current_vllm_config(self.vllm_config)

    def tearDown(self):
        self.mock_cpugpubuffer.stop()
        self.mock_supports_multimodal_inputs.stop()
        # Clear the current vllm config
        set_current_vllm_config(None)

    def test_initialization_eagle_graph(self):
        self.vllm_config.speculative_config.method = "eagle"
        self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096
        self.vllm_config.speculative_config.draft_model_config.uses_mrope = False
        self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
        self.vllm_config.model_config.enforce_eager = False
        self.vllm_config.model_config.uses_mrope = False
        self.vllm_config.speculative_config.enforce_eager = False
        self.vllm_config.scheduler_config.async_scheduling = False
        init_ascend_config(self.vllm_config)

        with set_current_vllm_config(self.vllm_config):
            proposer = AscendEagleProposer(vllm_config=self.vllm_config, device=self.device, runner=self.runner)

        self.assertEqual(proposer.hidden_size, 4096)
        self.assertTrue(proposer.use_cuda_graph)

        expected_max_num_tokens = proposer.max_num_tokens
        self.assertEqual(proposer.input_ids.shape, (expected_max_num_tokens,))
        self.assertEqual(proposer.positions.shape, (expected_max_num_tokens,))
        self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 4096))
        self.assertEqual(proposer.arange.shape, (expected_max_num_tokens,))

    def test_initialization_eagle3_enforce_eager(self):
        self.vllm_config.speculative_config.method = "eagle3"
        self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048
        self.vllm_config.compilation_config.mode = CompilationMode.NONE
        self.vllm_config.compilation_config.pass_config = MagicMock()
        self.vllm_config.compilation_config.pass_config.enable_sp = False
        self.vllm_config.model_config.enforce_eager = True
        init_ascend_config(self.vllm_config)

        with set_current_vllm_config(self.vllm_config):
            proposer = AscendEagleProposer(vllm_config=self.vllm_config, device=self.device, runner=self.runner)

        self.assertEqual(proposer.hidden_size, 2048)
        self.assertFalse(proposer.use_cuda_graph)

        expected_max_num_tokens = proposer.max_num_tokens
        self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 2048))

    def test_initialization_eagle3_full_graph_async(self):
        self.vllm_config.speculative_config.method = "eagle3"
        self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048
        self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
        self.vllm_config.model_config.enforce_eager = False
        self.vllm_config.speculative_config.enforce_eager = False
        self.vllm_config.scheduler_config.async_scheduling = True
        init_ascend_config(self.vllm_config)

        with set_current_vllm_config(self.vllm_config):
            proposer = AscendEagleProposer(vllm_config=self.vllm_config, device=self.device, runner=self.runner)
        self.assertEqual(proposer.hidden_size, 2048)
        self.assertTrue(proposer.use_cuda_graph)

        expected_max_num_tokens = proposer.max_num_tokens
        self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 2048))

    def test_initialization_mtp_full_graph_async(self):
        self.vllm_config.speculative_config.method = "mtp"
        self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048
        self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
        self.vllm_config.model_config.enforce_eager = False
        self.vllm_config.speculative_config.enforce_eager = False
        self.vllm_config.scheduler_config.async_scheduling = True
        init_ascend_config(self.vllm_config)

        with set_current_vllm_config(self.vllm_config):
            proposer = AscendEagleProposer(vllm_config=self.vllm_config, device=self.device, runner=self.runner)

        self.assertEqual(proposer.hidden_size, 2048)
        self.assertFalse(proposer.use_cuda_graph)

        expected_max_num_tokens = proposer.max_num_tokens
        self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 2048))
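
# The load_model tests below are currently skipped (see the note on #7153);
# they verify which draft attention layers are kept and whether embed_tokens
# and lm_head are shared with the target model.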
@unittest.skip("Skip due to the changes in #7153, fix me later")
class TestEagleProposerLoadModel(TestBase):

    def setUp(self):
        self.vllm_config = MagicMock(spec=VllmConfig)
        self.vllm_config.speculative_config = MagicMock()
        self.vllm_config.speculative_config.method = "eagle"
        self.device = torch.device("cpu")
        self.runner = MagicMock()
        self.runner.pin_memory = False
        self.runner.pcp_size = 1
        self.runner.dcp_size = 1

        self.vllm_config.cache_config.block_size = 16
        self.vllm_config.scheduler_config.max_num_batched_tokens = 1024
        self.vllm_config.scheduler_config.max_num_seqs = 32
        self.vllm_config.model_config.dtype = torch.float16
        self.vllm_config.model_config.max_model_len = 2048
        self.vllm_config.model_config.uses_mrope = False
        self.vllm_config.model_config.uses_xdrope_dim = 0
        self.vllm_config.parallel_config.tensor_parallel_size = 1
        self.vllm_config.parallel_config.data_parallel_rank = 0
        self.vllm_config.parallel_config.data_parallel_size = 1
        self.vllm_config.parallel_config.prefill_context_parallel_size = 1
        self.vllm_config.parallel_config.enable_expert_parallel = False
        self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
        self.vllm_config.speculative_config.num_speculative_tokens = 2
        self.vllm_config.speculative_config.speculative_token_tree = str([(i + 1) * (0,) for i in range(2)])
        self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
        self.vllm_config.speculative_config.draft_model_config.uses_mrope = False
        self.vllm_config.speculative_config.disable_padded_drafter_batch = False
        self.vllm_config.additional_config = None
        init_ascend_config(self.vllm_config)

        self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
        self.mock_cpugpubuffer.start()
        self.mock_supports_multimodal_inputs = patch(
            "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
        )
        self.mock_supports_multimodal_inputs.start()

        # Set the current vllm config
        set_current_vllm_config(self.vllm_config)
        self.proposer = AscendEagleProposer(vllm_config=self.vllm_config, device=self.device, runner=self.runner)
        self.proposer.parallel_drafting = False

    def tearDown(self):
        self.mock_cpugpubuffer.stop()
        self.mock_supports_multimodal_inputs.stop()
        # Clear the current vllm config
        set_current_vllm_config(None)

    @patch("vllm_ascend.spec_decode.eagle_proposer.get_layers_from_vllm_config")
    @patch("vllm_ascend.spec_decode.eagle_proposer.get_model")
    @patch("vllm_ascend.spec_decode.eagle_proposer.get_pp_group")
    def test_load_model_pp1(self, mock_pp_group, mock_get_model, mock_get_layers):
        mock_pp_group.return_value.world_size = 1
        mock_target_layer1 = MagicMock()
        mock_target_layer2 = MagicMock()
        mock_draft_layer1 = MagicMock()
        mock_draft_layer3 = MagicMock()
        mock_get_layers.side_effect = [
            {"layer1": mock_target_layer1, "layer2": mock_target_layer2},
            {},
            {},
            {"layer1": mock_draft_layer1, "layer3": mock_draft_layer3},
        ]

        weight = torch.zeros(0)

        mock_model = MagicMock()
        mock_model.supports_multimodal = False
        mock_model.lm_head = MagicMock()
        mock_model.multimodal_cpu_fields = None
        mock_model.merge_by_field_config = None
        mock_model.model.embed_tokens = MagicMock()
        mock_model.model.embed_tokens.weight = weight

        mock_get_model.return_value = MagicMock()
        mock_get_model.return_value.model.embed_tokens.weight = weight

        with set_current_vllm_config(self.vllm_config):
            self.proposer.load_model(mock_model)

        mock_get_model.assert_called_once()
        self.assertEqual(self.proposer.attn_layer_names, ["layer3"])
        self.assertIs(self.proposer.model.model.embed_tokens, mock_model.model.embed_tokens)

    @patch("vllm_ascend.spec_decode.eagle_proposer.get_layers_from_vllm_config")
    @patch("vllm_ascend.spec_decode.eagle_proposer.get_model")
    @patch("vllm_ascend.spec_decode.eagle_proposer.get_pp_group")
    def test_load_model_pp_gt1(self, mock_pp_group, mock_get_model, mock_get_layers):
        mock_pp_group.return_value.world_size = 2
        mock_target_layer1 = MagicMock()
        mock_draft_layer2 = MagicMock()

        mock_get_layers.side_effect = [{"layer1": mock_target_layer1}, {}, {}, {"layer2": mock_draft_layer2}]

        mock_model = MagicMock()
        original_embed = MagicMock()
        mock_model.multimodal_cpu_fields = None
        mock_model.merge_by_field_config = None
        mock_get_model.return_value = MagicMock(model=MagicMock(embed_tokens=original_embed))

        with set_current_vllm_config(self.vllm_config):
            self.proposer.load_model(mock_model)

        self.assertIsNot(self.proposer.model.model.embed_tokens, mock_model.model.embed_tokens)
        self.assertEqual(self.proposer.attn_layer_names, ["layer2"])

    @patch("vllm_ascend.spec_decode.eagle_proposer.get_layers_from_vllm_config")
    @patch("vllm_ascend.spec_decode.eagle_proposer.get_model")
    @patch("vllm_ascend.spec_decode.eagle_proposer.get_pp_group")
    @patch("vllm_ascend.spec_decode.eagle_proposer.supports_multimodal")
    def test_load_model_multimodal(self, mock_supports_multi, mock_pp_group, mock_get_model, mock_get_layers):
        mock_model = MagicMock()
        mock_model.get_language_model.return_value.lm_head = MagicMock()
        mock_supports_multi.return_value = True
        original_embed = MagicMock()
        mock_get_model.return_value = MagicMock(model=MagicMock(embed_tokens=original_embed))

        mock_target_layer1 = MagicMock()
        mock_draft_layer2 = MagicMock()

        mock_get_layers.side_effect = [{"layer1": mock_target_layer1}, {}, {}, {"layer2": mock_draft_layer2}]
        mock_pp_group.return_value.world_size = 2

        self.proposer.model = MagicMock()

        with set_current_vllm_config(self.vllm_config):
            self.proposer.load_model(mock_model)

        self.assertEqual(mock_model.get_language_model.call_count, 2)
        self.assertIs(self.proposer.model.lm_head, mock_model.get_language_model.return_value.lm_head)
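
# dummy_run tests: the draft model and the compiled runnable (`_runnable`) are
# replaced with mocks, so the assertions below only count how often the
# runnable is replayed under the different graph modes.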
class TestEagleProposerDummyRun(TestBase):

    def setUp(self):
        self.vllm_config = MagicMock(spec=VllmConfig)
        self.vllm_config.speculative_config = MagicMock()
        self.vllm_config.speculative_config.num_speculative_tokens = 4
        self.device = torch.device("cpu")
        self.runner = MagicMock()
        self.runner.pcp_size = 1
        self.runner.dcp_size = 1
        self.runner.pin_memory = False
        self.runner._sync_metadata_across_dp.return_value = (8, torch.tensor([8]), False)

        self.vllm_config.cache_config.block_size = 16
        self.vllm_config.scheduler_config.max_num_batched_tokens = 1024
        self.vllm_config.scheduler_config.max_num_seqs = 32
        self.vllm_config.model_config.dtype = torch.float16
        self.vllm_config.model_config.max_model_len = 2048
        self.vllm_config.model_config.uses_mrope = False
        self.vllm_config.model_config.uses_xdrope_dim = 0
        self.vllm_config.model_config.use_mla = False
        self.vllm_config.model_config.hf_text_config = MagicMock(
            spec=[]
        )  # Empty spec to prevent hasattr from returning True
        self.vllm_config.model_config.hf_text_config.to_dict = MagicMock(return_value={})
        self.vllm_config.parallel_config.tensor_parallel_size = 1
        self.vllm_config.parallel_config.data_parallel_rank = 0
        self.vllm_config.parallel_config.data_parallel_size = 1
        self.vllm_config.parallel_config.prefill_context_parallel_size = 1
        self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
        self.vllm_config.speculative_config.speculative_token_tree = str([(i + 1) * (0,) for i in range(4)])
        self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
        self.vllm_config.speculative_config.draft_model_config.uses_mrope = False
        self.vllm_config.speculative_config.disable_padded_drafter_batch = False
        self.vllm_config.additional_config = None
        init_ascend_config(self.vllm_config)

        self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
        self.mock_cpugpubuffer.start()
        self.mock_supports_multimodal_inputs = patch(
            "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
        )
        self.mock_supports_multimodal_inputs.start()

        # Mock parallel state functions
        self.mock_tp_world_size = patch(
            "vllm_ascend.ascend_forward_context.get_tensor_model_parallel_world_size", return_value=1
        )
        self.mock_tp_world_size.start()

        mock_dp_group = MagicMock()
        mock_dp_group.world_size = 1
        self.mock_dp_group = patch("vllm_ascend.ascend_forward_context.get_dp_group", return_value=mock_dp_group)
        self.mock_dp_group.start()

        # Set the current vllm config
        set_current_vllm_config(self.vllm_config)
        self.proposer = AscendEagleProposer(vllm_config=self.vllm_config, device=self.device, runner=self.runner)
        self.proposer.model = MagicMock()
        self.proposer._runnable = MagicMock()
        self.proposer.update_stream = MagicMock()

    def tearDown(self):
        self.mock_cpugpubuffer.stop()
        self.mock_supports_multimodal_inputs.stop()
        self.mock_tp_world_size.stop()
        self.mock_dp_group.stop()
        # Clear the current vllm config
        set_current_vllm_config(None)
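
    # Each dummy_run variant asserts that `_runnable` is invoked exactly once;
    # the graph-capture and graph-replay cases additionally check whether
    # update_full_graph_params is skipped or called.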
    # cpu does not support parallel-group, let alone `sp`
    @patch('vllm_ascend.ascend_forward_context.get_forward_context')
    @patch("vllm_ascend.spec_decode.eagle_proposer.get_forward_context",
           **{"return_value.flash_comm_v1_enabled": False})
    @patch("vllm_ascend.spec_decode.eagle_proposer.set_ascend_forward_context")
    def test_dummy_run_basic(self, mock_context, mock_get_context, mock_get_context_2):
        num_tokens = 32
        with_prefill = False

        # cpu does not support `torch.ops.vllm.maybe_pad_and_reduce`
        with set_current_vllm_config(self.vllm_config):
            self.proposer.enable_shared_expert_dp = False
            self.proposer.dummy_run(num_tokens=num_tokens, with_prefill=with_prefill)

        self.assertTrue(self.proposer._runnable.call_count == 1)

    # cpu does not support parallel-group, let alone `sp`
    @patch('vllm_ascend.ascend_forward_context.get_forward_context')
    @patch("vllm_ascend.spec_decode.eagle_proposer.get_forward_context",
           **{"return_value.flash_comm_v1_enabled": False})
    @patch("vllm_ascend.spec_decode.eagle_proposer.set_ascend_forward_context")
    def test_dummy_run_with_prefill(self, mock_context, mock_get_context, mock_get_context_2):
        mock_context.return_value.__enter__.return_value = None
        # cpu does not support `torch.ops.vllm.maybe_pad_and_reduce`
        with set_current_vllm_config(self.vllm_config):
            self.proposer.enable_shared_expert_dp = False
            self.proposer.dummy_run(num_tokens=64, with_prefill=True, num_reqs=4)

        self.assertTrue(self.proposer._runnable.call_count == 1)

    @patch('vllm_ascend.ascend_forward_context.get_forward_context')
    @patch("vllm_ascend.spec_decode.eagle_proposer.update_full_graph_params")
    @patch("vllm_ascend.spec_decode.eagle_proposer.get_forward_context")
    @patch("vllm_ascend.spec_decode.eagle_proposer.set_ascend_forward_context")
    def test_dummy_run_in_graph_capture(self, mock_context, mock_get_context,
                                        mock_update_full_graph_params, mock_get_context_2):
        last_use_cuda_graph = self.proposer.use_cuda_graph
        mock_return_context = MagicMock()
        mock_return_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
        mock_return_context.capturing = True
        # cpu does not support parallel-group, let alone `sp`
        mock_return_context.flash_comm_v1_enabled = False
        mock_get_context.return_value = mock_return_context
        mock_get_context_2.return_value = mock_return_context
        self.proposer.use_cuda_graph = True
        # cpu does not support `torch.ops.vllm.maybe_pad_and_reduce`
        with set_current_vllm_config(self.vllm_config):
            self.proposer.enable_shared_expert_dp = False
            self.proposer.dummy_run(num_tokens=64, in_graph_capturing=True, aclgraph_runtime_mode=CUDAGraphMode.FULL)

        self.assertTrue(self.proposer._runnable.call_count == 1)
        mock_update_full_graph_params.assert_not_called()
        self.proposer.use_cuda_graph = last_use_cuda_graph

    @patch('vllm_ascend.ascend_forward_context.get_forward_context')
    @patch("vllm_ascend.spec_decode.eagle_proposer.update_full_graph_params")
    @patch("vllm_ascend.spec_decode.eagle_proposer.get_forward_context")
    @patch("vllm_ascend.spec_decode.eagle_proposer.set_ascend_forward_context")
    def test_dummy_run_in_graph_run(self, mock_context, mock_get_context,
                                    mock_update_full_graph_params, mock_get_context_2):
        last_use_cuda_graph = self.proposer.use_cuda_graph
        mock_return_context = MagicMock()
        mock_return_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
        mock_return_context.capturing = False
        # cpu does not support parallel-group, let alone `sp`
        mock_return_context.flash_comm_v1_enabled = False
        mock_get_context.return_value = mock_return_context
        mock_get_context_2.return_value = mock_return_context
        self.proposer.use_cuda_graph = True
        self.proposer.draft_attn_groups = [MagicMock()]
        # cpu does not support `torch.ops.vllm.maybe_pad_and_reduce`
        with set_current_vllm_config(self.vllm_config):
            self.proposer.enable_shared_expert_dp = False
            self.proposer.dummy_run(num_tokens=64, in_graph_capturing=False, aclgraph_runtime_mode=CUDAGraphMode.FULL)

        self.assertTrue(self.proposer._runnable.call_count == 1)
        self.assertTrue(mock_update_full_graph_params.call_count == 1)
        self.proposer.use_cuda_graph = last_use_cuda_graph


class TestEagleProposerHelperMethods(TestBase):
    # TODO: Can add some tests about prepare_next_token_ids in future.

    def setUp(self):
        self.vllm_config = MagicMock(spec=VllmConfig)
        self.vllm_config.scheduler_config = MagicMock(max_num_seqs=3)
        self.device = torch.device("cpu")
        self.runner = MagicMock()
        self.runner.input_batch = MagicMock()
        self.runner.input_batch.req_ids = [0, 1, 2]
        self.runner.arange_np = np.arange(10)
        self.runner.input_batch.num_reqs = 3
        self.runner.pin_memory = False
        self.runner.pcp_size = 1
        self.runner.dcp_size = 1

        self.vllm_config.cache_config.block_size = 16
        self.vllm_config.scheduler_config.max_num_batched_tokens = 1024
        self.vllm_config.scheduler_config.max_num_seqs = 32
        self.vllm_config.model_config.dtype = torch.float16
        self.vllm_config.model_config.max_model_len = 2048
        self.vllm_config.model_config.uses_mrope = False
        self.vllm_config.model_config.uses_xdrope_dim = 0
        self.vllm_config.parallel_config.tensor_parallel_size = 1
        self.vllm_config.parallel_config.data_parallel_rank = 0
        self.vllm_config.parallel_config.data_parallel_size = 1
        self.vllm_config.parallel_config.prefill_context_parallel_size = 1
        self.vllm_config.parallel_config.enable_expert_parallel = False
        self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
        self.vllm_config.speculative_config.num_speculative_tokens = 2
        self.vllm_config.speculative_config.speculative_token_tree = str([(i + 1) * (0,) for i in range(2)])
        self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
        self.vllm_config.speculative_config.draft_model_config.uses_mrope = False
        self.vllm_config.speculative_config.disable_padded_drafter_batch = False
        self.vllm_config.additional_config = None
        init_ascend_config(self.vllm_config)

        self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
        self.mock_cpugpubuffer.start()
        self.mock_supports_multimodal_inputs = patch(
            "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
        )
        self.mock_supports_multimodal_inputs.start()

        # Set the current vllm config
        set_current_vllm_config(self.vllm_config)
        self.proposer = AscendEagleProposer(vllm_config=self.vllm_config, device=self.device, runner=self.runner)

    def tearDown(self):
        self.mock_cpugpubuffer.stop()
        self.mock_supports_multimodal_inputs.stop()
        # Clear the current vllm config
        set_current_vllm_config(None)

    # TODO: This is equivalent to disable_padded_drafter_batch=True.
    # We need to add a test_prepare_inputs_padded in future.
    def test_prepare_inputs(self):
        self.proposer.token_arange_np = np.arange(10)
        mock_attn = MagicMock()
        mock_attn.slot_mapping = torch.tensor([0, 1, 2, 3, 4, 5])
        num_rejected = torch.tensor([1, 0, 1], device=self.device)
        mock_return_attn = MagicMock()

        with (
                set_current_vllm_config(self.vllm_config),
                patch.object(self.proposer, "prepare_inputs", return_value=(mock_return_attn, torch.tensor([1, 2, 4]))),
        ):
            return_attn, indices = self.proposer.prepare_inputs(mock_attn, num_rejected)

        self.assertEqual(indices.tolist(), [1, 2, 4])