2025-11-27 21:59:31 +08:00
|
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
import torch
|
2025-12-29 09:54:51 +08:00
|
|
|
from vllm.config import CacheConfig, CompilationMode, CUDAGraphMode, VllmConfig
|
2025-11-27 21:59:31 +08:00
|
|
|
|
|
|
|
|
from tests.ut.base import TestBase
|
2025-12-29 16:25:52 +08:00
|
|
|
from vllm_ascend.ascend_config import init_ascend_config
|
2025-11-27 21:59:31 +08:00
|
|
|
from vllm_ascend.spec_decode.eagle_proposer import EagleProposer
|
|
|
|
|
from vllm_ascend.spec_decode.interface import SpecDcodeType
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestEagleProposerInitialization(TestBase):
    """Check the attributes ``EagleProposer.__init__`` derives from the config.

    Each test tweaks a mocked ``VllmConfig``, builds the proposer on CPU and
    asserts on ``hidden_size``, ``use_cuda_graph`` and the shapes of the
    buffers the proposer exposes.
    """

    def setUp(self):
        """Create the mocked config/runner and start the shared patchers."""
        self.vllm_config = MagicMock(spec=VllmConfig)
        self.vllm_config.speculative_config = MagicMock()
        self.vllm_config.cache_config = MagicMock(spec=CacheConfig)
        self.vllm_config.scheduler_config = MagicMock()
        self.vllm_config.model_config = MagicMock()
        self.device = torch.device("cpu")
        self.runner = MagicMock()
        self.runner.pin_memory = False

        self.vllm_config.cache_config.block_size = 16
        self.vllm_config.scheduler_config.max_num_batched_tokens = 1024
        self.vllm_config.scheduler_config.max_num_seqs = 32
        self.vllm_config.model_config.dtype = torch.float16
        self.vllm_config.model_config.max_model_len = 2048
        self.vllm_config.model_config.uses_mrope = False
        self.vllm_config.parallel_config.tensor_parallel_size = 1
        self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
        self.vllm_config.speculative_config.num_speculative_tokens = 2
        self.vllm_config.speculative_config.speculative_token_tree = str([
            (i + 1) * (0, ) for i in range(2)
        ])
        self.vllm_config.additional_config = None

        # Register the stop right after each start: cleanups run even when a
        # later line of setUp raises, whereas tearDown would be skipped and
        # the patch would leak into the following tests.
        self.mock_cpugpubuffer = patch(
            "vllm.v1.spec_decode.eagle.CpuGpuBuffer")
        self.mock_cpugpubuffer.start()
        self.addCleanup(self.mock_cpugpubuffer.stop)

        self.mock_supports_multimodal_inputs = patch(
            "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs"
        )
        self.mock_supports_multimodal_inputs.start()
        self.addCleanup(self.mock_supports_multimodal_inputs.stop)

    def tearDown(self):
        """Stop the patchers started in ``setUp``.

        The cleanups registered in ``setUp`` call ``stop()`` a second time;
        that is a no-op on an already stopped patcher (Python 3.8+).
        """
        self.mock_cpugpubuffer.stop()
        self.mock_supports_multimodal_inputs.stop()

    def _make_proposer(self):
        """Build an ``EagleProposer`` from the fixtures prepared by ``setUp``."""
        return EagleProposer(vllm_config=self.vllm_config,
                             device=self.device,
                             runner=self.runner)

    def test_initialization_eagle_graph(self):
        """``eagle`` + VLLM_COMPILE, no eager, sync scheduling: graph enabled."""
        self.vllm_config.speculative_config.method = "eagle"
        self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096
        self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
        self.vllm_config.model_config.enforce_eager = False
        self.vllm_config.model_config.uses_mrope = False
        self.vllm_config.speculative_config.enforce_eager = False
        self.vllm_config.scheduler_config.async_scheduling = False
        init_ascend_config(self.vllm_config)

        proposer = self._make_proposer()

        self.assertEqual(proposer.hidden_size, 4096)
        self.assertTrue(proposer.use_cuda_graph)

        # 1024 is the max_num_batched_tokens value configured in setUp.
        self.assertEqual(proposer.input_ids.shape, (1024, ))
        self.assertEqual(proposer.positions.shape, (1024, ))
        self.assertEqual(proposer.hidden_states.shape, (1024, 4096))
        self.assertEqual(proposer.arange.shape, (1024, ))

    def test_initialization_eagle3_enforce_eager(self):
        """``eagle3`` with compilation off and eager enforced: graph disabled."""
        self.vllm_config.speculative_config.method = "eagle3"
        self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048
        self.vllm_config.compilation_config.mode = CompilationMode.NONE
        self.vllm_config.model_config.enforce_eager = True
        init_ascend_config(self.vllm_config)

        proposer = self._make_proposer()

        self.assertEqual(proposer.hidden_size, 2048)
        self.assertFalse(proposer.use_cuda_graph)
        self.assertEqual(proposer.hidden_states.shape, (1024, 2048))

    def test_initialization_eagle3_full_graph_async(self):
        """``eagle3`` + VLLM_COMPILE with async scheduling: graph enabled."""
        self.vllm_config.speculative_config.method = "eagle3"
        self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048
        self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
        self.vllm_config.model_config.enforce_eager = False
        self.vllm_config.speculative_config.enforce_eager = False
        self.vllm_config.scheduler_config.async_scheduling = True
        init_ascend_config(self.vllm_config)

        proposer = self._make_proposer()

        self.assertEqual(proposer.hidden_size, 2048)
        self.assertTrue(proposer.use_cuda_graph)
        self.assertEqual(proposer.hidden_states.shape, (1024, 2048))

    def test_initialization_mtp_full_graph_async(self):
        """``mtp`` + VLLM_COMPILE with async scheduling: graph is expected off."""
        self.vllm_config.speculative_config.method = "mtp"
        self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048
        self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
        self.vllm_config.model_config.enforce_eager = False
        self.vllm_config.speculative_config.enforce_eager = False
        self.vllm_config.scheduler_config.async_scheduling = True
        init_ascend_config(self.vllm_config)

        proposer = self._make_proposer()

        self.assertEqual(proposer.hidden_size, 2048)
        self.assertFalse(proposer.use_cuda_graph)
        self.assertEqual(proposer.hidden_states.shape, (1024, 2048))
|
|
|
|
|
|
2025-11-27 21:59:31 +08:00
|
|
|
|
|
|
|
|
class TestEagleProposerLoadModel(TestBase):
    """Exercise ``EagleProposer.load_model`` with all collaborators mocked.

    ``get_pp_group``, ``get_model`` and ``get_layers_from_vllm_config`` are
    patched in ``vllm_ascend.spec_decode.eagle_proposer`` for every test.
    """

    def setUp(self):
        """Build a mocked config, start the patchers and create the proposer."""
        self.vllm_config = MagicMock(spec=VllmConfig)
        self.vllm_config.speculative_config = MagicMock()
        self.vllm_config.speculative_config.method = "eagle"
        self.device = torch.device("cpu")
        self.runner = MagicMock()
        self.runner.pin_memory = False

        self.vllm_config.cache_config.block_size = 16
        self.vllm_config.scheduler_config.max_num_batched_tokens = 1024
        self.vllm_config.scheduler_config.max_num_seqs = 32
        self.vllm_config.model_config.dtype = torch.float16
        self.vllm_config.model_config.max_model_len = 2048
        self.vllm_config.model_config.uses_mrope = False
        self.vllm_config.parallel_config.tensor_parallel_size = 1
        self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
        self.vllm_config.speculative_config.num_speculative_tokens = 2
        self.vllm_config.speculative_config.speculative_token_tree = str([
            (i + 1) * (0, ) for i in range(2)
        ])
        self.vllm_config.additional_config = None
        init_ascend_config(self.vllm_config)

        # Register the stop right after each start: if the EagleProposer
        # constructor below raises, tearDown is skipped but cleanups still
        # run, so the patches cannot leak into the following tests.
        self.mock_cpugpubuffer = patch(
            "vllm.v1.spec_decode.eagle.CpuGpuBuffer")
        self.mock_cpugpubuffer.start()
        self.addCleanup(self.mock_cpugpubuffer.stop)

        self.mock_supports_multimodal_inputs = patch(
            "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs"
        )
        self.mock_supports_multimodal_inputs.start()
        self.addCleanup(self.mock_supports_multimodal_inputs.stop)

        self.proposer = EagleProposer(vllm_config=self.vllm_config,
                                      device=self.device,
                                      runner=self.runner)

    def tearDown(self):
        """Stop the patchers started in ``setUp``.

        The cleanups registered in ``setUp`` call ``stop()`` a second time;
        that is a no-op on an already stopped patcher (Python 3.8+).
        """
        self.mock_cpugpubuffer.stop()
        self.mock_supports_multimodal_inputs.stop()

    @patch(
        "vllm_ascend.spec_decode.eagle_proposer.get_layers_from_vllm_config")
    @patch("vllm_ascend.spec_decode.eagle_proposer.get_model")
    @patch("vllm_ascend.spec_decode.eagle_proposer.get_pp_group")
    def test_load_model_pp1(self, mock_pp_group, mock_get_model,
                            mock_get_layers):
        """PP world size 1: the draft model ends up with the target embedding."""
        mock_pp_group.return_value.world_size = 1
        mock_target_layer1 = MagicMock()
        mock_target_layer2 = MagicMock()
        mock_draft_layer1 = MagicMock()
        mock_draft_layer3 = MagicMock()
        # Four successive results for get_layers_from_vllm_config; only
        # "layer3" appears in the last mapping without being in the first.
        mock_get_layers.side_effect = [{
            "layer1": mock_target_layer1,
            "layer2": mock_target_layer2
        }, {}, {}, {
            "layer1": mock_draft_layer1,
            "layer3": mock_draft_layer3
        }]

        weight = torch.zeros(0)

        mock_model = MagicMock()
        mock_model.supports_multimodal = False
        mock_model.lm_head = MagicMock()
        mock_model.multimodal_cpu_fields = None
        mock_model.merge_by_field_config = None
        mock_model.model.embed_tokens = MagicMock()
        mock_model.model.embed_tokens.weight = weight

        self.proposer.name = SpecDcodeType.EAGLE
        # Target and draft embeddings share the same weight tensor object.
        mock_get_model.return_value = MagicMock()
        mock_get_model.return_value.model.embed_tokens.weight = weight

        self.proposer.load_model(mock_model)

        mock_get_model.assert_called_once()
        self.assertEqual(self.proposer.attn_layer_names, ["layer3"])
        self.assertIs(self.proposer.model.model.embed_tokens,
                      mock_model.model.embed_tokens)

    @patch(
        "vllm_ascend.spec_decode.eagle_proposer.get_layers_from_vllm_config")
    @patch("vllm_ascend.spec_decode.eagle_proposer.get_model")
    @patch("vllm_ascend.spec_decode.eagle_proposer.get_pp_group")
    def test_load_model_pp_gt1(self, mock_pp_group, mock_get_model,
                               mock_get_layers):
        """PP world size 2: the draft model keeps its own ``embed_tokens``."""
        mock_pp_group.return_value.world_size = 2
        mock_target_layer1 = MagicMock()
        mock_draft_layer2 = MagicMock()

        mock_get_layers.side_effect = [{
            "layer1": mock_target_layer1
        }, {}, {}, {
            "layer2": mock_draft_layer2
        }]

        mock_model = MagicMock()
        original_embed = MagicMock()
        mock_model.multimodal_cpu_fields = None
        mock_model.merge_by_field_config = None
        mock_get_model.return_value = MagicMock(model=MagicMock(
            embed_tokens=original_embed))

        self.proposer.load_model(mock_model)

        self.assertIsNot(self.proposer.model.model.embed_tokens,
                         mock_model.model.embed_tokens)
        self.assertEqual(self.proposer.attn_layer_names, ["layer2"])

    @patch(
        "vllm_ascend.spec_decode.eagle_proposer.get_layers_from_vllm_config")
    @patch("vllm_ascend.spec_decode.eagle_proposer.get_model")
    @patch("vllm_ascend.spec_decode.eagle_proposer.get_pp_group")
    @patch("vllm_ascend.spec_decode.eagle_proposer.supports_multimodal")
    def test_load_model_multimodal(self, mock_supports_multi, mock_pp_group,
                                   mock_get_model, mock_get_layers):
        """Multimodal target: ``lm_head`` comes from ``get_language_model()``."""
        mock_model = MagicMock()
        mock_model.get_language_model.return_value.lm_head = MagicMock()
        mock_supports_multi.return_value = True
        original_embed = MagicMock()
        mock_get_model.return_value = MagicMock(model=MagicMock(
            embed_tokens=original_embed))

        mock_target_layer1 = MagicMock()
        mock_draft_layer2 = MagicMock()

        mock_get_layers.side_effect = [{
            "layer1": mock_target_layer1
        }, {}, {}, {
            "layer2": mock_draft_layer2
        }]
        mock_pp_group.return_value.world_size = 2

        self.proposer.model = MagicMock()
        self.proposer.name = SpecDcodeType.EAGLE

        self.proposer.load_model(mock_model)
        self.assertEqual(mock_model.get_language_model.call_count, 2)
        self.assertIs(self.proposer.model.lm_head,
                      mock_model.get_language_model.return_value.lm_head)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestEagleProposerDummyRun(TestBase):
|
|
|
|
|
|
|
|
|
|
def setUp(self):
    """Build a mocked config/runner, start the patchers, create the proposer.

    The proposer's ``model``, ``_runnable`` and ``update_stream`` are replaced
    with ``MagicMock`` objects so the tests can inspect calls made on them.
    """
    self.vllm_config = MagicMock(spec=VllmConfig)
    self.vllm_config.speculative_config = MagicMock()
    self.vllm_config.speculative_config.num_speculative_tokens = 4
    self.device = torch.device("cpu")
    self.runner = MagicMock()
    self.runner.pcp_size = 1
    self.runner.dcp_size = 1
    self.runner.pin_memory = False
    self.runner._sync_metadata_across_dp.return_value = (8, torch.tensor([8]), False)

    self.vllm_config.cache_config.block_size = 16
    self.vllm_config.scheduler_config.max_num_batched_tokens = 1024
    self.vllm_config.scheduler_config.max_num_seqs = 32
    self.vllm_config.model_config.dtype = torch.float16
    self.vllm_config.model_config.max_model_len = 2048
    self.vllm_config.model_config.uses_mrope = False
    self.vllm_config.model_config.use_mla = False
    self.vllm_config.parallel_config.tensor_parallel_size = 1
    self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
    self.vllm_config.speculative_config.speculative_token_tree = str([
        (i + 1) * (0, ) for i in range(4)
    ])
    self.vllm_config.additional_config = None
    init_ascend_config(self.vllm_config)

    # Register the stop right after each start: if the EagleProposer
    # constructor below raises, tearDown is skipped but cleanups still run,
    # so the patches cannot leak into the following tests. A second stop()
    # from tearDown is a no-op on Python 3.8+.
    self.mock_cpugpubuffer = patch(
        "vllm.v1.spec_decode.eagle.CpuGpuBuffer")
    self.mock_cpugpubuffer.start()
    self.addCleanup(self.mock_cpugpubuffer.stop)

    self.mock_supports_multimodal_inputs = patch(
        "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs"
    )
    self.mock_supports_multimodal_inputs.start()
    self.addCleanup(self.mock_supports_multimodal_inputs.stop)

    self.proposer = EagleProposer(vllm_config=self.vllm_config,
                                  device=self.device,
                                  runner=self.runner)
    self.proposer.model = MagicMock()
    self.proposer._runnable = MagicMock()
    self.proposer.update_stream = MagicMock()
|
2025-11-27 21:59:31 +08:00
|
|
|
|
2025-12-16 22:06:40 +08:00
|
|
|
def tearDown(self):
    """Stop both patchers that ``setUp`` started, in start order."""
    started_patchers = (
        self.mock_cpugpubuffer,
        self.mock_supports_multimodal_inputs,
    )
    for patcher in started_patchers:
        patcher.stop()
|
2025-12-16 22:06:40 +08:00
|
|
|
|
2026-01-14 09:00:37 +08:00
|
|
|
# cpu does not support parallel-group, let alone `sp`
@patch("vllm_ascend.spec_decode.eagle_proposer.get_forward_context",
       **{"return_value.sp_enabled": False})
@patch("vllm_ascend.spec_decode.eagle_proposer.set_ascend_forward_context")
def test_dummy_run_basic(self, mock_context, mock_get_context):
    """``dummy_run`` without prefill invokes ``_runnable`` exactly once."""
    num_tokens = 32
    with_prefill = False

    # cpu does not support `torch.ops.vllm.maybe_pad_and_reduce`
    self.proposer.enable_shared_expert_dp = False
    self.proposer.dummy_run(num_tokens=num_tokens,
                            with_prefill=with_prefill)

    # assertEqual reports the actual call count when the check fails,
    # unlike assertTrue(count == 1) which only says "False is not true".
    self.assertEqual(self.proposer._runnable.call_count, 1)
|
2025-11-27 21:59:31 +08:00
|
|
|
|
2026-01-14 09:00:37 +08:00
|
|
|
# cpu does not support parallel-group, let alone `sp`
@patch("vllm_ascend.spec_decode.eagle_proposer.get_forward_context",
       **{"return_value.sp_enabled": False})
@patch("vllm_ascend.spec_decode.eagle_proposer.set_ascend_forward_context")
def test_dummy_run_with_prefill(self, mock_context, mock_get_context):
    """``dummy_run`` with prefill and 4 requests invokes ``_runnable`` once."""
    # The patched context manager yields None when entered.
    mock_context.return_value.__enter__.return_value = None
    # cpu does not support `torch.ops.vllm.maybe_pad_and_reduce`
    self.proposer.enable_shared_expert_dp = False
    self.proposer.dummy_run(num_tokens=64, with_prefill=True, num_reqs=4)

    # assertEqual reports the actual call count when the check fails,
    # unlike assertTrue(count == 1) which only says "False is not true".
    self.assertEqual(self.proposer._runnable.call_count, 1)
|
2025-12-29 09:54:51 +08:00
|
|
|
|
2026-01-25 15:25:38 +08:00
|
|
|
@patch("vllm_ascend.spec_decode.eagle_proposer.update_attn_params")
@patch("vllm_ascend.spec_decode.eagle_proposer.get_forward_context")
@patch("vllm_ascend.spec_decode.eagle_proposer.set_ascend_forward_context")
def test_dummy_run_in_graph_capture(self, mock_context, mock_get_context,
                                    mock_update_attn_params):
    """Run ``dummy_run`` while the forward context reports ``capturing=True``.

    The patched ``get_forward_context`` returns a mock context in
    ``CUDAGraphMode.FULL`` with ``capturing`` set. The test expects the
    proposer's ``_runnable`` to be called once and ``update_attn_params``
    not to be called at all.

    Args:
        mock_context: Patch of ``set_ascend_forward_context``.
        mock_get_context: Patch of ``get_forward_context``.
        mock_update_attn_params: Patch of ``update_attn_params``.
    """
    last_use_cuda_graph = self.proposer.use_cuda_graph

    mock_return_context = MagicMock()
    mock_return_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
    mock_return_context.capturing = True
    # cpu does not support parallel-group, let alone `sp`
    mock_return_context.sp_enabled = False
    mock_get_context.return_value = mock_return_context

    # Restore the flag in ``finally`` so a failing assertion cannot leave
    # the proposer in graph mode.
    try:
        self.proposer.use_cuda_graph = True
        # cpu does not support `torch.ops.vllm.maybe_pad_and_reduce`
        self.proposer.enable_shared_expert_dp = False

        self.proposer.dummy_run(num_tokens=64,
                                in_graph_capturing=True,
                                aclgraph_runtime_mode=CUDAGraphMode.FULL)

        # assertEqual reports the actual call count on failure.
        self.assertEqual(self.proposer._runnable.call_count, 1)
        mock_update_attn_params.assert_not_called()
    finally:
        self.proposer.use_cuda_graph = last_use_cuda_graph
|
|
|
|
|
|
2026-01-25 15:25:38 +08:00
|
|
|
@patch("vllm_ascend.spec_decode.eagle_proposer.update_attn_params")
@patch("vllm_ascend.spec_decode.eagle_proposer.get_forward_context")
@patch("vllm_ascend.spec_decode.eagle_proposer.set_ascend_forward_context")
def test_dummy_run_in_graph_run(self, mock_context, mock_get_context,
                                mock_update_attn_params):
    """Run ``dummy_run`` while the forward context reports ``capturing=False``.

    The patched ``get_forward_context`` returns a mock context in
    ``CUDAGraphMode.FULL`` with ``capturing`` cleared. The test expects the
    proposer's ``_runnable`` and ``update_attn_params`` to each be called
    exactly once.

    Args:
        mock_context: Patch of ``set_ascend_forward_context``.
        mock_get_context: Patch of ``get_forward_context``.
        mock_update_attn_params: Patch of ``update_attn_params``.
    """
    last_use_cuda_graph = self.proposer.use_cuda_graph

    mock_return_context = MagicMock()
    mock_return_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
    mock_return_context.capturing = False
    # cpu does not support parallel-group, let alone `sp`
    mock_return_context.sp_enabled = False
    mock_get_context.return_value = mock_return_context

    # Restore the flag in ``finally`` so a failing assertion cannot leave
    # the proposer in graph mode.
    try:
        self.proposer.use_cuda_graph = True
        # cpu does not support `torch.ops.vllm.maybe_pad_and_reduce`
        self.proposer.enable_shared_expert_dp = False

        self.proposer.dummy_run(num_tokens=64,
                                in_graph_capturing=False,
                                aclgraph_runtime_mode=CUDAGraphMode.FULL)

        # assertEqual reports the actual call counts on failure.
        self.assertEqual(self.proposer._runnable.call_count, 1)
        self.assertEqual(mock_update_attn_params.call_count, 1)
    finally:
        self.proposer.use_cuda_graph = last_use_cuda_graph
|
2025-11-27 21:59:31 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestEagleProposerHelperMethods(TestBase):
|
|
|
|
|
|
2025-12-16 22:06:40 +08:00
|
|
|
# TODO: Can add some tests about prepare_next_token_ids in future.
|
|
|
|
|
|
2025-11-27 21:59:31 +08:00
|
|
|
def setUp(self):
    """Build a mocked vLLM config and runner, then construct the proposer.

    Two patchers are started here (``CpuGpuBuffer`` and
    ``MultiModalRegistry.supports_multimodal_inputs``). Each is registered
    with ``addCleanup`` immediately after ``start()`` so that it is undone
    even when a later statement in ``setUp`` raises, in which case
    ``tearDown`` is not run by unittest.
    """
    self.vllm_config = MagicMock(spec=VllmConfig)
    # NOTE: ``max_num_seqs`` is overwritten with 32 further down.
    self.vllm_config.scheduler_config = MagicMock(max_num_seqs=3)
    self.device = torch.device("cpu")

    self.runner = MagicMock()
    self.runner.input_batch = MagicMock()
    self.runner.input_batch.req_ids = [0, 1, 2]
    self.runner.arange_np = np.arange(10)
    self.runner.input_batch.num_reqs = 3
    self.runner.pin_memory = False

    self.vllm_config.cache_config.block_size = 16
    self.vllm_config.scheduler_config.max_num_batched_tokens = 1024
    self.vllm_config.scheduler_config.max_num_seqs = 32
    self.vllm_config.model_config.dtype = torch.float16
    self.vllm_config.model_config.max_model_len = 2048
    self.vllm_config.model_config.uses_mrope = False
    self.vllm_config.parallel_config.tensor_parallel_size = 1
    self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
    self.vllm_config.speculative_config.num_speculative_tokens = 2
    # Evaluates to the string "[(0,), (0, 0)]".
    self.vllm_config.speculative_config.speculative_token_tree = str([
        (i + 1) * (0, ) for i in range(2)
    ])
    self.vllm_config.additional_config = None
    init_ascend_config(self.vllm_config)

    self.mock_cpugpubuffer = patch(
        "vllm.v1.spec_decode.eagle.CpuGpuBuffer")
    self.mock_cpugpubuffer.start()
    # ``stop()`` is idempotent on Python 3.8+, so this is safe alongside
    # the explicit ``stop()`` calls in ``tearDown``.
    self.addCleanup(self.mock_cpugpubuffer.stop)

    self.mock_supports_multimodal_inputs = patch(
        "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs"
    )
    self.mock_supports_multimodal_inputs.start()
    self.addCleanup(self.mock_supports_multimodal_inputs.stop)

    self.proposer = EagleProposer(vllm_config=self.vllm_config,
                                  device=self.device,
                                  runner=self.runner)
|
|
|
|
|
|
2025-12-16 22:06:40 +08:00
|
|
|
def tearDown(self):
    """Stop the two patchers stored on the test instance, in start order."""
    started_patchers = (
        self.mock_cpugpubuffer,
        self.mock_supports_multimodal_inputs,
    )
    for patcher in started_patchers:
        patcher.stop()
|
2025-12-16 22:06:40 +08:00
|
|
|
|
|
|
|
|
# TODO: This is equivalent to disable_padded_drafter_batch=True.
|
|
|
|
|
# We need to add a test_prepare_inputs_padded in future.
|
2025-11-27 21:59:31 +08:00
|
|
|
def test_prepare_inputs(self):
    """Call ``prepare_inputs`` while it is patched and check what comes back.

    NOTE(review): ``prepare_inputs`` is itself replaced via ``patch.object``
    below, so this test only verifies that the stubbed return value is
    handed back to the caller; the real implementation is not exercised.
    """
    self.proposer.token_arange_np = np.arange(10)
    mock_attn = MagicMock()
    mock_attn.slot_mapping = torch.tensor([0, 1, 2, 3, 4, 5])
    num_rejected = torch.tensor([1, 0, 1], device=self.device)
    mock_return_attn = MagicMock()
    stubbed_result = (mock_return_attn, torch.tensor([1, 2, 4]))

    with patch.object(self.proposer,
                      'prepare_inputs',
                      return_value=stubbed_result):
        return_attn, indices = self.proposer.prepare_inputs(
            mock_attn, num_rejected)

    # Check both halves of the returned tuple; the attention metadata was
    # previously unpacked but never asserted on.
    self.assertIs(return_attn, mock_return_attn)
    self.assertEqual(indices.tolist(), [1, 2, 4])
|