[BugFix] Support setting tp=1 for the Eagle draft model to take effect (#5519)
### What this PR does / why we need it?
According to the official documentation, the parameter
`"draft_tensor_parallel_size": 1` is supposed to apply to the Eagle3
draft model. However, debugging showed that the tensor parallel size
(tp) of the Eagle model was the same as that of the target model, so
the tp setting for the draft model did not take effect as expected.
**Note:** This feature has not yet been combined and tested with `sp`
and `dp`. Support for those will be adapted later.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
```python
from vllm import LLM, SamplingParams
def main():
prompts = [
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(
model="meta-llama/Llama-3.1-8B-Instruct",
tensor_parallel_size=4,
gpu_memory_utilization=0.9,
enforce_eager=True,
speculative_config={
"method": "eagle3",
"model": "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
"draft_tensor_parallel_size": 1,
"num_speculative_tokens": 3,
},
)
# Generate texts from the prompts.
outputs = llm.generate(prompts, sampling_params)
print(f"Outputs: {outputs}")
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.13.0
- vLLM main:
45c1ca1ca1
Fixes vllm-project/vllm#31345
Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
Co-authored-by: drslark <slarksblood@qq.com>
This commit is contained in:
@@ -27,6 +27,8 @@ class TestEagleProposerInitialization(TestBase):
|
||||
self.vllm_config.model_config.dtype = torch.float16
|
||||
self.vllm_config.model_config.max_model_len = 2048
|
||||
self.vllm_config.model_config.uses_mrope = False
|
||||
self.vllm_config.parallel_config.tensor_parallel_size = 1
|
||||
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
|
||||
self.vllm_config.speculative_config.num_speculative_tokens = 2
|
||||
self.vllm_config.speculative_config.speculative_token_tree = str([
|
||||
(i + 1) * (0, ) for i in range(2)
|
||||
@@ -114,6 +116,8 @@ class TestEagleProposerLoadModel(TestBase):
|
||||
self.vllm_config.model_config.dtype = torch.float16
|
||||
self.vllm_config.model_config.max_model_len = 2048
|
||||
self.vllm_config.model_config.uses_mrope = False
|
||||
self.vllm_config.parallel_config.tensor_parallel_size = 1
|
||||
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
|
||||
self.vllm_config.speculative_config.num_speculative_tokens = 2
|
||||
self.vllm_config.speculative_config.speculative_token_tree = str([
|
||||
(i + 1) * (0, ) for i in range(2)
|
||||
@@ -246,6 +250,8 @@ class TestEagleProposerDummyRun(TestBase):
|
||||
self.vllm_config.model_config.dtype = torch.float16
|
||||
self.vllm_config.model_config.max_model_len = 2048
|
||||
self.vllm_config.model_config.uses_mrope = False
|
||||
self.vllm_config.parallel_config.tensor_parallel_size = 1
|
||||
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
|
||||
self.vllm_config.speculative_config.speculative_token_tree = str([
|
||||
(i + 1) * (0, ) for i in range(4)
|
||||
])
|
||||
@@ -352,6 +358,8 @@ class TestEagleProposerHelperMethods(TestBase):
|
||||
self.vllm_config.model_config.dtype = torch.float16
|
||||
self.vllm_config.model_config.max_model_len = 2048
|
||||
self.vllm_config.model_config.uses_mrope = False
|
||||
self.vllm_config.parallel_config.tensor_parallel_size = 1
|
||||
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
|
||||
self.vllm_config.speculative_config.num_speculative_tokens = 2
|
||||
self.vllm_config.speculative_config.speculative_token_tree = str([
|
||||
(i + 1) * (0, ) for i in range(2)
|
||||
|
||||
@@ -42,6 +42,9 @@ class TestMtpProposer:
|
||||
config.model_config.max_model_len = 2048
|
||||
config.model_config.uses_mrope = False
|
||||
config.model_config.hf_text_config = None
|
||||
config.model_config.hf_config = None
|
||||
config.parallel_config.tensor_parallel_size = 1
|
||||
config.speculative_config.draft_tensor_parallel_size = 1
|
||||
|
||||
config.load_config = None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user