This reverts commit d886b81971. It breaks pd function. - vLLM version: v0.13.0 - vLLM main: bde38c11df Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
This commit is contained in:
@@ -23,7 +23,6 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from vllm import SamplingParams
|
from vllm import SamplingParams
|
||||||
@@ -124,11 +123,11 @@ def test_deepseek_mtp_correctness(model_name: str, num_speculative_tokens: int,
|
|||||||
@pytest.mark.parametrize("method", ["eagle", "eagle3"])
|
@pytest.mark.parametrize("method", ["eagle", "eagle3"])
|
||||||
@pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
|
@pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
|
||||||
@pytest.mark.parametrize("async_scheduling", [True, False])
|
@pytest.mark.parametrize("async_scheduling", [True, False])
|
||||||
@pytest.mark.parametrize("draft_tensor_parallel_size", [None, 1])
|
def test_llama_qwen3_eagle_correctness(model_name: str, model_name_main: str,
|
||||||
def test_llama_qwen3_eagle_correctness(
|
num_speculative_tokens: int,
|
||||||
model_name: str, model_name_main: str, num_speculative_tokens: int,
|
method: str,
|
||||||
method: str, disable_padded_drafter_batch: bool,
|
disable_padded_drafter_batch: bool,
|
||||||
async_scheduling: bool, draft_tensor_parallel_size: Union[None, int]):
|
async_scheduling: bool):
|
||||||
|
|
||||||
example_prompts = [
|
example_prompts = [
|
||||||
"Hello, my name is",
|
"Hello, my name is",
|
||||||
@@ -163,8 +162,6 @@ def test_llama_qwen3_eagle_correctness(
|
|||||||
"method": method,
|
"method": method,
|
||||||
"model": model_name,
|
"model": model_name,
|
||||||
"num_speculative_tokens": num_speculative_tokens,
|
"num_speculative_tokens": num_speculative_tokens,
|
||||||
"draft_tensor_parallel_size":
|
|
||||||
draft_tensor_parallel_size,
|
|
||||||
"max_model_len": 128,
|
"max_model_len": 128,
|
||||||
"draft_vocab_size": 128256,
|
"draft_vocab_size": 128256,
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
from typing import Any, Union
|
from typing import Any
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
@@ -217,11 +217,9 @@ def test_suffix_acceptance(
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("use_eagle3", [True], ids=["eagle3"])
|
@pytest.mark.parametrize("use_eagle3", [True], ids=["eagle3"])
|
||||||
@pytest.mark.parametrize("draft_tensor_parallel_size", [None, 1])
|
|
||||||
def test_eagle_logprobs(
|
def test_eagle_logprobs(
|
||||||
model_name: str,
|
model_name: str,
|
||||||
use_eagle3: bool,
|
use_eagle3: bool,
|
||||||
draft_tensor_parallel_size: Union[None, int],
|
|
||||||
):
|
):
|
||||||
prompt = {"role": "user", "content": "Hello world " * 10}
|
prompt = {"role": "user", "content": "Hello world " * 10}
|
||||||
sampling_params = SamplingParams(temperature=0,
|
sampling_params = SamplingParams(temperature=0,
|
||||||
@@ -248,7 +246,6 @@ def test_eagle_logprobs(
|
|||||||
"method": "eagle3" if use_eagle3 else "eagle",
|
"method": "eagle3" if use_eagle3 else "eagle",
|
||||||
"model": spec_model_name,
|
"model": spec_model_name,
|
||||||
"num_speculative_tokens": 2,
|
"num_speculative_tokens": 2,
|
||||||
"draft_tensor_parallel_size": draft_tensor_parallel_size,
|
|
||||||
"max_model_len": 128,
|
"max_model_len": 128,
|
||||||
},
|
},
|
||||||
max_model_len=128,
|
max_model_len=128,
|
||||||
@@ -274,13 +271,11 @@ def test_eagle_logprobs(
|
|||||||
|
|
||||||
@pytest.mark.parametrize("method", MODELS.keys())
|
@pytest.mark.parametrize("method", MODELS.keys())
|
||||||
@pytest.mark.parametrize("num_speculative_tokens", [3])
|
@pytest.mark.parametrize("num_speculative_tokens", [3])
|
||||||
@pytest.mark.parametrize("draft_tensor_parallel_size", [None, 1])
|
|
||||||
@pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
|
@pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
|
||||||
@pytest.mark.parametrize("async_scheduling", [True, False])
|
@pytest.mark.parametrize("async_scheduling", [True, False])
|
||||||
def test_llama_qwen_eagle_acceptance(
|
def test_llama_qwen_eagle_acceptance(
|
||||||
method: str,
|
method: str,
|
||||||
num_speculative_tokens: int,
|
num_speculative_tokens: int,
|
||||||
draft_tensor_parallel_size: Union[None, int],
|
|
||||||
disable_padded_drafter_batch: bool,
|
disable_padded_drafter_batch: bool,
|
||||||
async_scheduling: bool,
|
async_scheduling: bool,
|
||||||
):
|
):
|
||||||
@@ -331,7 +326,6 @@ def test_llama_qwen_eagle_acceptance(
|
|||||||
speculative_config = {
|
speculative_config = {
|
||||||
"method": method,
|
"method": method,
|
||||||
"num_speculative_tokens": num_speculative_tokens,
|
"num_speculative_tokens": num_speculative_tokens,
|
||||||
"draft_tensor_parallel_size": draft_tensor_parallel_size,
|
|
||||||
"disable_padded_drafter_batch": disable_padded_drafter_batch,
|
"disable_padded_drafter_batch": disable_padded_drafter_batch,
|
||||||
"model": spec_model_name,
|
"model": spec_model_name,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -27,8 +27,6 @@ class TestEagleProposerInitialization(TestBase):
|
|||||||
self.vllm_config.model_config.dtype = torch.float16
|
self.vllm_config.model_config.dtype = torch.float16
|
||||||
self.vllm_config.model_config.max_model_len = 2048
|
self.vllm_config.model_config.max_model_len = 2048
|
||||||
self.vllm_config.model_config.uses_mrope = False
|
self.vllm_config.model_config.uses_mrope = False
|
||||||
self.vllm_config.parallel_config.tensor_parallel_size = 1
|
|
||||||
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
|
|
||||||
self.vllm_config.speculative_config.num_speculative_tokens = 2
|
self.vllm_config.speculative_config.num_speculative_tokens = 2
|
||||||
self.vllm_config.speculative_config.speculative_token_tree = str([
|
self.vllm_config.speculative_config.speculative_token_tree = str([
|
||||||
(i + 1) * (0, ) for i in range(2)
|
(i + 1) * (0, ) for i in range(2)
|
||||||
@@ -116,8 +114,6 @@ class TestEagleProposerLoadModel(TestBase):
|
|||||||
self.vllm_config.model_config.dtype = torch.float16
|
self.vllm_config.model_config.dtype = torch.float16
|
||||||
self.vllm_config.model_config.max_model_len = 2048
|
self.vllm_config.model_config.max_model_len = 2048
|
||||||
self.vllm_config.model_config.uses_mrope = False
|
self.vllm_config.model_config.uses_mrope = False
|
||||||
self.vllm_config.parallel_config.tensor_parallel_size = 1
|
|
||||||
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
|
|
||||||
self.vllm_config.speculative_config.num_speculative_tokens = 2
|
self.vllm_config.speculative_config.num_speculative_tokens = 2
|
||||||
self.vllm_config.speculative_config.speculative_token_tree = str([
|
self.vllm_config.speculative_config.speculative_token_tree = str([
|
||||||
(i + 1) * (0, ) for i in range(2)
|
(i + 1) * (0, ) for i in range(2)
|
||||||
@@ -250,8 +246,6 @@ class TestEagleProposerDummyRun(TestBase):
|
|||||||
self.vllm_config.model_config.dtype = torch.float16
|
self.vllm_config.model_config.dtype = torch.float16
|
||||||
self.vllm_config.model_config.max_model_len = 2048
|
self.vllm_config.model_config.max_model_len = 2048
|
||||||
self.vllm_config.model_config.uses_mrope = False
|
self.vllm_config.model_config.uses_mrope = False
|
||||||
self.vllm_config.parallel_config.tensor_parallel_size = 1
|
|
||||||
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
|
|
||||||
self.vllm_config.speculative_config.speculative_token_tree = str([
|
self.vllm_config.speculative_config.speculative_token_tree = str([
|
||||||
(i + 1) * (0, ) for i in range(4)
|
(i + 1) * (0, ) for i in range(4)
|
||||||
])
|
])
|
||||||
@@ -366,8 +360,6 @@ class TestEagleProposerHelperMethods(TestBase):
|
|||||||
self.vllm_config.model_config.dtype = torch.float16
|
self.vllm_config.model_config.dtype = torch.float16
|
||||||
self.vllm_config.model_config.max_model_len = 2048
|
self.vllm_config.model_config.max_model_len = 2048
|
||||||
self.vllm_config.model_config.uses_mrope = False
|
self.vllm_config.model_config.uses_mrope = False
|
||||||
self.vllm_config.parallel_config.tensor_parallel_size = 1
|
|
||||||
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
|
|
||||||
self.vllm_config.speculative_config.num_speculative_tokens = 2
|
self.vllm_config.speculative_config.num_speculative_tokens = 2
|
||||||
self.vllm_config.speculative_config.speculative_token_tree = str([
|
self.vllm_config.speculative_config.speculative_token_tree = str([
|
||||||
(i + 1) * (0, ) for i in range(2)
|
(i + 1) * (0, ) for i in range(2)
|
||||||
|
|||||||
@@ -42,9 +42,6 @@ class TestMtpProposer:
|
|||||||
config.model_config.max_model_len = 2048
|
config.model_config.max_model_len = 2048
|
||||||
config.model_config.uses_mrope = False
|
config.model_config.uses_mrope = False
|
||||||
config.model_config.hf_text_config = None
|
config.model_config.hf_text_config = None
|
||||||
config.model_config.hf_config = None
|
|
||||||
config.parallel_config.tensor_parallel_size = 1
|
|
||||||
config.speculative_config.draft_tensor_parallel_size = 1
|
|
||||||
|
|
||||||
config.load_config = None
|
config.load_config = None
|
||||||
|
|
||||||
|
|||||||
@@ -130,27 +130,6 @@ class EagleProposer(VllmEagleProposer):
|
|||||||
|
|
||||||
self.use_sparse = hasattr(vllm_config.model_config.hf_text_config,
|
self.use_sparse = hasattr(vllm_config.model_config.hf_text_config,
|
||||||
"index_topk")
|
"index_topk")
|
||||||
# NOTE:
|
|
||||||
# `draft_tensor_parallel_size` does not take effect for Eagle:
|
|
||||||
# the draft model uses the same TP size as the target model in practice.
|
|
||||||
# so we applied this patch to set tp=1 of draft model separately.
|
|
||||||
# Due to verification of `_verify_and_get_draft_tp` in vllm,
|
|
||||||
# the value of `draft_tensor_parallel_size` here will either be 1 separately
|
|
||||||
# or the same as target model.
|
|
||||||
# TODO(zhaomingyu13): If we want to adapt to the case where draft model tp
|
|
||||||
# is not 1 and differs from target model, this part should be rewritten.
|
|
||||||
if (vllm_config.parallel_config.tensor_parallel_size
|
|
||||||
!= self.speculative_config.draft_tensor_parallel_size):
|
|
||||||
tp_group = init_model_parallel_group(
|
|
||||||
[[get_world_group().rank]],
|
|
||||||
get_world_group().rank,
|
|
||||||
torch.distributed.get_backend(get_world_group().device_group),
|
|
||||||
use_message_queue_broadcaster=True,
|
|
||||||
group_name="tp",
|
|
||||||
)
|
|
||||||
self.tp_group_context = patch_tensor_parallel_group(tp_group)
|
|
||||||
else:
|
|
||||||
self.tp_group_context = nullcontext()
|
|
||||||
|
|
||||||
# TODO: Remove it when the bug of fx-graph is solved
|
# TODO: Remove it when the bug of fx-graph is solved
|
||||||
self.maybe_eager_context: ContextManager[Any] = nullcontext()
|
self.maybe_eager_context: ContextManager[Any] = nullcontext()
|
||||||
|
|||||||
@@ -165,10 +165,6 @@ def graph_capture(device: torch.device):
|
|||||||
yield graph_capture_context
|
yield graph_capture_context
|
||||||
|
|
||||||
|
|
||||||
def get_tp_context(drafter):
|
|
||||||
return getattr(drafter, "tp_group_context", nullcontext())
|
|
||||||
|
|
||||||
|
|
||||||
class ExecuteModelState(NamedTuple):
|
class ExecuteModelState(NamedTuple):
|
||||||
"""Ephemeral cached state transferred between execute_model() and
|
"""Ephemeral cached state transferred between execute_model() and
|
||||||
sample_tokens(), after execute_model() returns None."""
|
sample_tokens(), after execute_model() returns None."""
|
||||||
@@ -2326,8 +2322,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
model_register(self.model, self.model_config)
|
model_register(self.model, self.model_config)
|
||||||
if self.drafter:
|
if self.drafter:
|
||||||
logger.info("Loading drafter model...")
|
logger.info("Loading drafter model...")
|
||||||
with get_tp_context(self.drafter):
|
self.drafter.load_model(self.model)
|
||||||
self.drafter.load_model(self.model)
|
|
||||||
if self.use_aux_hidden_state_outputs:
|
if self.use_aux_hidden_state_outputs:
|
||||||
self.model.set_aux_hidden_state_layers(
|
self.model.set_aux_hidden_state_layers(
|
||||||
self.model.get_eagle3_aux_hidden_state_layers())
|
self.model.get_eagle3_aux_hidden_state_layers())
|
||||||
@@ -2703,15 +2698,11 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
kernel_block_sizes = []
|
kernel_block_sizes = []
|
||||||
for kv_cache_group_id, kv_cache_group in enumerate(
|
for kv_cache_group_id, kv_cache_group in enumerate(
|
||||||
kv_cache_config.kv_cache_groups):
|
kv_cache_config.kv_cache_groups):
|
||||||
kv_cache_spec = kv_cache_group.kv_cache_spec
|
|
||||||
if isinstance(kv_cache_spec, UniformTypeKVCacheSpecs):
|
if isinstance(kv_cache_group.kv_cache_spec,
|
||||||
# All layers in the UniformTypeKVCacheSpecs have the same type,
|
EncoderOnlyAttentionSpec):
|
||||||
# Pick an arbitrary one to dispatch.
|
|
||||||
kv_cache_spec = next(
|
|
||||||
iter(kv_cache_spec.kv_cache_specs.values()))
|
|
||||||
if isinstance(kv_cache_spec, EncoderOnlyAttentionSpec):
|
|
||||||
continue
|
continue
|
||||||
elif isinstance(kv_cache_spec, AttentionSpec):
|
elif isinstance(kv_cache_group.kv_cache_spec, AttentionSpec):
|
||||||
# This is an attention backend that supports virtual
|
# This is an attention backend that supports virtual
|
||||||
# block splitting. Get the supported block sizes from
|
# block splitting. Get the supported block sizes from
|
||||||
# the backend.
|
# the backend.
|
||||||
|
|||||||
Reference in New Issue
Block a user