This reverts commit d886b81971. It breaks the pd function. - vLLM version: v0.13.0 - vLLM main: bde38c11df. Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
This commit is contained in:
@@ -23,7 +23,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
@@ -124,11 +123,11 @@ def test_deepseek_mtp_correctness(model_name: str, num_speculative_tokens: int,
|
||||
@pytest.mark.parametrize("method", ["eagle", "eagle3"])
|
||||
@pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
|
||||
@pytest.mark.parametrize("async_scheduling", [True, False])
|
||||
@pytest.mark.parametrize("draft_tensor_parallel_size", [None, 1])
|
||||
def test_llama_qwen3_eagle_correctness(
|
||||
model_name: str, model_name_main: str, num_speculative_tokens: int,
|
||||
method: str, disable_padded_drafter_batch: bool,
|
||||
async_scheduling: bool, draft_tensor_parallel_size: Union[None, int]):
|
||||
def test_llama_qwen3_eagle_correctness(model_name: str, model_name_main: str,
|
||||
num_speculative_tokens: int,
|
||||
method: str,
|
||||
disable_padded_drafter_batch: bool,
|
||||
async_scheduling: bool):
|
||||
|
||||
example_prompts = [
|
||||
"Hello, my name is",
|
||||
@@ -163,8 +162,6 @@ def test_llama_qwen3_eagle_correctness(
|
||||
"method": method,
|
||||
"model": model_name,
|
||||
"num_speculative_tokens": num_speculative_tokens,
|
||||
"draft_tensor_parallel_size":
|
||||
draft_tensor_parallel_size,
|
||||
"max_model_len": 128,
|
||||
"draft_vocab_size": 128256,
|
||||
},
|
||||
|
||||
@@ -4,7 +4,7 @@ from __future__ import annotations
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
from typing import Any, Union
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
@@ -217,11 +217,9 @@ def test_suffix_acceptance(
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_eagle3", [True], ids=["eagle3"])
|
||||
@pytest.mark.parametrize("draft_tensor_parallel_size", [None, 1])
|
||||
def test_eagle_logprobs(
|
||||
model_name: str,
|
||||
use_eagle3: bool,
|
||||
draft_tensor_parallel_size: Union[None, int],
|
||||
):
|
||||
prompt = {"role": "user", "content": "Hello world " * 10}
|
||||
sampling_params = SamplingParams(temperature=0,
|
||||
@@ -248,7 +246,6 @@ def test_eagle_logprobs(
|
||||
"method": "eagle3" if use_eagle3 else "eagle",
|
||||
"model": spec_model_name,
|
||||
"num_speculative_tokens": 2,
|
||||
"draft_tensor_parallel_size": draft_tensor_parallel_size,
|
||||
"max_model_len": 128,
|
||||
},
|
||||
max_model_len=128,
|
||||
@@ -274,13 +271,11 @@ def test_eagle_logprobs(
|
||||
|
||||
@pytest.mark.parametrize("method", MODELS.keys())
|
||||
@pytest.mark.parametrize("num_speculative_tokens", [3])
|
||||
@pytest.mark.parametrize("draft_tensor_parallel_size", [None, 1])
|
||||
@pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
|
||||
@pytest.mark.parametrize("async_scheduling", [True, False])
|
||||
def test_llama_qwen_eagle_acceptance(
|
||||
method: str,
|
||||
num_speculative_tokens: int,
|
||||
draft_tensor_parallel_size: Union[None, int],
|
||||
disable_padded_drafter_batch: bool,
|
||||
async_scheduling: bool,
|
||||
):
|
||||
@@ -331,7 +326,6 @@ def test_llama_qwen_eagle_acceptance(
|
||||
speculative_config = {
|
||||
"method": method,
|
||||
"num_speculative_tokens": num_speculative_tokens,
|
||||
"draft_tensor_parallel_size": draft_tensor_parallel_size,
|
||||
"disable_padded_drafter_batch": disable_padded_drafter_batch,
|
||||
"model": spec_model_name,
|
||||
}
|
||||
|
||||
@@ -27,8 +27,6 @@ class TestEagleProposerInitialization(TestBase):
|
||||
self.vllm_config.model_config.dtype = torch.float16
|
||||
self.vllm_config.model_config.max_model_len = 2048
|
||||
self.vllm_config.model_config.uses_mrope = False
|
||||
self.vllm_config.parallel_config.tensor_parallel_size = 1
|
||||
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
|
||||
self.vllm_config.speculative_config.num_speculative_tokens = 2
|
||||
self.vllm_config.speculative_config.speculative_token_tree = str([
|
||||
(i + 1) * (0, ) for i in range(2)
|
||||
@@ -116,8 +114,6 @@ class TestEagleProposerLoadModel(TestBase):
|
||||
self.vllm_config.model_config.dtype = torch.float16
|
||||
self.vllm_config.model_config.max_model_len = 2048
|
||||
self.vllm_config.model_config.uses_mrope = False
|
||||
self.vllm_config.parallel_config.tensor_parallel_size = 1
|
||||
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
|
||||
self.vllm_config.speculative_config.num_speculative_tokens = 2
|
||||
self.vllm_config.speculative_config.speculative_token_tree = str([
|
||||
(i + 1) * (0, ) for i in range(2)
|
||||
@@ -250,8 +246,6 @@ class TestEagleProposerDummyRun(TestBase):
|
||||
self.vllm_config.model_config.dtype = torch.float16
|
||||
self.vllm_config.model_config.max_model_len = 2048
|
||||
self.vllm_config.model_config.uses_mrope = False
|
||||
self.vllm_config.parallel_config.tensor_parallel_size = 1
|
||||
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
|
||||
self.vllm_config.speculative_config.speculative_token_tree = str([
|
||||
(i + 1) * (0, ) for i in range(4)
|
||||
])
|
||||
@@ -366,8 +360,6 @@ class TestEagleProposerHelperMethods(TestBase):
|
||||
self.vllm_config.model_config.dtype = torch.float16
|
||||
self.vllm_config.model_config.max_model_len = 2048
|
||||
self.vllm_config.model_config.uses_mrope = False
|
||||
self.vllm_config.parallel_config.tensor_parallel_size = 1
|
||||
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
|
||||
self.vllm_config.speculative_config.num_speculative_tokens = 2
|
||||
self.vllm_config.speculative_config.speculative_token_tree = str([
|
||||
(i + 1) * (0, ) for i in range(2)
|
||||
|
||||
@@ -42,9 +42,6 @@ class TestMtpProposer:
|
||||
config.model_config.max_model_len = 2048
|
||||
config.model_config.uses_mrope = False
|
||||
config.model_config.hf_text_config = None
|
||||
config.model_config.hf_config = None
|
||||
config.parallel_config.tensor_parallel_size = 1
|
||||
config.speculative_config.draft_tensor_parallel_size = 1
|
||||
|
||||
config.load_config = None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user