[Feat] Support async_scheduler and disable_padded_drafter_batch in eagle (#4893)
### What this PR does / why we need it?
We refactored `eagle_proposer.py` to follow the framework of `eagle.py` in
vLLM v0.12.0, adding support for padded-drafter-batch logits and async
scheduling.
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
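
As a usage sketch (not part of this diff): both features are enabled together
through the `LLM` entrypoint, mirroring the updated e2e test below. The model
names here are placeholders.

```python
# Minimal sketch, assuming placeholder model names; the config keys mirror
# the e2e test added in this PR.
from vllm import LLM, SamplingParams

llm = LLM(
    model="<target-model>",  # placeholder: the verified base model
    async_scheduling=True,   # overlap CPU scheduling with device execution
    speculative_config={
        "method": "eagle3",
        "model": "<eagle3-drafter>",            # placeholder: drafter weights
        "num_speculative_tokens": 2,
        "disable_padded_drafter_batch": False,  # keep the drafter batch padded
    },
)
outputs = llm.generate(["The future of AI is"],
                       SamplingParams(temperature=0.0, max_tokens=64))
print(outputs[0].outputs[0].text)
```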
---------
Signed-off-by: anon189Ty <Stari_Falcon@outlook.com>
Co-authored-by: drslark <slarksblood@qq.com>
```diff
@@ -7,9 +7,10 @@ import random
 from typing import Any
 
 import pytest
+from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
 
-from tests.e2e.conftest import VllmRunner
+from tests.e2e.conftest import VllmRunner, cleanup_dist_env_and_memory
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
@@ -115,41 +116,67 @@ def test_eagle_correctness(
     Compare the outputs of a original LLM and a speculative LLM
     should be the same when using eagle speculative decoding.
     '''
-    pytest.skip("To be aligned with GPU")
-    ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=False)
-    ref_outputs = ref_llm.chat(test_prompts, sampling_config)
-    del ref_llm
-
+    # NOTE: e2e of eagle has many problems before.
+    # We first check whether it is functioning properly.
+    # Should fix the e2e with VllmRunner in future.
     spec_model_name = eagle3_model_name() if use_eagle3 else eagle_model_name()
-    with VllmRunner(
-            model_name,
-            max_num_seqs=1,
-            max_num_batched_tokens=2048,
-            gpu_memory_utilization=0.6,
-            speculative_config={
-                "method": "eagle3" if use_eagle3 else "eagle",
-                "model": spec_model_name,
-                "num_speculative_tokens": 2,
-                "max_model_len": 128,
-            },
-            max_model_len=128,
-            enforce_eager=False,
-    ) as runner:
-        spec_outputs = runner.model.chat(test_prompts, sampling_config)
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    prompts = [{
+        "role": "user",
+        "content": "Hello, my name is"
+    }, {
+        "role": "user",
+        "content": "The president of the United States is"
+    }, {
+        "role": "user",
+        "content": "The capital of France is"
+    }, {
+        "role": "user",
+        "content": "The future of AI is"
+    }]
+    prompts = [
+        tokenizer.apply_chat_template(
+            [prompt],
+            tokenize=False,
+            add_generation_prompt=True,
+        ) for prompt in prompts
+    ]
-
-    matches = 0
-    misses = 0
-    for ref_output, spec_output in zip(ref_outputs, spec_outputs):
-        if ref_output.outputs[0].text == spec_output.outputs[0].text:
-            matches += 1
-        else:
-            misses += 1
-            print(f"ref_output: {ref_output.outputs[0].text}")
-            print(f"spec_output: {spec_output.outputs[0].text}")
+
+    sampling_params = SamplingParams(
+        max_tokens=300,
+        temperature=0.0,
+        ignore_eos=False,
+    )
-
-    # Heuristic: expect at least 66% of the prompts to match exactly
-    # Upon failure, inspect the outputs to check for inaccuracy.
-    assert matches > int(0.66 * len(ref_outputs))
+
+    # Create an LLM.
+    llm = LLM(
+        model=model_name,
+        tensor_parallel_size=1,
+        pipeline_parallel_size=1,
+        data_parallel_size=1,
+        disable_log_stats=False,
+        max_model_len=4096,
+        seed=1024,
+        async_scheduling=True,
+        compilation_config={
+            "level": 3,
+            "cudagraph_mode": "FULL_DECODE_ONLY",
+            "cudagraph_num_of_warmups": 1,
+            "cudagraph_capture_sizes": [12],
+        },
+        speculative_config={
+            "disable_padded_drafter_batch": False,
+            "method": "eagle3" if use_eagle3 else "eagle",
+            "model": spec_model_name,
+            "num_speculative_tokens": 2,
+            "max_model_len": 128,
+            "draft_vocab_size": 128256,
+        },
+    )
+    llm.generate(prompts, sampling_params)
+    cleanup_dist_env_and_memory()
+    del llm
 
 
 @pytest.mark.skip(
```
```diff
@@ -26,6 +26,13 @@ class TestEagleProposerInitialization(TestBase):
         self.vllm_config.model_config.dtype = torch.float16
         self.vllm_config.model_config.max_model_len = 2048
 
+        self.mock_cpugpubuffer = patch(
+            "vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
+        self.mock_cpugpubuffer.start()
+
+    def tearDown(self):
+        self.mock_cpugpubuffer.stop()
+
     def test_initialization_eagle(self):
         self.vllm_config.speculative_config.method = "eagle"
         self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096
@@ -44,7 +51,7 @@ class TestEagleProposerInitialization(TestBase):
         self.assertEqual(proposer.input_ids.shape, (1024, ))
         self.assertEqual(proposer.positions.shape, (1024, ))
         self.assertEqual(proposer.hidden_states.shape, (1024, 4096))
-        self.assertEqual(proposer.arange.shape, (33, ))
+        self.assertEqual(proposer.arange.shape, (1024, ))
 
     def test_initialization_eagle3(self):
         self.vllm_config.speculative_config.method = "eagle3"
@@ -77,10 +84,16 @@ class TestEagleProposerLoadModel(TestBase):
         self.vllm_config.model_config.dtype = torch.float16
         self.vllm_config.model_config.max_model_len = 2048
 
+        self.mock_cpugpubuffer = patch(
+            "vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
+        self.mock_cpugpubuffer.start()
         self.proposer = EagleProposer(vllm_config=self.vllm_config,
                                       device=self.device,
                                       runner=self.runner)
 
+    def tearDown(self):
+        self.mock_cpugpubuffer.stop()
+
     @patch(
         "vllm_ascend.spec_decode.eagle_proposer.get_layers_from_vllm_config")
     @patch("vllm_ascend.spec_decode.eagle_proposer.get_model")
@@ -172,11 +185,17 @@ class TestEagleProposerDummyRun(TestBase):
         self.vllm_config.model_config.dtype = torch.float16
         self.vllm_config.model_config.max_model_len = 2048
 
+        self.mock_cpugpubuffer = patch(
+            "vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
+        self.mock_cpugpubuffer.start()
         self.proposer = EagleProposer(vllm_config=self.vllm_config,
                                       device=self.device,
                                       runner=self.runner)
         self.proposer.model = MagicMock()
 
+    def tearDown(self):
+        self.mock_cpugpubuffer.stop()
+
     @patch("vllm_ascend.spec_decode.eagle_proposer.set_ascend_forward_context")
     def test_dummy_run_basic(self, mock_context):
         num_tokens = 32
@@ -216,6 +235,9 @@ class TestEagleProposerGenerateTokenIds(TestBase):
         self.vllm_config.model_config.dtype = torch.float16
         self.vllm_config.model_config.max_model_len = 2048
 
+        self.mock_cpugpubuffer = patch(
+            "vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
+        self.mock_cpugpubuffer.start()
         self.proposer = EagleProposer(vllm_config=self.vllm_config,
                                       device=self.device,
                                       runner=self.runner)
@@ -223,7 +245,12 @@ class TestEagleProposerGenerateTokenIds(TestBase):
         self.proposer._propose = MagicMock(
             return_value=torch.tensor([[1, 2], [3, 4], [5, 6]]))
 
-    def test_generate_token_ids_without_metadata(self):
+    def tearDown(self):
+        self.mock_cpugpubuffer.stop()
+
+    # TODO: This is equivalent to disable_padded_drafter_batch=True.
+    # We need to add some cases about disable_padded_drafter_batch=False in future.
+    def test_generate_token_ids(self):
         valid_sampled = [[20, 30, 40]]
         scheduler_output = MagicMock()
         scheduler_output.num_scheduled_tokens = [2, 1, 3]
@@ -239,7 +266,7 @@ class TestEagleProposerGenerateTokenIds(TestBase):
             return_value={"layer_0": mock_attn_metadata})
 
         result = self.proposer.generate_token_ids(
-            valid_sampled_token_ids=valid_sampled,
+            sampled_token_ids=valid_sampled,
             scheduler_output=scheduler_output,
             positions=positions,
             num_scheduled_tokens=num_scheduled,
@@ -247,36 +274,13 @@ class TestEagleProposerGenerateTokenIds(TestBase):
         )
 
         self.proposer._propose.assert_called_once()
         self.assertEqual(result, [[1, 2], [3, 4], [5, 6]])
 
-    def test_generate_token_ids_with_metadata(self):
-        valid_sampled = [[5], [6, 7], [8, 9, 10]]
-        spec_metadata = MagicMock()
-        spec_metadata.num_draft_tokens = [2, 3, 4]
-
-        mock_attn_metadata = MagicMock()
-        mock_attn_metadata.slot_mapping = torch.tensor([0, 1, 2, 3, 4, 5])
-        mock_attn_metadata.query_start_loc = torch.tensor([0, 1, 3, 6])
-        mock_attn_metadata.block_tables = MagicMock()
-        self.proposer._get_eagle_atten_dict = MagicMock(
-            return_value={"layer_0": mock_attn_metadata})
-        self.proposer._prepare_inputs = MagicMock(
-            return_value=(torch.tensor([0, 2, 5]), torch.tensor([1, 3, 5])))
-
-        result = self.proposer.generate_token_ids(
-            valid_sampled_token_ids=valid_sampled,
-            spec_decode_metadata=spec_metadata,
-            positions=torch.randn(6, 1),
-            hidden_states=torch.randn(6, 4096),
-        )
-
-        self.proposer._prepare_inputs.assert_called_once()
-        self.assertEqual(self.proposer._propose.call_count, 1)
-        self.assertEqual(len(result), 3)
-        self.assertEqual(result.numpy().tolist(), [[1, 2], [3, 4], [5, 6]])
 
 class TestEagleProposerHelperMethods(TestBase):
 
+    # TODO: Can add some tests about prepare_next_token_ids in future.
+
     def setUp(self):
         self.vllm_config = MagicMock(spec=VllmConfig)
         self.vllm_config.scheduler_config = MagicMock(max_num_seqs=3)
@@ -293,21 +297,29 @@ class TestEagleProposerHelperMethods(TestBase):
         self.vllm_config.model_config.dtype = torch.float16
         self.vllm_config.model_config.max_model_len = 2048
 
+        self.mock_cpugpubuffer = patch(
+            "vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
+        self.mock_cpugpubuffer.start()
         self.proposer = EagleProposer(vllm_config=self.vllm_config,
                                       device=self.device,
                                       runner=self.runner)
 
+    def tearDown(self):
+        self.mock_cpugpubuffer.stop()
+
+    # TODO: This is equivalent to disable_padded_drafter_batch=True.
+    # We need to add a test_prepare_inputs_padded in future.
     def test_prepare_inputs(self):
         self.proposer.token_arange_np = np.arange(10)
         mock_attn = MagicMock()
         mock_attn.slot_mapping = torch.tensor([0, 1, 2, 3, 4, 5])
         num_rejected = torch.tensor([1, 0, 1], device=self.device)
+        mock_return_attn = MagicMock()
 
         with patch.object(self.proposer,
-                          '_prepare_inputs',
-                          return_value=(torch.tensor([0, 2, 5]),
+                          'prepare_inputs',
+                          return_value=(mock_return_attn,
                                         torch.tensor([1, 2, 4]))):
-            cu_num_tokens, indices = self.proposer._prepare_inputs(
+            return_attn, indices = self.proposer.prepare_inputs(
                 mock_attn, num_rejected)
-            self.assertEqual(cu_num_tokens.tolist(), [0, 2, 5])
             self.assertEqual(indices.tolist(), [1, 2, 4])
```