[Refactor][EAGLE] 1/N delete __init__ in mtp_proposer (#5176)
### What this PR does / why we need it?
This PR aims to refactor eagle-related modules in vllm-ascend.
This is the starting PR of eagle refactoring. Provided with vllm-eagle,
ascend-eagle and ascend-mtp, we first let ascend-mtp inherit from
ascend-eagle and let ascend-eagle inherit from vllm-eagle. As an
initial step, we just delete `__init__` in mtp_proposer and simplify
the corresponding logic in eagle_proposer.
Based on "vllm-eagle <----- ascend-eagle <----- ascend-mtp", our target
is to gradually delete ascend-mtp and enable ascend-eagle to converge to
vllm-eagle. So the main workspace is eagle_proposer. In this way, we
hope that contributors can refactor eagle concurrently.
Incoming changes:
1. delete common methods in vllm-eagle & ascend-eagle & ascend-mtp
2. delete `load_model` in mtp_proposer
3. delete `dummy_run` and `propose` in mtp_proposer
4. ......
RFC: #5467
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
by ci
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: Zetong Li <slippersss@126.com>
This commit is contained in:
@@ -5,6 +5,7 @@ import torch
|
||||
from vllm.config import CacheConfig, CompilationMode, CUDAGraphMode, VllmConfig
|
||||
|
||||
from tests.ut.base import TestBase
|
||||
from vllm_ascend.ascend_config import init_ascend_config
|
||||
from vllm_ascend.spec_decode.eagle_proposer import EagleProposer
|
||||
from vllm_ascend.spec_decode.interface import SpecDcodeType
|
||||
|
||||
@@ -25,13 +26,24 @@ class TestEagleProposerInitialization(TestBase):
|
||||
self.vllm_config.scheduler_config.max_num_seqs = 32
|
||||
self.vllm_config.model_config.dtype = torch.float16
|
||||
self.vllm_config.model_config.max_model_len = 2048
|
||||
self.vllm_config.model_config.uses_mrope = False
|
||||
self.vllm_config.speculative_config.num_speculative_tokens = 2
|
||||
self.vllm_config.speculative_config.speculative_token_tree = str([
|
||||
(i + 1) * (0, ) for i in range(2)
|
||||
])
|
||||
self.vllm_config.additional_config = None
|
||||
|
||||
self.mock_cpugpubuffer = patch(
|
||||
"vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
|
||||
"vllm.v1.spec_decode.eagle.CpuGpuBuffer")
|
||||
self.mock_cpugpubuffer.start()
|
||||
self.mock_supports_multimodal_inputs = patch(
|
||||
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs"
|
||||
)
|
||||
self.mock_supports_multimodal_inputs.start()
|
||||
|
||||
def tearDown(self):
|
||||
self.mock_cpugpubuffer.stop()
|
||||
self.mock_supports_multimodal_inputs.stop()
|
||||
|
||||
def test_initialization_eagle_graph(self):
|
||||
self.vllm_config.speculative_config.method = "eagle"
|
||||
@@ -40,12 +52,12 @@ class TestEagleProposerInitialization(TestBase):
|
||||
self.vllm_config.model_config.enforce_eager = False
|
||||
self.vllm_config.speculative_config.enforce_eager = False
|
||||
self.vllm_config.scheduler_config.async_scheduling = False
|
||||
init_ascend_config(self.vllm_config)
|
||||
|
||||
proposer = EagleProposer(vllm_config=self.vllm_config,
|
||||
device=self.device,
|
||||
runner=self.runner)
|
||||
|
||||
self.assertEqual(proposer.name, SpecDcodeType.EAGLE)
|
||||
self.assertEqual(proposer.block_size, 16)
|
||||
self.assertEqual(proposer.hidden_size, 4096)
|
||||
self.assertTrue(proposer.use_cuda_graph)
|
||||
@@ -60,12 +72,12 @@ class TestEagleProposerInitialization(TestBase):
|
||||
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048
|
||||
self.vllm_config.compilation_config.mode = CompilationMode.NONE
|
||||
self.vllm_config.model_config.enforce_eager = True
|
||||
init_ascend_config(self.vllm_config)
|
||||
|
||||
proposer = EagleProposer(vllm_config=self.vllm_config,
|
||||
device=self.device,
|
||||
runner=self.runner)
|
||||
|
||||
self.assertEqual(proposer.name, SpecDcodeType.EAGLE3)
|
||||
self.assertEqual(proposer.hidden_size, 2048)
|
||||
self.assertFalse(proposer.use_cuda_graph)
|
||||
self.assertEqual(proposer.hidden_states.shape, (1024, 2048))
|
||||
@@ -77,12 +89,12 @@ class TestEagleProposerInitialization(TestBase):
|
||||
self.vllm_config.model_config.enforce_eager = False
|
||||
self.vllm_config.speculative_config.enforce_eager = False
|
||||
self.vllm_config.scheduler_config.async_scheduling = True
|
||||
init_ascend_config(self.vllm_config)
|
||||
|
||||
proposer = EagleProposer(vllm_config=self.vllm_config,
|
||||
device=self.device,
|
||||
runner=self.runner)
|
||||
|
||||
self.assertEqual(proposer.name, SpecDcodeType.EAGLE3)
|
||||
self.assertEqual(proposer.hidden_size, 2048)
|
||||
self.assertFalse(proposer.use_cuda_graph)
|
||||
self.assertEqual(proposer.hidden_states.shape, (1024, 2048))
|
||||
@@ -102,16 +114,28 @@ class TestEagleProposerLoadModel(TestBase):
|
||||
self.vllm_config.scheduler_config.max_num_seqs = 32
|
||||
self.vllm_config.model_config.dtype = torch.float16
|
||||
self.vllm_config.model_config.max_model_len = 2048
|
||||
self.vllm_config.model_config.uses_mrope = False
|
||||
self.vllm_config.speculative_config.num_speculative_tokens = 2
|
||||
self.vllm_config.speculative_config.speculative_token_tree = str([
|
||||
(i + 1) * (0, ) for i in range(2)
|
||||
])
|
||||
self.vllm_config.additional_config = None
|
||||
init_ascend_config(self.vllm_config)
|
||||
|
||||
self.mock_cpugpubuffer = patch(
|
||||
"vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
|
||||
"vllm.v1.spec_decode.eagle.CpuGpuBuffer")
|
||||
self.mock_cpugpubuffer.start()
|
||||
self.mock_supports_multimodal_inputs = patch(
|
||||
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs"
|
||||
)
|
||||
self.mock_supports_multimodal_inputs.start()
|
||||
self.proposer = EagleProposer(vllm_config=self.vllm_config,
|
||||
device=self.device,
|
||||
runner=self.runner)
|
||||
|
||||
def tearDown(self):
|
||||
self.mock_cpugpubuffer.stop()
|
||||
self.mock_supports_multimodal_inputs.stop()
|
||||
|
||||
@patch(
|
||||
"vllm_ascend.spec_decode.eagle_proposer.get_layers_from_vllm_config")
|
||||
@@ -204,10 +228,20 @@ class TestEagleProposerDummyRun(TestBase):
|
||||
self.vllm_config.scheduler_config.max_num_seqs = 32
|
||||
self.vllm_config.model_config.dtype = torch.float16
|
||||
self.vllm_config.model_config.max_model_len = 2048
|
||||
self.vllm_config.model_config.uses_mrope = False
|
||||
self.vllm_config.speculative_config.speculative_token_tree = str([
|
||||
(i + 1) * (0, ) for i in range(4)
|
||||
])
|
||||
self.vllm_config.additional_config = None
|
||||
init_ascend_config(self.vllm_config)
|
||||
|
||||
self.mock_cpugpubuffer = patch(
|
||||
"vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
|
||||
"vllm.v1.spec_decode.eagle.CpuGpuBuffer")
|
||||
self.mock_cpugpubuffer.start()
|
||||
self.mock_supports_multimodal_inputs = patch(
|
||||
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs"
|
||||
)
|
||||
self.mock_supports_multimodal_inputs.start()
|
||||
self.proposer = EagleProposer(vllm_config=self.vllm_config,
|
||||
device=self.device,
|
||||
runner=self.runner)
|
||||
@@ -216,6 +250,7 @@ class TestEagleProposerDummyRun(TestBase):
|
||||
|
||||
def tearDown(self):
|
||||
self.mock_cpugpubuffer.stop()
|
||||
self.mock_supports_multimodal_inputs.stop()
|
||||
|
||||
@patch("vllm_ascend.spec_decode.eagle_proposer.get_forward_context")
|
||||
@patch("vllm_ascend.spec_decode.eagle_proposer.set_ascend_forward_context")
|
||||
@@ -287,16 +322,28 @@ class TestEagleProposerGenerateTokenIds(TestBase):
|
||||
1: MagicMock(get_token_id=lambda x: 101),
|
||||
2: MagicMock(get_token_id=lambda x: 102),
|
||||
}
|
||||
self.runner.pcp_size = 1
|
||||
|
||||
self.vllm_config.cache_config.block_size = 16
|
||||
self.vllm_config.scheduler_config.max_num_batched_tokens = 1024
|
||||
self.vllm_config.scheduler_config.max_num_seqs = 32
|
||||
self.vllm_config.model_config.dtype = torch.float16
|
||||
self.vllm_config.model_config.max_model_len = 2048
|
||||
self.vllm_config.model_config.uses_mrope = False
|
||||
self.vllm_config.speculative_config.num_speculative_tokens = 2
|
||||
self.vllm_config.speculative_config.speculative_token_tree = str([
|
||||
(i + 1) * (0, ) for i in range(2)
|
||||
])
|
||||
self.vllm_config.additional_config = None
|
||||
init_ascend_config(self.vllm_config)
|
||||
|
||||
self.mock_cpugpubuffer = patch(
|
||||
"vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
|
||||
"vllm.v1.spec_decode.eagle.CpuGpuBuffer")
|
||||
self.mock_cpugpubuffer.start()
|
||||
self.mock_supports_multimodal_inputs = patch(
|
||||
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs"
|
||||
)
|
||||
self.mock_supports_multimodal_inputs.start()
|
||||
self.proposer = EagleProposer(vllm_config=self.vllm_config,
|
||||
device=self.device,
|
||||
runner=self.runner)
|
||||
@@ -306,6 +353,7 @@ class TestEagleProposerGenerateTokenIds(TestBase):
|
||||
|
||||
def tearDown(self):
|
||||
self.mock_cpugpubuffer.stop()
|
||||
self.mock_supports_multimodal_inputs.stop()
|
||||
|
||||
# TODO: This is equivalent to disable_padded_drafter_batch=True.
|
||||
# We need to add some cases about disable_padded_drafter_batch=False in future.
|
||||
@@ -355,16 +403,28 @@ class TestEagleProposerHelperMethods(TestBase):
|
||||
self.vllm_config.scheduler_config.max_num_seqs = 32
|
||||
self.vllm_config.model_config.dtype = torch.float16
|
||||
self.vllm_config.model_config.max_model_len = 2048
|
||||
self.vllm_config.model_config.uses_mrope = False
|
||||
self.vllm_config.speculative_config.num_speculative_tokens = 2
|
||||
self.vllm_config.speculative_config.speculative_token_tree = str([
|
||||
(i + 1) * (0, ) for i in range(2)
|
||||
])
|
||||
self.vllm_config.additional_config = None
|
||||
init_ascend_config(self.vllm_config)
|
||||
|
||||
self.mock_cpugpubuffer = patch(
|
||||
"vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
|
||||
"vllm.v1.spec_decode.eagle.CpuGpuBuffer")
|
||||
self.mock_cpugpubuffer.start()
|
||||
self.mock_supports_multimodal_inputs = patch(
|
||||
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs"
|
||||
)
|
||||
self.mock_supports_multimodal_inputs.start()
|
||||
self.proposer = EagleProposer(vllm_config=self.vllm_config,
|
||||
device=self.device,
|
||||
runner=self.runner)
|
||||
|
||||
def tearDown(self):
|
||||
self.mock_cpugpubuffer.stop()
|
||||
self.mock_supports_multimodal_inputs.stop()
|
||||
|
||||
# TODO: This is equivalent to disable_padded_drafter_batch=True.
|
||||
# We need to add a test_prepare_inputs_padded in future.
|
||||
|
||||
@@ -16,12 +16,18 @@ from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
|
||||
|
||||
from vllm_ascend.ascend_config import init_ascend_config
|
||||
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
|
||||
from vllm_ascend.spec_decode.interface import SpecDcodeType
|
||||
from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
|
||||
|
||||
|
||||
class TestMtpProposer:
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def patch_supports_multimodal_inputs(self):
|
||||
with patch(
|
||||
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs"
|
||||
):
|
||||
yield
|
||||
|
||||
@pytest.fixture
|
||||
def vllm_config(self):
|
||||
config = MagicMock(spec=VllmConfig)
|
||||
@@ -31,6 +37,9 @@ class TestMtpProposer:
|
||||
config.speculative_config.method = "deepseek_mtp"
|
||||
config.speculative_config.draft_model_config = MagicMock()
|
||||
config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096
|
||||
config.speculative_config.speculative_token_tree = str([
|
||||
(i + 1) * (0, ) for i in range(2)
|
||||
])
|
||||
|
||||
config.model_config = MagicMock(spec=ModelConfig)
|
||||
config.model_config.dtype = torch.float16
|
||||
@@ -68,7 +77,7 @@ class TestMtpProposer:
|
||||
runner.reserved_mc2_mask = None
|
||||
return runner
|
||||
|
||||
@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
|
||||
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
|
||||
def test_init(self, mock_cpu_gpu_buffer, vllm_config, runner):
|
||||
mock_buffer_instance = MagicMock()
|
||||
mock_cpu_gpu_buffer.return_value = mock_buffer_instance
|
||||
@@ -76,7 +85,6 @@ class TestMtpProposer:
|
||||
# Test basic initialization
|
||||
proposer = MtpProposer(vllm_config, torch.device("cpu"), runner)
|
||||
|
||||
assert proposer.name == SpecDcodeType.MTP
|
||||
assert proposer.vllm_config == vllm_config
|
||||
assert proposer.device == torch.device("cpu")
|
||||
assert proposer.dtype == torch.float16
|
||||
@@ -89,7 +97,7 @@ class TestMtpProposer:
|
||||
assert not hasattr(proposer, "mrope_positions")
|
||||
assert proposer.use_sparse is False
|
||||
|
||||
@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
|
||||
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
|
||||
def test_init_with_aclgraph(self, mock_cpu_gpu_buffer, vllm_config,
|
||||
runner):
|
||||
mock_buffer_instance = MagicMock()
|
||||
@@ -105,7 +113,7 @@ class TestMtpProposer:
|
||||
"vllm_ascend.spec_decode.mtp_proposer.process_weights_after_loading")
|
||||
@patch("vllm_ascend.spec_decode.mtp_proposer.set_default_torch_dtype")
|
||||
@patch("vllm_ascend.spec_decode.mtp_proposer.set_current_vllm_config")
|
||||
@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
|
||||
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
|
||||
def test_load_model(self, mock_cpu_gpu_buffer, mock_set_config,
|
||||
mock_set_dtype, mock_process_weights, mock_get_loader,
|
||||
mock_get_layers, vllm_config, runner):
|
||||
@@ -148,7 +156,7 @@ class TestMtpProposer:
|
||||
|
||||
@patch("vllm_ascend.spec_decode.mtp_proposer.get_forward_context")
|
||||
@patch("vllm_ascend.spec_decode.mtp_proposer.set_ascend_forward_context")
|
||||
@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
|
||||
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
|
||||
def test_dummy_run(self, mock_cpu_gpu_buffer, mock_set_context,
|
||||
mock_get_forward_context, vllm_config, runner):
|
||||
mock_buffer_instance = MagicMock()
|
||||
@@ -173,7 +181,7 @@ class TestMtpProposer:
|
||||
|
||||
@patch("vllm_ascend.spec_decode.mtp_proposer.get_forward_context")
|
||||
@patch("vllm_ascend.spec_decode.mtp_proposer.set_ascend_forward_context")
|
||||
@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
|
||||
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
|
||||
def test_dummy_run_full_graph(self, mock_cpu_gpu_buffer, mock_set_context,
|
||||
mock_get_forward_context, vllm_config,
|
||||
runner):
|
||||
@@ -201,7 +209,7 @@ class TestMtpProposer:
|
||||
# Check that model was called correct number of times
|
||||
assert proposer.model.call_count == vllm_config.speculative_config.num_speculative_tokens
|
||||
|
||||
@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
|
||||
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
|
||||
def test_generate_token_ids(self, mock_cpu_gpu_buffer):
|
||||
mock_buffer_instance = MagicMock()
|
||||
mock_cpu_gpu_buffer.return_value = mock_buffer_instance
|
||||
@@ -272,7 +280,7 @@ class TestMtpProposer:
|
||||
proposer._propose.assert_called_once()
|
||||
assert torch.equal(draft_token_ids, proposer._propose.return_value)
|
||||
|
||||
@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
|
||||
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
|
||||
def test_prepare_next_token_ids_cpu(self, mock_cpu_gpu_buffer):
|
||||
mock_buffer_instance = MagicMock()
|
||||
mock_cpu_gpu_buffer.return_value = mock_buffer_instance
|
||||
@@ -295,7 +303,7 @@ class TestMtpProposer:
|
||||
assert torch.all(
|
||||
result == torch.tensor([30, 50, 60], dtype=torch.int32))
|
||||
|
||||
@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
|
||||
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
|
||||
def test_prepare_next_token_ids_padded(self, mock_cpu_gpu_buffer):
|
||||
mock_common_attn_metadata = MagicMock(spec=CommonAttentionMetadata)
|
||||
mock_common_attn_metadata.seq_lens_cpu = torch.tensor(
|
||||
@@ -377,7 +385,7 @@ class TestMtpProposer:
|
||||
device=torch.device("cpu"))
|
||||
assert torch.equal(next_token_ids, expected_next_tokens)
|
||||
|
||||
@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
|
||||
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
|
||||
def test_prepare_inputs_padded(self, mock_cpu_gpu_buffer):
|
||||
mock_buffer_instance = MagicMock()
|
||||
mock_cpu_gpu_buffer.return_value = mock_buffer_instance
|
||||
|
||||
Reference in New Issue
Block a user