xc-llm-ascend/tests/ut/spec_decode/test_mtp_proposer.py
meihanc fea197ad50 [Main2Main] Upgrade vllm commit to 0123 (#6169)
### What this PR does / why we need it?
1. Upgrade vllm commit to: 0115
(8471b27df97c3eb79f891802fc0e858f8f7ac6a0)
Modify import paths due to the refactors:
https://github.com/vllm-project/vllm/pull/32245
https://github.com/vllm-project/vllm/pull/32060
Test result:
https://github.com/vllm-project/vllm-ascend/actions/runs/21034239336/job/60490156965?pr=5913
2. Upgrade vllm commit to: 0119
(9a1f16da1e423ede2c2f52a9850cbfbb39cefe96)
Fix `WorkerProc.__init__() missing 1 required positional argument:
'is_driver_worker'` due to
https://github.com/vllm-project/vllm/pull/28506
Test result:
https://github.com/vllm-project/vllm-ascend/actions/runs/21156263050/job/60841668755?5569
3. Upgrade vllm commit to: 0120
(148117ea2e689cd43df4be6892671a17cdae5833)
1. Add `skip_compiled` param in `set_forward_context` due to
https://github.com/vllm-project/vllm/pull/30385 (see the sketch after
this list)
2. Modify `tests/ut/spec_decode/test_eagle_proposer.py` due to
https://github.com/vllm-project/vllm/pull/24322, which changed
`self.max_num_tokens` to
`vllm_config.scheduler_config.max_num_batched_tokens + max_batch_size`
3. Modify UT import paths due to the refactors:
https://github.com/vllm-project/vllm/pull/32060
Test result:
https://github.com/vllm-project/vllm-ascend/actions/runs/21204851770/job/60999046946
4. Upgrade vllm commit to: 0121
(f23fb5a7c1b61350c5c40ca1115d3bf8cf2b8cc9)
1. vLLM moved `uses_mrope` from the target model config to the draft
model config, making `positions` and `mrope_positions` mutually
exclusive. This broke vllm-ascend's direct `self.positions` access and
tests that did not set `draft_model_config.uses_mrope` (see the sketch
after this list).
https://github.com/vllm-project/vllm/pull/32048
2. Moved `bs_to_padded_graph_size` from `CompilationConfig` to
`CudagraphDispatcher` due to the refactor in
https://github.com/vllm-project/vllm/pull/30143
3. Remove unused `maybe_setup_kv_connector` due to
https://github.com/vllm-project/vllm/pull/32077
Test result:
https://github.com/vllm-project/vllm-ascend/actions/runs/21217728738/job/61043738834
5. Upgrade vllm commit to: 0122
(8ebf271bb6d1e7e9b1a55be73d755ef1a57dbbe5)
Update `FusedMoEParallelConfig` (which now includes `enable_eplb`) and
`FusedMoEConfig` due to https://github.com/vllm-project/vllm/pull/32414
Test result:
https://github.com/vllm-project/vllm-ascend/actions/runs/21249922546/job/61148613054
6. Upgrade vllm commit to: 0123
(dc917cceb877dfd13f98c538c4c96158047d98bd)
Set `temperature=0.0` explicitly due to the removal of the default
temperature value in https://github.com/vllm-project/vllm/pull/32723
(see the sketch after this list)
Test result:
https://github.com/vllm-project/vllm-ascend/actions/runs/21280796875
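
For the `skip_compiled` change in step 3, a minimal sketch of forwarding
the new keyword; `set_ascend_forward_context_sketch` and its parameter
handling are illustrative assumptions, not vllm-ascend's actual
implementation:

```python
from contextlib import contextmanager

from vllm.forward_context import set_forward_context


@contextmanager
def set_ascend_forward_context_sketch(attn_metadata, vllm_config, **kwargs):
    # Hypothetical wrapper: pop the keyword added by vllm PR #30385 and
    # forward it, so callers can skip the compiled path explicitly.
    skip_compiled = kwargs.pop("skip_compiled", False)
    with set_forward_context(attn_metadata,
                             vllm_config,
                             skip_compiled=skip_compiled,
                             **kwargs):
        yield
```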
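
To illustrate step 4's `uses_mrope` move, a minimal sketch (assumed
names and sizes, not vllm-ascend's actual code) of the now mutually
exclusive position buffers, keyed off the draft model config:

```python
import torch


class DraftPositionBuffersSketch:
    """Only one of `positions` / `mrope_positions` exists on the proposer."""

    def __init__(self, draft_model_config, max_num_tokens: int,
                 device: torch.device):
        if draft_model_config.uses_mrope:
            # M-RoPE keeps three position components per token.
            self.mrope_positions = torch.zeros(3,
                                               max_num_tokens,
                                               dtype=torch.int64,
                                               device=device)
        else:
            self.positions = torch.zeros(max_num_tokens,
                                         dtype=torch.int64,
                                         device=device)
```

This is the shape `test_init` below asserts: with `uses_mrope=False`,
the proposer has `positions` and no `mrope_positions` attribute.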
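
And for step 6, greedy decoding must now be requested explicitly;
`SamplingParams` is vLLM's public API, while the surrounding values are
illustrative:

```python
from vllm import SamplingParams

# vllm PR #32723 removed the implicit default temperature, so greedy
# decoding has to set temperature=0.0 explicitly.
params = SamplingParams(temperature=0.0, max_tokens=64)
```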
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.14.0
- vLLM main: d68209402d

---------

Signed-off-by: wjunLu <wjunlu217@gmail.com>
Signed-off-by: Meihan-chen <jcccx.cmh@gmail.com>
Co-authored-by: wjunLu <wjunlu217@gmail.com>
2026-01-27 08:44:36 +08:00


from unittest.mock import MagicMock, patch

import numpy as np
import pytest
import torch
from vllm.config import (CacheConfig, CompilationConfig, CUDAGraphMode,
                         ModelConfig, SchedulerConfig, SpeculativeConfig,
                         VllmConfig)
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch

from vllm_ascend.ascend_config import init_ascend_config
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
from vllm_ascend.spec_decode.mtp_proposer import MtpProposer


class TestMtpProposer:

    @pytest.fixture(autouse=True)
    def patch_supports_multimodal_inputs(self):
        with patch(
                "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs"
        ):
            yield

    @pytest.fixture
    def vllm_config(self):
        config = MagicMock(spec=VllmConfig)
        config.additional_config = None
        config.speculative_config = MagicMock(spec=SpeculativeConfig)
        config.speculative_config.num_speculative_tokens = 2
        config.speculative_config.method = "mtp"
        config.speculative_config.draft_model_config = MagicMock()
        config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096
        config.speculative_config.draft_model_config.uses_mrope = False
        config.speculative_config.speculative_token_tree = str([
            (i + 1) * (0, ) for i in range(2)
        ])
        config.model_config = MagicMock(spec=ModelConfig)
        config.model_config.dtype = torch.float16
        config.model_config.max_model_len = 2048
        config.model_config.uses_mrope = False
        config.model_config.hf_text_config = None
        config.model_config.hf_config = None
        config.parallel_config.tensor_parallel_size = 1
        config.speculative_config.draft_tensor_parallel_size = 1
        config.load_config = None
        config.cache_config = MagicMock(spec=CacheConfig)
        config.cache_config.block_size = 16
        config.scheduler_config = MagicMock(spec=SchedulerConfig)
        config.scheduler_config.max_num_batched_tokens = 4096
        config.scheduler_config.max_num_seqs = 256
        config.compilation_config = MagicMock(spec=CompilationConfig)
        config.compilation_config.cudagraph_capture_sizes = [1, 2, 4, 8]
        config.compilation_config.static_forward_context = dict()
        config.device_config = MagicMock()
        config.device_config.device = torch.device("cpu")
        init_ascend_config(config)
        return config

    @pytest.fixture
    def runner(self):
        runner = MagicMock()
        runner.pcp_size = 1
        runner.dcp_size = 1
        runner.pcp_rank = 0
        runner.max_num_tokens = 4096
        runner.max_num_reqs = 256
        runner._use_aclgraph.return_value = False
        runner.reserved_mc2_mask = None
        runner.pin_memory = False
        return runner
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
def test_init(self, mock_cpu_gpu_buffer, vllm_config, runner):
mock_buffer_instance = MagicMock()
mock_cpu_gpu_buffer.return_value = mock_buffer_instance
# Test basic initialization
proposer = MtpProposer(vllm_config, torch.device("cpu"), runner)
assert proposer.vllm_config == vllm_config
assert proposer.device == torch.device("cpu")
assert proposer.dtype == torch.float16
assert proposer.num_speculative_tokens == 2
assert proposer.hidden_size == 4096
# Test with mrope enabled
assert hasattr(proposer, "positions")
assert not hasattr(proposer, "mrope_positions")
assert proposer.use_sparse is False
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
def test_init_with_aclgraph(self, mock_cpu_gpu_buffer, vllm_config,
runner):
mock_buffer_instance = MagicMock()
mock_cpu_gpu_buffer.return_value = mock_buffer_instance
runner._use_aclgraph.return_value = True
vllm_config.scheduler_config.async_scheduling = False
vllm_config.speculative_config.enforce_eager = False
proposer = MtpProposer(vllm_config, torch.device("cpu"), runner)
assert proposer.use_cuda_graph is True
@patch("vllm_ascend.spec_decode.mtp_proposer.get_forward_context")
@patch("vllm_ascend.spec_decode.mtp_proposer.set_ascend_forward_context")
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
def test_dummy_run(self, mock_cpu_gpu_buffer, mock_set_context,
mock_get_forward_context, vllm_config, runner):
mock_buffer_instance = MagicMock()
mock_cpu_gpu_buffer.return_value = mock_buffer_instance
proposer = MtpProposer(vllm_config, torch.device("cpu"), runner)
proposer.model = MagicMock()
proposer.enable_shared_expert_dp = False
runner._sync_metadata_across_dp.return_value = (8, 8, False)
mock_get_forward_context = MagicMock()
mock_get_forward_context.cudagraph_runtime_mode = None
mock_get_forward_context.capturing = True
# Execute
proposer.dummy_run(8)
# Verify
runner._sync_metadata_across_dp.assert_called_once()
mock_set_context.assert_called()
# Check that model was called correct number of times
assert proposer.model.call_count == vllm_config.speculative_config.num_speculative_tokens
@patch("vllm_ascend.spec_decode.mtp_proposer.get_forward_context")
@patch("vllm_ascend.spec_decode.mtp_proposer.set_ascend_forward_context")
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
def test_dummy_run_full_graph(self, mock_cpu_gpu_buffer, mock_set_context,
mock_get_forward_context, vllm_config,
runner):
# Setup
mock_buffer_instance = MagicMock()
mock_cpu_gpu_buffer.return_value = mock_buffer_instance
proposer = MtpProposer(vllm_config, torch.device("cpu"), runner)
proposer.enable_shared_expert_dp = False
proposer.model = MagicMock()
runner._sync_metadata_across_dp.return_value = (8, 8, False)
runner.attn_groups = []
mock_get_forward_context = MagicMock()
mock_get_forward_context.cudagraph_runtime_mode = None
mock_get_forward_context.capturing = True
# Execute
proposer.dummy_run(num_tokens=8,
num_reqs=5,
aclgraph_runtime_mode=CUDAGraphMode.FULL)
# Verify
runner._sync_metadata_across_dp.assert_called_once()
mock_set_context.assert_called()
# Check that model was called correct number of times
assert proposer.model.call_count == vllm_config.speculative_config.num_speculative_tokens
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
def test_prepare_next_token_ids_cpu(self, mock_cpu_gpu_buffer):
mock_buffer_instance = MagicMock()
mock_cpu_gpu_buffer.return_value = mock_buffer_instance
sampled_token_ids = [[10, 20, 30], [40, 50], [60]]
mock_gpu_batch = MagicMock()
mock_gpu_batch.req_ids = ["req1", "req2", "req3"]
mock_num_scheduled = {"req1": 0, "req2": 0, "req3": 0}
proposer = MagicMock(spec=MtpProposer)
proposer.input_ids = MagicMock(device=torch.device("cpu"))
proposer.prepare_next_token_ids_cpu = MtpProposer.prepare_next_token_ids_cpu.__get__(
proposer)
result = proposer.prepare_next_token_ids_cpu(
sampled_token_ids=sampled_token_ids,
requests={},
gpu_input_batch=mock_gpu_batch,
num_scheduled_tokens=mock_num_scheduled)
assert torch.all(
result == torch.tensor([30, 50, 60], dtype=torch.int32))
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
def test_prepare_next_token_ids_padded(self, mock_cpu_gpu_buffer):
mock_common_attn_metadata = MagicMock(spec=CommonAttentionMetadata)
mock_common_attn_metadata.seq_lens_cpu = torch.tensor(
[10, 8, 5, 12], dtype=torch.int32)
mock_sampled_token_ids = torch.tensor([
[101, 102, 103],
[201, -1, 203],
[-1, -1, -1],
[301, 10000, 303],
],
dtype=torch.int32,
device=torch.device("cpu"))
mock_requests = {} # dict[str, CachedRequestState]
req0 = MagicMock(spec=CachedRequestState)
req0.get_token_id = MagicMock(return_value=1000)
mock_requests["req_0"] = req0
req1 = MagicMock(spec=CachedRequestState)
req1.get_token_id = MagicMock(return_value=2000)
mock_requests["req_1"] = req1
req2 = MagicMock(spec=CachedRequestState)
req2.get_token_id = MagicMock(return_value=3000)
mock_requests["req_2"] = req2
req3 = MagicMock(spec=CachedRequestState)
req3.get_token_id = MagicMock(return_value=4000)
mock_requests["req_3"] = req3
mock_gpu_input_batch = MagicMock(spec=InputBatch)
mock_gpu_input_batch.num_reqs = 4
mock_gpu_input_batch.req_ids = ["req_0", "req_1", "req_2", "req_3"]
mock_gpu_input_batch.vocab_size = 5000
mock_backup = MagicMock()
mock_backup.np = np.array([1, 2, 3, 4, 5, 6, 7], dtype=np.int32)
mock_backup.gpu = torch.tensor([1, 2, 3, 4, 5, 6, 7],
dtype=torch.int32)
mock_backup.copy_to_gpu = MagicMock()
mock_cpu_gpu_buffer.return_value = mock_backup
proposer = MagicMock(spec=MtpProposer)
proposer.backup_next_token_ids = mock_backup
proposer.input_ids = MagicMock(device=torch.device("cpu"))
proposer.prepare_next_token_ids_padded = MtpProposer.prepare_next_token_ids_padded.__get__(
proposer)
discard_request_indices = torch.tensor([1, 3], dtype=torch.int64)
num_discarded_requests = 2
next_token_ids, valid_sampled_tokens_count = proposer.prepare_next_token_ids_padded(
common_attn_metadata=mock_common_attn_metadata,
sampled_token_ids=mock_sampled_token_ids,
requests=mock_requests,
gpu_input_batch=mock_gpu_input_batch,
discard_request_indices=discard_request_indices,
num_discarded_requests=num_discarded_requests)
mock_backup_output = proposer.backup_next_token_ids
expected_backup_cpu = np.array(
[1000, 2000, 3000, 4000, 0, 0, 0, 0, 0, 0])
assert np.array_equal(mock_backup_output.np[:4],
expected_backup_cpu[:4])
mock_backup_output.copy_to_gpu.assert_called_once_with(4)
modified_sampled = mock_sampled_token_ids.clone()
modified_sampled.index_fill_(
0, discard_request_indices[:num_discarded_requests], -1)
assert valid_sampled_tokens_count[1].item() == 0
assert valid_sampled_tokens_count[3].item() == 0
expected_valid_count = torch.tensor([3, 0, 0, 0], dtype=torch.int32)
assert torch.equal(valid_sampled_tokens_count, expected_valid_count)
expected_next_tokens = torch.tensor([103, 2, 3, 4],
dtype=torch.int32,
device=torch.device("cpu"))
assert torch.equal(next_token_ids, expected_next_tokens)
@patch("vllm_ascend.spec_decode.eagle_proposer.HAS_TRITON", False)
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
def test_prepare_inputs_padded(self, mock_cpu_gpu_buffer):
mock_buffer_instance = MagicMock()
mock_cpu_gpu_buffer.return_value = mock_buffer_instance
mock_common_attn_metadata = MagicMock(spec=CommonAttentionMetadata)
mock_common_attn_metadata.query_start_loc_cpu = torch.tensor(
[0, 8, 16, 24], dtype=torch.int32)
mock_common_attn_metadata.seq_lens_cpu = torch.tensor(
[8, 8, 8], dtype=torch.int32)
mock_common_attn_metadata.num_input_tokens = 3
mock_common_attn_metadata.query_start_loc = torch.tensor(
[0, 8, 16, 24], dtype=torch.int32)
mock_common_attn_metadata.seq_lens = torch.tensor([8, 8, 8],
dtype=torch.int32)
mock_common_attn_metadata.num_actual_tokens = 24
mock_common_attn_metadata.num_reqs = 3
mock_common_attn_metadata.num_computed_tokens_cpu = torch.tensor(
[5, 6, 7], dtype=torch.int32)
mock_common_attn_metadata.block_table_tensor = MagicMock()
mock_common_attn_metadata.slot_mapping = MagicMock()
mock_common_attn_metadata.positions = MagicMock()
mock_spec_decode_metadata = MagicMock(spec=SpecDecodeMetadata)
mock_spec_decode_metadata.cu_num_draft_tokens = torch.tensor(
[3, 5, 7], dtype=torch.int32)
mock_runner = MagicMock()
mock_runner.actual_seq_lengths_q = MagicMock()
mock_runner.attn_state = MagicMock()
mock_runner.graph_pad_size = 0
mock_runner.pcp_size = 1
mock_runner.decode_token_per_req = MagicMock()
proposer = MagicMock(spec=MtpProposer)
proposer.runner = mock_runner
proposer.pcp_size = 1
proposer.arange = torch.arange(100, dtype=torch.int32)
proposer.prepare_inputs_padded = MtpProposer.prepare_inputs_padded.__get__(
proposer)
mock_valid_sampled_tokens_count = torch.tensor([2, 1, 2],
dtype=torch.int32)
(spec_common_attn_metadata, token_indices,
token_indices_to_sample) = proposer.prepare_inputs_padded(
common_attn_metadata=mock_common_attn_metadata,
spec_decode_metadata=mock_spec_decode_metadata,
valid_sampled_tokens_count=mock_valid_sampled_tokens_count)
total_num_tokens = mock_common_attn_metadata.query_start_loc_cpu[
-1].item()
expected_token_indices = proposer.arange[:total_num_tokens]
assert torch.equal(token_indices, expected_token_indices)
assert token_indices.shape == (24, )
assert token_indices.dtype == torch.int32
expected_sample_indices = torch.tensor([5, 13, 22], dtype=torch.int32)
assert torch.equal(token_indices_to_sample, expected_sample_indices)
assert isinstance(spec_common_attn_metadata,
AscendCommonAttentionMetadata)
assert torch.equal(spec_common_attn_metadata.query_start_loc,
mock_common_attn_metadata.query_start_loc)
assert torch.equal(spec_common_attn_metadata.query_start_loc_cpu,
mock_common_attn_metadata.query_start_loc_cpu)
assert torch.equal(spec_common_attn_metadata.seq_lens_cpu,
mock_common_attn_metadata.seq_lens)
assert spec_common_attn_metadata.num_reqs == mock_common_attn_metadata.num_reqs
assert spec_common_attn_metadata.num_actual_tokens == total_num_tokens
assert spec_common_attn_metadata.max_query_len == 8
assert spec_common_attn_metadata.actual_seq_lengths_q == proposer.runner.actual_seq_lengths_q