xc-llm-ascend/tests/singlecard/spec_decode/test_dynamic_spec_decode.py

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm-project/vllm/tests/spec_decode/test_dynamic_spec_decode.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from unittest.mock import MagicMock, patch

import pytest
import torch
from vllm.sequence import ExecuteModelRequest
from vllm.spec_decode.metrics import AsyncMetricsCollector
from vllm.spec_decode.multi_step_worker import MultiStepWorker
from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker
from vllm.spec_decode.top1_proposer import Top1Proposer

from tests.singlecard.spec_decode.test_utils import mock_spec_decode_sampler
from tests.singlecard.spec_decode.utils import create_batch, mock_worker


@pytest.mark.parametrize('queue_size', [4])
@pytest.mark.parametrize('batch_size', [1])
@pytest.mark.parametrize('k', [1])
@pytest.mark.parametrize("acceptance_sampler_method",
                         ["rejection_sampler", "typical_acceptance_sampler"])
@torch.inference_mode()
def test_disable_spec_tokens(queue_size: int, batch_size: int, k: int,
                             acceptance_sampler_method: str):
    """Verify that speculative tokens are disabled once the running queue
    size reaches the ``disable_by_batch_size`` threshold.

    The test runs in two phases:

    1. When speculation is expected to be disabled, call
       ``SpecDecodeWorker.execute_model`` with ``_run_no_spec`` patched to
       raise, then check ``num_speculative_tokens`` on the first sequence
       group's metadata.
    2. Pass the same metadata to a ``Top1Proposer`` whose draft worker
       raises from ``sampler_output``. With speculation enabled the error
       must surface; with speculation disabled every proposal length must
       be 0.

    Args:
        queue_size: Value passed as ``running_queue_size`` of the request.
        batch_size: Forwarded to ``create_batch``; also the number of
            proposal lengths expected back from the proposer.
        k: Forwarded to ``create_batch`` and used as
            ``num_lookahead_slots``.
        acceptance_sampler_method: Name forwarded to
            ``mock_spec_decode_sampler``.
    """
    disable_by_batch_size = 3
    # Single flag shared by every phase below. Equality counts as
    # "disabled", matching the expected-value and proposer checks; the
    # execute_model guard previously used a strict ">" and so skipped the
    # boundary case while still expecting 0 speculative tokens.
    # NOTE(review): assumes SpecDecodeWorker disables speculation when
    # running_queue_size >= disable_by_batch_size -- confirm against the
    # installed vLLM version.
    spec_disabled = queue_size >= disable_by_batch_size

    draft_worker = mock_worker(cls=MultiStepWorker)
    target_worker = mock_worker()
    metrics_collector = MagicMock(spec=AsyncMetricsCollector)
    worker = SpecDecodeWorker(proposer_worker=draft_worker,
                              scorer_worker=target_worker,
                              spec_decode_sampler=mock_spec_decode_sampler(
                                  acceptance_sampler_method),
                              disable_logprobs=False,
                              metrics_collector=metrics_collector,
                              disable_by_batch_size=disable_by_batch_size)

    # Sentinel message used to recognise the errors injected by this test.
    exception_secret = 'artificial stop'
    draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret)

    seq_group_metadata_list, _, _ = create_batch(batch_size, k)
    execute_model_req = ExecuteModelRequest(
        seq_group_metadata_list=seq_group_metadata_list,
        num_lookahead_slots=k,
        running_queue_size=queue_size)

    if spec_disabled:
        # Stop execution as soon as the no-speculation path is entered; the
        # injected error proves that path was taken.
        with patch.object(worker,
                          '_run_no_spec',
                          side_effect=ValueError(exception_secret)):
            with pytest.raises(ValueError, match=exception_secret):
                worker.execute_model(execute_model_req=execute_model_req)

    # When speculation is disabled we expect no speculative tokens (0);
    # otherwise the worker was never driven and the field is still None.
    expected_num_spec_tokens = 0 if spec_disabled else None
    assert seq_group_metadata_list[
        0].num_speculative_tokens == expected_num_spec_tokens

    draft_worker.sampler_output.side_effect = ValueError(exception_secret)

    proposer = Top1Proposer(
        worker=draft_worker,
        device='cpu',  # not used
        vocab_size=100,  # not used
        # Must be long enough to avoid being skipped due to length.
        max_proposal_len=1024,
    )

    # Both outcomes below issue the same proposal request.
    proposal_req = ExecuteModelRequest(
        seq_group_metadata_list=seq_group_metadata_list,
        num_lookahead_slots=k)

    if not spec_disabled:
        # Should raise exception when executing the mocked draft model.
        with pytest.raises(ValueError, match=exception_secret):
            proposer.get_spec_proposals(
                execute_model_req=proposal_req,
                seq_ids_with_bonus_token_in_last_step=set())
    else:
        # Should not execute the draft model because spec decode is disabled
        # for all requests. Accordingly, the proposal length should be 0.
        proposals = proposer.get_spec_proposals(
            execute_model_req=proposal_req,
            seq_ids_with_bonus_token_in_last_step=set())
        assert proposals.proposal_lens.tolist() == [0] * batch_size
[SpecDecode] Add spec decode support (#500) ### What this PR does / why we need it? Backport: https://github.com/vllm-project/vllm-ascend/pull/252 This supports speculative decoding on Ascend, including speculating with a draft model, by matching n-grams in the prompt, using MLP speculators, and using EAGLE-based draft models. Backport: https://github.com/vllm-project/vllm-ascend/pull/423 The spec decode MultiStepWorker now fully supports TP1DraftModelRunner, supports running the draft_model_runner with multi-step prepare directly on the NPU, and supports MLA in the draft_model_runner. 1. Before this PR, `MultiStepWorker` would not step into the branch using NPU prepare, but only into the branch using CPU prepare (`line 52` of `vllm_ascend/patch/patch_multi_step_worker.py`). Although this has `no effect` on the `correct operation` of speculative decoding and the performance of the two branches is basically the same as of the current version, I support entering this branch in this PR. In general, there are two main changes in `patch_multi_step_worker.py`: first, the `is_cuda_like()` check is removed and the `TP1DraftModelRunner` rewritten in vllm_ascend is used; second, the `supports_gpu_multi_step()` function is made to return true on NPU devices when the outer Multi_step_worker can work correctly. 3. Before this PR, `TP1DraftModelRunner` only supported Attention on NPU, but not MLA. The relevant adaptation is in `vllm_ascend/worker/draft_model_runner.py`. Although I don't know why the `input_positions` of `model_input.attn_metadata` in vllm-ascend needs to be added in `execute_model`, it is done in `model_runner.py`, so I also made corresponding changes. Otherwise, when atten_backend is MLA, it reports that input_positions cannot be found. 4. I commented out two lines in `draft_model_runner.py` at `line118` to support the scenario of K>1. ``` # lora_mapping=model_input.lora_mapping, # lora_requests=model_input.lora_requests, ``` I added comments. 
In the future, when vllm-ascend supports lora feature, the changes here can be restored. TODO： - [ ] revert the patch when the related issues are addressed in vllm ### How was this patch tested? CI passed with new added test. - e2e test for medusa proposer: tests/singlecard/spec_decode/e2e/test_medusa_correctness.py - e2e test for mlp proposer: tests/singlecard/spec_decode/e2e/test_mlp_correctness.py - e2e test for n-gram proposer: tests/singlecard/spec_decode/e2e/test_ngram_correctness.py Tests for patched files: - tests/singlecard/spec_decode/test_dynamic_spec_decode.py - tests/singlecard/spec_decode/test_multi_step_worker.py - tests/singlecard/spec_decode/test_ngram_worker.py - tests/singlecard/spec_decode/test_spec_decode_worker.py --------- Signed-off-by: MengqingCao <cmq0113@163.com> Co-authored-by: mengwei805 <mengwei25@huawei.com> 2025-04-17 20:16:32 +08:00			`#`
			`# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.`
			`# This file is a part of the vllm-ascend project.`
			`# Adapted from vllm-project/vllm/tests/spec_decode/test_dynamic_spec_decode.py`
			`# Copyright 2023 The vLLM team.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

			`from unittest.mock import MagicMock, patch`

			`import pytest`
			`import torch`
			`from vllm.sequence import ExecuteModelRequest`
			`from vllm.spec_decode.metrics import AsyncMetricsCollector`
			`from vllm.spec_decode.multi_step_worker import MultiStepWorker`
			`from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker`
			`from vllm.spec_decode.top1_proposer import Top1Proposer`

			`from tests.singlecard.spec_decode.test_utils import mock_spec_decode_sampler`
			`from tests.singlecard.spec_decode.utils import create_batch, mock_worker`


			`@pytest.mark.parametrize('queue_size', [4])`
			`@pytest.mark.parametrize('batch_size', [1])`
			`@pytest.mark.parametrize('k', [1])`
			`@pytest.mark.parametrize("acceptance_sampler_method",`
			`["rejection_sampler", "typical_acceptance_sampler"])`
			`@torch.inference_mode()`
			`def test_disable_spec_tokens(queue_size: int, batch_size: int, k: int,`
			`acceptance_sampler_method: str):`
			`"""Verify that speculative tokens are disabled when the batch size`
			`exceeds the threshold.`
			`"""`
			`disable_by_batch_size = 3`
			`draft_worker = mock_worker(cls=MultiStepWorker)`
			`target_worker = mock_worker()`
			`metrics_collector = MagicMock(spec=AsyncMetricsCollector)`
			`worker = SpecDecodeWorker(proposer_worker=draft_worker,`
			`scorer_worker=target_worker,`
			`spec_decode_sampler=mock_spec_decode_sampler(`
			`acceptance_sampler_method),`
			`disable_logprobs=False,`
			`metrics_collector=metrics_collector,`
			`disable_by_batch_size=disable_by_batch_size)`

			`exception_secret = 'artificial stop'`
			`draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret)`

			`seq_group_metadata_list, _, _ = create_batch(batch_size, k)`
			`execute_model_req = ExecuteModelRequest(`
			`seq_group_metadata_list=seq_group_metadata_list,`
			`num_lookahead_slots=k,`
			`running_queue_size=queue_size)`

			`if queue_size > disable_by_batch_size:`
			`with patch.object(worker,`
			`'_run_no_spec',`
			`side_effect=ValueError(exception_secret)), \`
			`pytest.raises(ValueError, match=exception_secret):`
			`worker.execute_model(execute_model_req=execute_model_req)`

			`# When the batch size is larger than the threshold,`
			`# we expect no speculative tokens (0).`
			`expected_num_spec_tokens = None if queue_size < disable_by_batch_size else 0`
			`assert seq_group_metadata_list[`
			`0].num_speculative_tokens == expected_num_spec_tokens`

			`draft_worker.sampler_output.side_effect = ValueError(exception_secret)`

			`proposer = Top1Proposer(`
			`worker=draft_worker,`
			`device='cpu', # not used`
			`vocab_size=100, # not used`
			`# Must be long enough to avoid being skipped due to length.`
			`max_proposal_len=1024,`
			`)`

			`if queue_size < disable_by_batch_size:`
			`# Should raise exception when executing the mocked draft model.`
			`with pytest.raises(ValueError, match=exception_secret):`
			`proposer.get_spec_proposals(`
			`execute_model_req=ExecuteModelRequest(`
			`seq_group_metadata_list=seq_group_metadata_list,`
			`num_lookahead_slots=k),`
			`seq_ids_with_bonus_token_in_last_step=set())`
			`else:`
			`# Should not execute the draft model because spec decode is disabled`
			`# for all requests. Accordingly, the proposal length should be 0.`
			`proposals = proposer.get_spec_proposals(`
			`execute_model_req=ExecuteModelRequest(`
			`seq_group_metadata_list=seq_group_metadata_list,`
			`num_lookahead_slots=k),`
			`seq_ids_with_bonus_token_in_last_step=set())`
			`assert proposals.proposal_lens.tolist() == [0] * batch_size`