[SpecDecode] Add spec decode support (#500)
### What this PR does / why we need it?
Backport: https://github.com/vllm-project/vllm-ascend/pull/252
This adds speculative decoding support on Ascend, including speculating
with a draft model, matching n-grams in the prompt, using MLP
speculators, and using EAGLE-based draft models.
Backport: https://github.com/vllm-project/vllm-ascend/pull/423
This makes the spec decode `MultiStepWorker` fully support
`TP1DraftModelRunner`: the draft model runner can run multi-step
prepare directly on the NPU, and it can use MLA.
1. Before this PR, `MultiStepWorker` never stepped into the branch that
uses NPU prepare, only into the branch that uses CPU prepare (line 52
of `vllm_ascend/patch/patch_multi_step_worker.py`). Although this has
no effect on the correctness of speculative decoding, and the
performance of the two branches is basically the same in the current
version, this PR enables the NPU-prepare branch. There are two main
changes in `patch_multi_step_worker.py`: first, the `is_cuda_like()`
check is removed and the `TP1DraftModelRunner` rewritten in
vllm_ascend is used instead; second, `supports_gpu_multi_step()` is
made to return `True` on NPU devices whenever the outer
`MultiStepWorker` can work correctly. A hedged sketch of this change
follows the list.
2. Before this PR, `TP1DraftModelRunner` only supported plain
Attention on NPU, not MLA. The adaptation is in
`vllm_ascend/worker/draft_model_runner.py`. I don't know why the
`input_positions` field of `model_input.attn_metadata` needs to be set
in `execute_model`, but vllm-ascend's `model_runner.py` does it, so I
made the corresponding change; otherwise, when the attention backend
is MLA, it reports that `input_positions` cannot be found. This change
is also sketched below.
3. To support the K>1 scenario, I commented out two lines at line 118
of `draft_model_runner.py`:
```
# lora_mapping=model_input.lora_mapping,
# lora_requests=model_input.lora_requests,
```
I left a comment in place; once vllm-ascend supports the LoRA feature,
these lines can be restored.
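For reference, here is a minimal sketch of the first change. The
import path follows the PR description, but the predicate body is a
hypothetical simplification, not the actual patch:
```python
# patch_multi_step_worker.py (illustrative sketch, not the real patch body).
# The upstream is_cuda_like() gate is dropped and the NPU-aware runner
# rewritten in vllm_ascend is used, so multi-step prepare runs on the NPU.
from vllm.sequence import ExecuteModelRequest

from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner  # noqa


def supports_gpu_multi_step(execute_model_req: ExecuteModelRequest) -> bool:
    """Hypothetical simplified predicate: allow on-device multi-step
    whenever the outer MultiStepWorker can handle the request (e.g. a
    decode-only batch), instead of requiring a CUDA-like platform."""
    return all(not seq_group.is_prompt
               for seq_group in execute_model_req.seq_group_metadata_list)
```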
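And a sketch of the MLA adaptation; the
`attn_metadata.input_positions` assignment is the change described
above, while the helper wrapping it is hypothetical:
```python
# draft_model_runner.py (illustrative excerpt, not the full method).
# Mirrors what vllm-ascend's model_runner.py does so that, when the
# attention backend is MLA, the positions it looks up actually exist.
def _sync_input_positions(model_input) -> None:
    """Hypothetical helper: copy positions onto the attention metadata."""
    if model_input.attn_metadata is not None:
        # Without this, MLA reports that input_positions cannot be found.
        model_input.attn_metadata.input_positions = model_input.input_positions
```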
TODO:
- [ ] revert the patch when the related issues are addressed in vllm
### How was this patch tested?
CI passed with the newly added tests.
- e2e test for medusa proposer:
tests/singlecard/spec_decode/e2e/test_medusa_correctness.py
- e2e test for mlp proposer:
tests/singlecard/spec_decode/e2e/test_mlp_correctness.py
- e2e test for n-gram proposer:
tests/singlecard/spec_decode/e2e/test_ngram_correctness.py
Tests for patched files:
- tests/singlecard/spec_decode/test_dynamic_spec_decode.py
- tests/singlecard/spec_decode/test_multi_step_worker.py
- tests/singlecard/spec_decode/test_ngram_worker.py
- tests/singlecard/spec_decode/test_spec_decode_worker.py
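To run one of these locally, a hedged example using pytest's Python
entry point (the plain `pytest <path>` CLI is equivalent):
```python
# Run the dynamic spec decode test file added by this PR.
import sys

import pytest

sys.exit(pytest.main(
    ["-v", "tests/singlecard/spec_decode/test_dynamic_spec_decode.py"]))
```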
---------
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: mengwei805 <mengwei25@huawei.com>
2025-04-17 20:16:32 +08:00
The patched test file content follows (adapted from vllm-project/vllm's
`tests/spec_decode/test_dynamic_spec_decode.py`):
```python
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm-project/vllm/tests/spec_decode/test_dynamic_spec_decode.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from unittest.mock import MagicMock, patch

import pytest
import torch
from vllm.sequence import ExecuteModelRequest
from vllm.spec_decode.metrics import AsyncMetricsCollector
from vllm.spec_decode.multi_step_worker import MultiStepWorker
from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker
from vllm.spec_decode.top1_proposer import Top1Proposer

from tests.long_term.spec_decode.test_utils import mock_spec_decode_sampler
from tests.long_term.spec_decode.utils import create_batch, mock_worker

@pytest.mark.parametrize('queue_size', [4])
@pytest.mark.parametrize('batch_size', [1])
@pytest.mark.parametrize('k', [1])
@pytest.mark.parametrize("acceptance_sampler_method",
                         ["rejection_sampler", "typical_acceptance_sampler"])
@torch.inference_mode()
def test_disable_spec_tokens(queue_size: int, batch_size: int, k: int,
                             acceptance_sampler_method: str):
    """Verify that speculative tokens are disabled when the batch size
    exceeds the threshold.
    """
    disable_by_batch_size = 3
    draft_worker = mock_worker(cls=MultiStepWorker)
    target_worker = mock_worker()
    metrics_collector = MagicMock(spec=AsyncMetricsCollector)
    worker = SpecDecodeWorker(proposer_worker=draft_worker,
                              scorer_worker=target_worker,
                              spec_decode_sampler=mock_spec_decode_sampler(
                                  acceptance_sampler_method),
                              disable_logprobs=False,
                              metrics_collector=metrics_collector,
                              disable_by_batch_size=disable_by_batch_size)

    exception_secret = 'artificial stop'
    draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret)

    seq_group_metadata_list, _, _ = create_batch(batch_size, k)
    execute_model_req = ExecuteModelRequest(
        seq_group_metadata_list=seq_group_metadata_list,
        num_lookahead_slots=k,
        running_queue_size=queue_size)

    if queue_size > disable_by_batch_size:
        with patch.object(worker,
                          '_run_no_spec',
                          side_effect=ValueError(exception_secret)), \
                pytest.raises(ValueError, match=exception_secret):
            worker.execute_model(execute_model_req=execute_model_req)

    # When the batch size is larger than the threshold,
    # we expect no speculative tokens (0).
    expected_num_spec_tokens = None if queue_size < disable_by_batch_size else 0
    assert seq_group_metadata_list[
        0].num_speculative_tokens == expected_num_spec_tokens

    draft_worker.sampler_output.side_effect = ValueError(exception_secret)

    proposer = Top1Proposer(
        worker=draft_worker,
        device='cpu',  # not used
        vocab_size=100,  # not used
        # Must be long enough to avoid being skipped due to length.
        max_proposal_len=1024,
    )

    if queue_size < disable_by_batch_size:
        # Should raise exception when executing the mocked draft model.
        with pytest.raises(ValueError, match=exception_secret):
            proposer.get_spec_proposals(
                execute_model_req=ExecuteModelRequest(
                    seq_group_metadata_list=seq_group_metadata_list,
                    num_lookahead_slots=k),
                seq_ids_with_bonus_token_in_last_step=set())
    else:
        # Should not execute the draft model because spec decode is disabled
        # for all requests. Accordingly, the proposal length should be 0.
        proposals = proposer.get_spec_proposals(
            execute_model_req=ExecuteModelRequest(
                seq_group_metadata_list=seq_group_metadata_list,
                num_lookahead_slots=k),
            seq_ids_with_bonus_token_in_last_step=set())
        assert proposals.proposal_lens.tolist() == [0] * batch_size
```