### What this PR does / why we need it?

Backport: https://github.com/vllm-project/vllm-ascend/pull/252

This adds speculative decoding support on Ascend, including speculating with a draft model, by matching n-grams in the prompt, with MLP speculators, and with EAGLE-based draft models.

Backport: https://github.com/vllm-project/vllm-ascend/pull/423

This makes the spec-decode `MultiStepWorker` fully support `TP1DraftModelRunner`, lets the draft model runner run multi-step prepare directly on the NPU, and lets the draft model runner use MLA.

1. Before this PR, `MultiStepWorker` never stepped into the branch that prepares inputs on the NPU; it only took the CPU-prepare branch (`line 52` of `vllm_ascend/patch/patch_multi_step_worker.py`). Although this had no effect on the correct operation of speculative decoding, and the two branches currently perform about the same, this PR enables the NPU branch. In general, there are two main changes in `patch_multi_step_worker.py`: first, the `is_cuda_like()` check is removed and the `TP1DraftModelRunner` rewritten in vllm-ascend is used instead; second, `supports_gpu_multi_step()` is made to return `True` on NPU devices whenever the outer `MultiStepWorker` can work correctly.
2. Before this PR, `TP1DraftModelRunner` supported only standard attention on the NPU, not MLA. The adaptation is in `vllm_ascend/worker/draft_model_runner.py`. Although I don't know why the `input_positions` of `model_input.attn_metadata` needs to be added in `execute_model` in vllm-ascend, `model_runner.py` does it, so I made the corresponding change here; otherwise, when the attention backend is MLA, it complains that `input_positions` cannot be found.
3. I commented out two lines at `line 118` of `draft_model_runner.py` to support the K>1 scenario:
```
# lora_mapping=model_input.lora_mapping,
# lora_requests=model_input.lora_requests,
```
I added comments there; in the future, when vllm-ascend supports the LoRA feature, these changes can be restored.

TODO:
- [ ] revert the patch when the related issues are addressed in vllm

### How was this patch tested?

CI passed with newly added tests.

- e2e test for medusa proposer: tests/singlecard/spec_decode/e2e/test_medusa_correctness.py
- e2e test for mlp proposer: tests/singlecard/spec_decode/e2e/test_mlp_correctness.py
- e2e test for n-gram proposer: tests/singlecard/spec_decode/e2e/test_ngram_correctness.py

Tests for patched files:

- tests/singlecard/spec_decode/test_dynamic_spec_decode.py
- tests/singlecard/spec_decode/test_multi_step_worker.py
- tests/singlecard/spec_decode/test_ngram_worker.py
- tests/singlecard/spec_decode/test_spec_decode_worker.py

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: mengwei805 <mengwei25@huawei.com>
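The file shown below is the patched n-gram worker test, `tests/singlecard/spec_decode/test_ngram_worker.py`. For context, here is a minimal sketch of how the n-gram speculative decoding path this backport enables might be turned on from user code; the engine-argument names follow the vLLM v0 spec-decode interface and may differ between vLLM releases, so treat this as an illustration rather than part of the diff:

```python
from vllm import LLM, SamplingParams

# Hedged sketch: "[ngram]" selects prompt-lookup proposals instead of a
# separate draft model; argument names may vary across vLLM versions.
llm = LLM(
    model="JackFram/llama-68m",
    speculative_model="[ngram]",   # propose tokens by n-gram prompt lookup
    num_speculative_tokens=5,      # k: tokens proposed per scoring step
    ngram_prompt_lookup_min=1,     # smallest n-gram window to match
    ngram_prompt_lookup_max=3,     # largest n-gram window to match
)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)
```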
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm-project/vllm/tests/spec_decode/test_ngram_worker.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import torch
from vllm.sequence import ExecuteModelRequest
from vllm.spec_decode.ngram_worker import NGramWorker
from vllm.spec_decode.top1_proposer import Top1Proposer

from tests.singlecard.spec_decode.utils import (
    create_seq_group_metadata_from_prompts, create_worker)
from vllm_ascend.patch.worker import patch_common  # noqa: F401


def test_ngram_algo_correctness_for_single_no_match():
    """Verify our ngram algo finds the right candidate in the prompt.

    For the scenario where no candidate can be found in a single batch.
    """
    block_size = 32
    num_gpu_blocks = 2048 // block_size
    seed = 100
    model_name = 'JackFram/llama-68m'
    vocab_size = 32_000
    device = 'npu:0'

    ngram_worker = create_worker(
        NGramWorker,
        model_name,
        block_size,
        num_gpu_blocks,
        seed,
    )

    proposer = Top1Proposer(
        worker=ngram_worker,
        device=device,
        vocab_size=vocab_size,
        max_proposal_len=20,
    )

    # set ngram window [1, 3], which is window=1/2/3
    ngram_worker.set_ngram_window_size(1, 3)

    prompts = [
        # shall find no candidate
        [1, 2, 3, 4, 5, 6, 7],
    ]

    proposal_len = 5
    final_prompt_lens = [len(prompt) + proposal_len for prompt in prompts]
    seq_group_metadata_list = create_seq_group_metadata_from_prompts(
        prompts,
        num_gpu_blocks,
        block_size,
        final_prompt_lens=final_prompt_lens)

    proposals = proposer.get_spec_proposals(
        execute_model_req=ExecuteModelRequest(
            seq_group_metadata_list=seq_group_metadata_list,
            num_lookahead_slots=proposal_len),
        seq_ids_with_bonus_token_in_last_step=None)

    assert torch.is_tensor(proposals.proposal_token_ids)
    assert torch.is_tensor(proposals.proposal_probs)

    assert proposals.proposal_token_ids.shape == torch.Size([1, proposal_len])
    assert proposals.proposal_probs.shape[:-1] == torch.Size([1, proposal_len])
    assert proposals.proposal_lens.shape == torch.Size([1])
    assert proposals.proposal_lens.tolist() == [0]


def test_ngram_algo_correctness_for_batches_not_match_all():
    """Verify our ngram algo finds the right candidates in the prompts.

    For the scenario where candidates are found for some, but not all,
    sequences in the batch.
    """
    block_size = 32
    num_gpu_blocks = 2048 // block_size
    seed = 100
    model_name = 'JackFram/llama-68m'
    vocab_size = 32_000
    device = 'npu:0'

    ngram_worker = create_worker(
        NGramWorker,
        model_name,
        block_size,
        num_gpu_blocks,
        seed,
    )

    proposer = Top1Proposer(
        worker=ngram_worker,
        device=device,
        vocab_size=vocab_size,
        max_proposal_len=20,
    )

    # set ngram window [1, 3], which is window=1/2/3
    ngram_worker.set_ngram_window_size(1, 3)

    prompts = [
        # shall find no candidate
        [1, 2, 3, 4, 5, 6, 7],
        # shall find candidate 12,13,14,15,16
        [11, 12, 13, 14, 15, 16, 11],
        # shall find candidate 23,24,25,26,21
        [21, 21, 22, 23, 24, 25, 26, 21, 22],
        # shall find candidate 34,35,36,37,38
        [31, 32, 31, 32, 33, 34, 35, 36, 37, 38, 31, 32, 33],
        # shall find no candidate as exceed max_proposal_len
        [
            31, 32, 31, 32, 31, 32, 31, 32, 31, 32, 31, 32, 33, 34, 35, 36, 37,
            38, 31, 32, 33
        ],
    ]

    proposal_len = 5
    final_prompt_lens = [len(prompt) + proposal_len for prompt in prompts]
    seq_group_metadata_list = create_seq_group_metadata_from_prompts(
        prompts,
        num_gpu_blocks,
        block_size,
        final_prompt_lens=final_prompt_lens)

    for sg in seq_group_metadata_list:
        sg.is_prompt = False
    proposals = proposer.get_spec_proposals(
        execute_model_req=ExecuteModelRequest(
            seq_group_metadata_list=seq_group_metadata_list,
            num_lookahead_slots=proposal_len),
        seq_ids_with_bonus_token_in_last_step=None)

    assert torch.is_tensor(proposals.proposal_token_ids)
    assert torch.is_tensor(proposals.proposal_probs)

    assert proposals.proposal_token_ids.shape == torch.Size([5, proposal_len])
    assert proposals.proposal_probs.shape[:-1] == torch.Size([5, proposal_len])
    assert proposals.proposal_lens.shape == torch.Size([5])

    # the first sequence has no match so proposal_len should be overwritten to 0
    assert proposals.proposal_lens.tolist(
    ) == [0] + [proposal_len for _ in range(3)] + [0]

    for i in range(proposal_len):
        assert proposals.proposal_token_ids[0][i] == -1
        assert proposals.proposal_token_ids[1][i] == prompts[1][i + 1]
        assert proposals.proposal_token_ids[2][i] == prompts[2][i + 3]
        assert proposals.proposal_token_ids[3][i] == prompts[3][i + 5]
        assert proposals.proposal_token_ids[4][i] == -1


def test_ngram_algo_correctness_for_batches_match_all():
    """Verify our ngram algo finds the right candidates in the prompts.

    For the scenario where a candidate is found for every sequence in
    the batch.
    """

    block_size = 32
    num_gpu_blocks = 2048 // block_size
    seed = 100
    model_name = 'JackFram/llama-68m'
    vocab_size = 32_000
    device = 'npu:0'

    ngram_worker = create_worker(
        NGramWorker,
        model_name,
        block_size,
        num_gpu_blocks,
        seed,
    )

    proposer = Top1Proposer(
        worker=ngram_worker,
        device=device,
        vocab_size=vocab_size,
        max_proposal_len=20,
    )

    # set ngram window [1, 3], which is window=1/2/3
    ngram_worker.set_ngram_window_size(1, 3)

    prompts = [
        # shall find candidate 12,13,14,15,16
        [11, 12, 13, 14, 15, 16, 11],
        # shall find candidate 23,24,25,26,21
        [21, 21, 22, 23, 24, 25, 26, 21, 22],
        # shall find candidate 34,35,36,37,38
        [31, 32, 31, 32, 33, 34, 35, 36, 37, 38, 31, 32, 33],
    ]

    proposal_len = 5
    final_prompt_lens = [len(prompt) + proposal_len for prompt in prompts]
    seq_group_metadata_list = create_seq_group_metadata_from_prompts(
        prompts,
        num_gpu_blocks,
        block_size,
        final_prompt_lens=final_prompt_lens)

    # Normally drafter is run on decode requests only; here we check the output
    # of the ngram worker as it is the sole proposer that has no forward.
    for sg in seq_group_metadata_list:
        sg.is_prompt = False
    proposals = proposer.get_spec_proposals(
        execute_model_req=ExecuteModelRequest(
            seq_group_metadata_list=seq_group_metadata_list,
            num_lookahead_slots=proposal_len),
        seq_ids_with_bonus_token_in_last_step=None)

    assert torch.is_tensor(proposals.proposal_token_ids)
    assert torch.is_tensor(proposals.proposal_probs)

    assert proposals.proposal_token_ids.shape == torch.Size([3, proposal_len])
    assert proposals.proposal_probs.shape[:-1] == torch.Size([3, proposal_len])
    assert proposals.proposal_lens.shape == torch.Size([3])

    assert proposals.proposal_lens.tolist() == [proposal_len for _ in range(3)]

    for i in range(proposal_len):
        assert proposals.proposal_token_ids[0][i] == prompts[0][i + 1]
        assert proposals.proposal_token_ids[1][i] == prompts[1][i + 3]
        assert proposals.proposal_token_ids[2][i] == prompts[2][i + 5]
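The index arithmetic asserted above (e.g. `prompts[2][i + 3]`) follows from prompt-lookup matching: the proposer takes the prompt's trailing n-gram, preferring larger n within the configured window, finds an earlier occurrence of it, and proposes the tokens that followed that occurrence. Below is a minimal pure-Python sketch of that behavior; `propose_by_prompt_lookup` is a hypothetical helper for illustration, not vLLM's vectorized `NGramWorker` implementation:

```python
from typing import List, Optional


def propose_by_prompt_lookup(prompt: List[int],
                             proposal_len: int,
                             min_n: int = 1,
                             max_n: int = 3) -> Optional[List[int]]:
    """Return the proposal_len tokens that followed an earlier occurrence
    of the prompt's trailing n-gram (largest n first), or None when no
    n-gram recurs (the tests assert a proposal length of 0 and -1 token
    ids in that case)."""
    for n in range(max_n, min_n - 1, -1):
        if len(prompt) <= n:
            continue
        suffix = prompt[-n:]
        for start in range(len(prompt) - n):  # earlier windows only
            if prompt[start:start + n] == suffix:
                follow = prompt[start + n:start + n + proposal_len]
                if len(follow) == proposal_len:
                    return follow
    return None


# Mirrors the batch cases above: the trailing bigram [21, 22] recurs at
# index 1, so the five tokens after it (indices 3..7) are proposed.
assert propose_by_prompt_lookup([21, 21, 22, 23, 24, 25, 26, 21, 22],
                                5) == [23, 24, 25, 26, 21]
assert propose_by_prompt_lookup([1, 2, 3, 4, 5, 6, 7], 5) is None
```

Note that the over-length prompt in the second test is rejected by `Top1Proposer` (`max_proposal_len=20`) rather than by the worker's matching itself, which is why its proposal length is also forced to 0.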