[CI/UT][bugfix] fix v0 spec decode (#1321)

### What this PR does / why we need it?
1. [PR913](https://github.com/vllm-project/vllm-ascend/pull/913)
introduced a bug that broke V0's spec decode.
[PR1109](https://github.com/vllm-project/vllm-ascend/pull/1109)
attempted to fix it, but that fix in turn broke the ngram function. This
PR repairs the ngram function (a usage sketch follows this list).
**PS**: Q: Why did CI not catch the ngram regression when PR1109 was
merged? A: The regression only appears when tp>1, and the CI cases all
run with tp=1.
2. In versions after 0.7.3, vllm-ascend removed some spec decode UTs to
keep CI time down, including the eagle speculative UTs, which left CI
unable to cover the eagle function. This PR adds them back
(`test_eagle_correctness.py`).
3. Because of the coverage gap described in 2, the current eagle
implementation is broken. I tracked the problem down: vLLM's
`draft_model_runner.py` changed and vllm-ascend was not kept in sync.
This PR fixes that.
4. The v0 and v1 UTs were previously mixed together in the spec_decode
directory. This PR splits them into two directories: spec_decode_v0 and
spec_decode_v1.
5. `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_include_gpu_probs_tensor`
and
`vllm.spec_decode.multi_step_worker.MultiStepWorker.set_should_modify_greedy_probs_inplace`
have changed in vLLM, so this PR removes the corresponding vllm-ascend
patches.
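
For context, a minimal sketch of driving the two v0 spec decode paths this PR repairs. It is illustrative only: the model names are reused from the new `test_eagle_correctness.py`, and the exact `speculative_config` keys and the `VLLM_USE_V1` switch are assumptions that may vary across vLLM versions.

```python
# Minimal sketch (not part of this PR's diff); keys/flags may differ by version.
import os

os.environ["VLLM_USE_V1"] = "0"  # assumption: force the v0 engine

from vllm import LLM, SamplingParams

prompts = ["The future of AI is"]
greedy = SamplingParams(temperature=0.0, max_tokens=64)  # greedy decoding

# eagle: a small draft model proposes tokens that the target model verifies.
llm = LLM(
    model="JackFram/llama-68m",
    speculative_config={
        "model": "abhigoyal/vllm-eagle-llama-68m-random",
        "num_speculative_tokens": 4,
    },
)
print(llm.generate(prompts, greedy)[0].outputs[0].text)

# ngram: drafts come from prompt lookup instead of a draft model, e.g.
# speculative_config={"method": "ngram", "num_speculative_tokens": 4,
#                     "prompt_lookup_max": 4}
```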

### Does this PR introduce _any_ user-facing change?
This PR fixes ngram and eagle spec decode in the v0 engine.

### How was this patch tested?
Tested by CI.
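
For local reproduction, something like the following should mirror the CI commands in the workflow change below; this is a hedged sketch that assumes an Ascend NPU environment with vllm-ascend installed, and uses the test paths added in this PR.

```python
# Hedged sketch: run the restored v0 eagle suite the same way CI does.
import os

import pytest

os.environ["VLLM_USE_MODELSCOPE"] = "True"  # CI pulls models via ModelScope
pytest.main(
    ["-sv", "tests/e2e/long_term/spec_decode_v0/e2e/test_eagle_correctness.py"])
```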

Signed-off-by: mengwei805 <mengwei25@huawei.com>
Commit 339d6894f6 (parent 7e6efbf2a9), authored by wemaster and committed via GitHub on 2025-06-23 09:05:13 +08:00.
22 changed files with 384 additions and 58 deletions.

---

@@ -97,13 +97,16 @@ jobs:
       - name: Run vllm-project/vllm-ascend long term test
         run: |
           if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
-            # spec decode test
-            VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
+            # v0 spec decode test
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode_v0/e2e/test_mtp_correctness.py # it needs a clean process
+            pytest -sv tests/e2e/long_term/spec_decode_v0 --ignore=tests/e2e/long_term/spec_decode_v0/e2e/test_mtp_correctness.py
+            # v1 spec decode test
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode_v1/test_v1_mtp_correctness.py
             # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
-            VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py
-            VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process
-            pytest -sv tests/e2e/long_term/spec_decode --ignore=tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode_v1/test_v1_spec_decode.py
             # accuracy test single card
             pytest -sv tests/e2e/long_term/test_accuracy.py
           else
             # accuracy test multi card
             VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py
           fi

---

@@ -0,0 +1,344 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm-project/vllm/tests/spec_decode/e2e/test_eagle_correctness.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""This docstring details important information on the testing methodology.
Most of the tests rely on "greedy equality", where we expect the output of
speculative decoding on a sequence to exactly match the output of normal non-
speculative decoding.
Since speculative decoding with rejection sampling guarantees that the output
distribution matches the target model's output distribution (up to hardware
numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
equality.
However, we still need to verify that the scenarios below pass:
* Batch size 1 greedy equality
* Batch size >1 greedy equality
* Test greedy equality under preemption
* Test greedy equality under various number of speculative tokens.
With those tests, we can say that, at a minimum, EAGLE does not break
correctness for the target model outputs.
"""

import pytest

from tests.e2e.long_term.spec_decode_v0.e2e.conftest import \
    run_equality_correctness_test

# main model
MAIN_MODEL = "JackFram/llama-68m"
# speculative model
SPEC_MODEL = "abhigoyal/vllm-eagle-llama-68m-random"
# max. number of speculative tokens: this corresponds to
# num_heads in the config.json of the speculator model.
MAX_SPEC_TOKENS = 4
# precision
# TODO: vLLM uses float32 here, but some ops in vllm-ascend (such as
# RoPE) do not support float32 yet; switch this back to float32 once
# that is fixed.
PRECISION = "float16"


@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Print spec metrics.
"disable_log_stats": False,
# Precision
"dtype": PRECISION,
# Main model
"model_name": MAIN_MODEL,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize("output_len", [
128,
])
@pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1])
def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
batch_size: int, output_len: int,
seed: int):
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
                                  batch_size, output_len, seed)


@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Print spec metrics.
"disable_log_stats": False,
# Precision
"dtype": PRECISION,
# Main model
"model_name": MAIN_MODEL,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs": False,
},
}, {
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs": True,
},
}])
@pytest.mark.parametrize("output_len", [
128,
])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("logprobs", [1, 6])
def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
batch_size: int, output_len: int, seed: int,
logprobs: int):
run_equality_correctness_test(
vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
logprobs=logprobs,
prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs["speculative_config"]
["disable_logprobs"])
@pytest.mark.skipif(True, reason="Open it when graph mode ready.")
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"enforce_eager": False,
# Print spec metrics.
"disable_log_stats": False,
# Precision
"dtype": PRECISION,
# Main model
"model_name": MAIN_MODEL,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize("output_len", [
128,
])
@pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1])
def test_eagle_e2e_greedy_correctness_cuda_graph(
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
seed: int):
"""Verify greedy equality with cuda graph enabled and different
batch sizes."""
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
                                  batch_size, output_len, seed)


@pytest.mark.skipif(True, reason="Open it when preempt ready.")
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"block_size": 8,
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override": 2 + 256 // 8,
"max_model_len": (2 + 256 // 8) * 8,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Precision
"dtype": PRECISION,
# Main model
"model_name": MAIN_MODEL,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize(
"output_len",
[
# Use small output len for fast test.
128,
])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
def test_eagle_e2e_greedy_correctness_with_preemption(
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
seed: int):
"""Verify greedy equality, even when some sequences are preempted mid-
generation.
"""
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
                                  batch_size, output_len, seed)


@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Precision
"dtype": PRECISION,
# Main model
"model_name": MAIN_MODEL,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize(
"test_llm_kwargs",
[
{
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": k,
},
}
# Try a range of num. speculative tokens
for k in range(1, 1 + MAX_SPEC_TOKENS)
])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize(
"output_len",
[
# Use smaller output len for fast test.
32,
])
@pytest.mark.parametrize("seed", [1])
def test_eagle_different_k(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int, output_len: int,
seed: int):
"""Verify that eagle speculative decoding produces exact equality
to without spec decode with different values of num_speculative_tokens.
"""
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
                                  batch_size, output_len, seed)


@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Precision
"dtype": PRECISION,
# Main model
"model_name": MAIN_MODEL,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_by_batch_size": 4,
},
}])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
"output_len",
[
# Use smaller output len for fast test.
32,
])
@pytest.mark.parametrize("seed", [1])
def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int, output_len: int,
seed: int):
"""Verify that eagle speculative decoding produces exact equality
to without spec decode when speculation is disabled for large
batch sizes.
"""
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
                                  batch_size, output_len, seed)


if __name__ == "__main__":
    import pytest
    pytest.main([__file__])

---

@@ -41,9 +41,10 @@ import os
 import pytest
 
-from tests.e2e.long_term.spec_decode.e2e.conftest import \
+from tests.e2e.long_term.spec_decode_v0.e2e.conftest import \
     run_equality_correctness_test
-from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill
+from tests.e2e.long_term.spec_decode_v0.utils import \
+    maybe_enable_chunked_prefill
 
 # main model
 # lmsys/vicuna-7b-v1.3 was to be used but it's causing

---

@@ -41,9 +41,10 @@ import pytest
 from vllm.model_executor.layers.vocab_parallel_embedding import \
     pad_vocab_size  # noqa: F401
 
-from tests.e2e.long_term.spec_decode.e2e.conftest import \
+from tests.e2e.long_term.spec_decode_v0.e2e.conftest import \
     run_equality_correctness_test
-from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill
+from tests.e2e.long_term.spec_decode_v0.utils import \
+    maybe_enable_chunked_prefill
 
 # main model
 MAIN_MODEL = "JackFram/llama-160m"

---

@@ -44,9 +44,10 @@ for the target model outputs.
 import pytest
 
-from tests.e2e.long_term.spec_decode.e2e.conftest import \
+from tests.e2e.long_term.spec_decode_v0.e2e.conftest import \
     run_equality_correctness_test
-from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill
+from tests.e2e.long_term.spec_decode_v0.utils import \
+    maybe_enable_chunked_prefill
 
 
 @pytest.mark.parametrize(

---

@@ -27,8 +27,9 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker
 from vllm.spec_decode.top1_proposer import Top1Proposer
 
-from tests.e2e.long_term.spec_decode.test_utils import mock_spec_decode_sampler
-from tests.e2e.long_term.spec_decode.utils import create_batch, mock_worker
+from tests.e2e.long_term.spec_decode_v0.test_utils import \
+    mock_spec_decode_sampler
+from tests.e2e.long_term.spec_decode_v0.utils import create_batch, mock_worker
 
 
 @pytest.mark.parametrize('queue_size', [4])

---

@@ -29,7 +29,7 @@ from vllm.sequence import (ExecuteModelRequest, HiddenStates, Logprob,
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.top1_proposer import Top1Proposer
 
-from tests.e2e.long_term.spec_decode.utils import (
+from tests.e2e.long_term.spec_decode_v0.utils import (
     assert_logprobs_dict_allclose, create_batch,
     create_seq_group_metadata_from_prompts, create_worker,
     patch_execute_model_with_seeds, zero_kv_cache)

---

@@ -22,7 +22,7 @@ from vllm.sequence import ExecuteModelRequest
 from vllm.spec_decode.ngram_worker import NGramWorker
 from vllm.spec_decode.top1_proposer import Top1Proposer
 
-from tests.e2e.long_term.spec_decode.utils import (
+from tests.e2e.long_term.spec_decode_v0.utils import (
     create_seq_group_metadata_from_prompts, create_worker)

---

@@ -35,10 +35,10 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker,
                                                  split_num_cache_blocks_evenly)
 
-from tests.e2e.long_term.spec_decode.test_utils import mock_spec_decode_sampler
-from tests.e2e.long_term.spec_decode.utils import (create_batch,
-                                                   create_sampler_output_list,
-                                                   create_worker, mock_worker)
+from tests.e2e.long_term.spec_decode_v0.test_utils import \
+    mock_spec_decode_sampler
+from tests.e2e.long_term.spec_decode_v0.utils import (
+    create_batch, create_sampler_output_list, create_worker, mock_worker)
 from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner
 from vllm_ascend.worker.worker import NPUWorker

---

@@ -100,18 +100,6 @@
 #   Future Plan:
 #       Revert it when the related pr is merged in vllm and vllm-ascend.
 #
-# 2. `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_include_gpu_probs_tensor` and
-#    `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_should_modify_greedy_probs_inplace`
-#   Why:
-#       vLLM `Remove Sampler from Model Code` so vllm-ascend needs adapt to this change.
-#   How:
-#       Use vLLM 0.8.4 method to patch it.
-#   Related PR (if no, explain why):
-#       - https://github.com/vllm-project/vllm/pull/15195
-#       - https://github.com/vllm-project/vllm-ascend/pull/395
-#   Future Plan:
-#       Remove it when we identify the reasons clearly.
-#
 # ** File: worker/patch_common/patch_spec_decode_worker.py **
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.spec_decode.spec_decode_worker.SpecDecodeWorker.create_worker`

---

@@ -88,20 +88,4 @@ def sampler_output(
     return filtered_model_outputs, True
 
 
-def set_include_gpu_probs_tensor(self) -> None:
-    # Need include_gpu_probs_tensor for MultiSteoWorker
-    if hasattr(self.model_runner.model, "sampler"):
-        self.model_runner.model.sampler.include_gpu_probs_tensor = True
-    self.model_runner.sampler.include_gpu_probs_tensor = True
-
-
-def set_should_modify_greedy_probs_inplace(self) -> None:
-    if hasattr(self.model_runner.model, "sampler"):
-        self.model_runner.model.sampler.should_modify_greedy_probs_inplace = (
-            True)
-    self.model_runner.sampler.should_modify_greedy_probs_inplace = True
-
-
 MultiStepWorker.sampler_output = torch.inference_mode()(sampler_output)
-MultiStepWorker.set_include_gpu_probs_tensor = set_include_gpu_probs_tensor
-MultiStepWorker.set_should_modify_greedy_probs_inplace = set_should_modify_greedy_probs_inplace

---

@@ -57,11 +57,6 @@ def create_worker(
         ngram_prompt_lookup_min = (
             draft_worker_kwargs.pop("ngram_prompt_lookup_min"))
 
-    # TODO(Yizhou): A quick fix, must be refactored ASAP
-    draft_worker_kwargs["vllm_config"].parallel_config.expert_parallel_size = 1
-    draft_worker_kwargs[
-        "vllm_config"].parallel_config.expert_tensor_parallel_size = 1
-
     draft_model_config = draft_worker_kwargs["vllm_config"].model_config
     draft_parallel_config: ParallelConfig = draft_worker_kwargs[
         'vllm_config'].parallel_config
@@ -72,6 +67,13 @@ def create_worker(
         proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min,
                                               ngram_prompt_lookup_max)
     else:
+        # TODO(Yizhou): A quick fix, must be refactored ASAP
+        # ngram need not this fix.
+        draft_worker_kwargs[
+            "vllm_config"].parallel_config.expert_parallel_size = 1
+        draft_worker_kwargs[
+            "vllm_config"].parallel_config.expert_tensor_parallel_size = 1
+
         draft_tp = draft_parallel_config.tensor_parallel_size
         target_tp = scorer_worker.parallel_config.tensor_parallel_size

---

@@ -51,12 +51,6 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase):
     """
 
     def __init__(self, model_runner: ModelRunnerBase):
-        if hasattr(
-                model_runner,
-                "return_hidden_states") and model_runner.return_hidden_states:
-            raise ValueError(
-                "return_hidden_states is not supported for TP1DraftModelRunner."
-            )
         super().__init__(model_runner)
 
         self.indices_of_seq_with_bonus_tokens = None
@@ -211,6 +205,9 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase):
         if self.prompt_adapter_config is not None:
             raise ValueError("TP1DraftModelRunner has no support for "
                              "prompt_adapter_config")
+        if model_input.inputs_embeds is not None:
+            raise ValueError("TP1DraftModelRunner has no support for "
+                             "inputs_embeds")
         if model_input.multi_modal_kwargs:
             raise ValueError(
                 "TP1DraftModelRunner has no support for multi_modal_kwargs"
@@ -272,6 +269,7 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase):
             hidden_states = model_executable(
                 input_ids=model_input.input_tokens,
+                inputs_embeds=None,
                 positions=model_input.input_positions,
                 intermediate_tensors=intermediate_tensors,
                 **MultiModalKwargs.as_kwargs(multi_modal_kwargs,
@@ -293,6 +291,9 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase):
             )
             outputs.append(output)
 
+            if self.return_hidden_states and is_fallback:
+                output.hidden_states = hidden_states
+
             if model_input.attn_metadata.num_prefills == 0 \
                 and self.indices_of_seq_with_bonus_tokens is not None:
                 assert output.sampled_token_ids is not None