From 339d6894f649b92f57675583ec235f10dd858152 Mon Sep 17 00:00:00 2001 From: wemaster <54620334+mengwei805@users.noreply.github.com> Date: Mon, 23 Jun 2025 09:05:13 +0800 Subject: [PATCH] [CI/UT][bugfix] fix v0 spec decode (#1321) ### What this PR does / why we need it? 1. [PR913](https://github.com/vllm-project/vllm-ascend/pull/913) introduced an error that caused V0's spec decode function to fail. [PR1109](https://github.com/vllm-project/vllm-ascend/pull/1109) wanted to fix this problem. Unfortunately, the fix broke the ngram function. I fixed the ngram function in this PR. **PS**: Q: Why was the ngram regression not caught when PR1109 was merged? A: The newly introduced problem only appears when tp>1, and the use cases on CI are all tp=1. 2. In versions after 0.7.3, vllm-ascend deleted some spec decode UTs to avoid CI taking too long, including the eagle speculative UTs, which left CI unable to cover the eagle function. I added it (`test_eagle_correctness.py`) back in this PR. 3. Because of the reason mentioned in 2, the current version of Eagle has a problem. I located and fixed this problem. It was because vllm's `draft_model_runner.py` was changed and vllm-ascend was not synchronized in time. 4. Currently, the UTs of v0 and v1 are mixed in the spec_decode directory. I split them into two directories: spec_decode_v0 and spec_decode_v1. 5. I found `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_include_gpu_probs_tensor` and `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_should_modify_greedy_probs_inplace` have changed in vllm, so I removed them in this PR. ### Does this PR introduce _any_ user-facing change? This PR fixes the functions of ngram and eagle spec decode in the v0 engine. ### How was this patch tested? 
tested by CI Signed-off-by: mengwei805 --- .../workflows/vllm_ascend_test_long_term.yaml | 13 +- .../__init__.py | 0 .../conftest.py | 0 .../e2e/__init__.py | 0 .../e2e/conftest.py | 0 .../e2e/test_eagle_correctness.py | 344 ++++++++++++++++++ .../e2e/test_medusa_correctness.py | 5 +- .../e2e/test_mlp_correctness.py | 5 +- .../e2e/test_mtp_correctness.py | 0 .../e2e/test_ngram_correctness.py | 5 +- .../test_dynamic_spec_decode.py | 5 +- .../test_multi_step_worker.py | 2 +- .../test_ngram_worker.py | 2 +- .../test_spec_decode_worker.py | 8 +- .../test_utils.py | 0 .../{spec_decode => spec_decode_v0}/utils.py | 0 .../test_v1_mtp_correctness.py | 0 .../test_v1_spec_decode.py | 0 vllm_ascend/patch/__init__.py | 12 - .../patch_common/patch_multi_step_worker.py | 16 - .../patch_common/patch_spec_decode_worker.py | 12 +- vllm_ascend/worker/draft_model_runner.py | 13 +- 22 files changed, 384 insertions(+), 58 deletions(-) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/__init__.py (100%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/conftest.py (100%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/e2e/__init__.py (100%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/e2e/conftest.py (100%) create mode 100644 tests/e2e/long_term/spec_decode_v0/e2e/test_eagle_correctness.py rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/e2e/test_medusa_correctness.py (99%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/e2e/test_mlp_correctness.py (99%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/e2e/test_mtp_correctness.py (100%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/e2e/test_ngram_correctness.py (98%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/test_dynamic_spec_decode.py (96%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/test_multi_step_worker.py (99%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/test_ngram_worker.py (99%) rename 
tests/e2e/long_term/{spec_decode => spec_decode_v0}/test_spec_decode_worker.py (99%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/test_utils.py (100%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/utils.py (100%) rename tests/e2e/long_term/{spec_decode/e2e => spec_decode_v1}/test_v1_mtp_correctness.py (100%) rename tests/e2e/long_term/{spec_decode/e2e => spec_decode_v1}/test_v1_spec_decode.py (100%) diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml index b413896..dc26ed9 100644 --- a/.github/workflows/vllm_ascend_test_long_term.yaml +++ b/.github/workflows/vllm_ascend_test_long_term.yaml @@ -97,13 +97,16 @@ jobs: - name: Run vllm-project/vllm-ascend long term test run: | if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then - # spec decode test - VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py + # v0 spec decode test + VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode_v0/e2e/test_mtp_correctness.py # it needs a clean process + pytest -sv tests/e2e/long_term/spec_decode_v0 --ignore=tests/e2e/long_term/spec_decode_v0/e2e/test_mtp_correctness.py + # v1 spec decode test + VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode_v1/test_v1_mtp_correctness.py # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed - VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py - VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process - pytest -sv tests/e2e/long_term/spec_decode --ignore=tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py + VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode_v1/test_v1_spec_decode.py + 
# accuracy test single card pytest -sv tests/e2e/long_term/test_accuracy.py else + # accuracy test multi card VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py fi diff --git a/tests/e2e/long_term/spec_decode/__init__.py b/tests/e2e/long_term/spec_decode_v0/__init__.py similarity index 100% rename from tests/e2e/long_term/spec_decode/__init__.py rename to tests/e2e/long_term/spec_decode_v0/__init__.py diff --git a/tests/e2e/long_term/spec_decode/conftest.py b/tests/e2e/long_term/spec_decode_v0/conftest.py similarity index 100% rename from tests/e2e/long_term/spec_decode/conftest.py rename to tests/e2e/long_term/spec_decode_v0/conftest.py diff --git a/tests/e2e/long_term/spec_decode/e2e/__init__.py b/tests/e2e/long_term/spec_decode_v0/e2e/__init__.py similarity index 100% rename from tests/e2e/long_term/spec_decode/e2e/__init__.py rename to tests/e2e/long_term/spec_decode_v0/e2e/__init__.py diff --git a/tests/e2e/long_term/spec_decode/e2e/conftest.py b/tests/e2e/long_term/spec_decode_v0/e2e/conftest.py similarity index 100% rename from tests/e2e/long_term/spec_decode/e2e/conftest.py rename to tests/e2e/long_term/spec_decode_v0/e2e/conftest.py diff --git a/tests/e2e/long_term/spec_decode_v0/e2e/test_eagle_correctness.py b/tests/e2e/long_term/spec_decode_v0/e2e/test_eagle_correctness.py new file mode 100644 index 0000000..b44dc3c --- /dev/null +++ b/tests/e2e/long_term/spec_decode_v0/e2e/test_eagle_correctness.py @@ -0,0 +1,344 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/tests/spec_decode/e2e/test_eagle_correctness.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""This docstring details important information on the testing methodology. + +Most of the tests rely on "greedy equality", where we expect the output of +speculative decoding on a sequence to exactly match the output of normal non- +speculative decoding. + +Since speculative decoding with rejection sampling guarantees that the output +distribution matches the target model's output distribution (up to hardware +numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy +equality. + +However, we still need to verify below scenario could be passed: + * Batch size 1 greedy equality + * Batch size >1 greedy equality + * Test greedy equality under preemption + * Test greedy equality under various number of speculative tokens. + +With those tests, we can say at least, EAGLE would not break the +correctness for the target model outputs. +""" + +import pytest + +from tests.e2e.long_term.spec_decode_v0.e2e.conftest import \ + run_equality_correctness_test + +# main model +MAIN_MODEL = "JackFram/llama-68m" + +# speculative model +SPEC_MODEL = "abhigoyal/vllm-eagle-llama-68m-random" + +# max. number of speculative tokens: this corresponds to +# num_heads in the config.json of the speculator model. +MAX_SPEC_TOKENS = 4 + +# precision +# TODO The vLLM here uses float32, but some op on the vllm-ascend +# do not support float32, such as ROPE, When it is fixed, it is +# recommended to change this to float32. +PRECISION = "float16" + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. 
+ "enforce_eager": True, + + # Print spec metrics. + "disable_log_stats": False, + + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + }, + }, +]) +@pytest.mark.parametrize("output_len", [ + 128, +]) +@pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("seed", [1]) +def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size: int, output_len: int, + seed: int): + + run_equality_correctness_test(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size, output_len, seed) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Print spec metrics. 
+ "disable_log_stats": False, + + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [{ + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + "disable_logprobs": False, + }, +}, { + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + "disable_logprobs": True, + }, +}]) +@pytest.mark.parametrize("output_len", [ + 128, +]) +@pytest.mark.parametrize("batch_size", [8]) +@pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("logprobs", [1, 6]) +def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size: int, output_len: int, seed: int, + logprobs: int): + + run_equality_correctness_test( + vllm_runner, + common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, + test_llm_kwargs, + batch_size, + output_len, + seed, + logprobs=logprobs, + prompt_logprobs=logprobs, + disable_logprobs=test_llm_kwargs["speculative_config"] + ["disable_logprobs"]) + + +@pytest.mark.skipif(True, reason="Open it when graph mode ready.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "enforce_eager": False, + + # Print spec metrics. 
+ "disable_log_stats": False, + + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + }, + }, +]) +@pytest.mark.parametrize("output_len", [ + 128, +]) +@pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("seed", [1]) +def test_eagle_e2e_greedy_correctness_cuda_graph( + vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, + seed: int): + """Verify greedy equality with cuda graph enabled and different + batch sizes.""" + run_equality_correctness_test(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size, output_len, seed) + + +@pytest.mark.skipif(True, reason="Open it when preempt ready.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "block_size": 8, + # 2 for small prompt, 256//8 for generated. + "num_gpu_blocks_override": 2 + 256 // 8, + "max_model_len": (2 + 256 // 8) * 8, + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + }, + }, +]) +@pytest.mark.parametrize( + "output_len", + [ + # Use small output len for fast test. 
+ 128, + ]) +@pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("seed", [1]) +def test_eagle_e2e_greedy_correctness_with_preemption( + vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, + seed: int): + """Verify greedy equality, even when some sequences are preempted mid- + generation. + """ + run_equality_correctness_test(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size, output_len, seed) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + { + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": k, + }, + } + # Try a range of num. speculative tokens + for k in range(1, 1 + MAX_SPEC_TOKENS) + ]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_eagle_different_k(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, baseline_llm_kwargs, + test_llm_kwargs, batch_size: int, output_len: int, + seed: int): + """Verify that eagle speculative decoding produces exact equality + to without spec decode with different values of num_speculative_tokens. + """ + run_equality_correctness_test(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size, output_len, seed) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. 
+ "enforce_eager": True, + + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [{ + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + "disable_by_batch_size": 4, + }, +}]) +@pytest.mark.parametrize("batch_size", [1, 5]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_eagle_disable_queue(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, baseline_llm_kwargs, + test_llm_kwargs, batch_size: int, output_len: int, + seed: int): + """Verify that eagle speculative decoding produces exact equality + to without spec decode when speculation is disabled for large + batch sizes. + """ + run_equality_correctness_test(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size, output_len, seed) + + +if __name__ == "__main__": + import pytest + pytest.main([__file__]) \ No newline at end of file diff --git a/tests/e2e/long_term/spec_decode/e2e/test_medusa_correctness.py b/tests/e2e/long_term/spec_decode_v0/e2e/test_medusa_correctness.py similarity index 99% rename from tests/e2e/long_term/spec_decode/e2e/test_medusa_correctness.py rename to tests/e2e/long_term/spec_decode_v0/e2e/test_medusa_correctness.py index e0c2efd..26398e2 100644 --- a/tests/e2e/long_term/spec_decode/e2e/test_medusa_correctness.py +++ b/tests/e2e/long_term/spec_decode_v0/e2e/test_medusa_correctness.py @@ -41,9 +41,10 @@ import os import pytest -from tests.e2e.long_term.spec_decode.e2e.conftest import \ +from tests.e2e.long_term.spec_decode_v0.e2e.conftest import \ run_equality_correctness_test -from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill +from 
tests.e2e.long_term.spec_decode_v0.utils import \ + maybe_enable_chunked_prefill # main model # lmsys/vicuna-7b-v1.3 was to be used but it's causing diff --git a/tests/e2e/long_term/spec_decode/e2e/test_mlp_correctness.py b/tests/e2e/long_term/spec_decode_v0/e2e/test_mlp_correctness.py similarity index 99% rename from tests/e2e/long_term/spec_decode/e2e/test_mlp_correctness.py rename to tests/e2e/long_term/spec_decode_v0/e2e/test_mlp_correctness.py index 56db617..37003e4 100644 --- a/tests/e2e/long_term/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/e2e/long_term/spec_decode_v0/e2e/test_mlp_correctness.py @@ -41,9 +41,10 @@ import pytest from vllm.model_executor.layers.vocab_parallel_embedding import \ pad_vocab_size # noqa: F401 -from tests.e2e.long_term.spec_decode.e2e.conftest import \ +from tests.e2e.long_term.spec_decode_v0.e2e.conftest import \ run_equality_correctness_test -from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill +from tests.e2e.long_term.spec_decode_v0.utils import \ + maybe_enable_chunked_prefill # main model MAIN_MODEL = "JackFram/llama-160m" diff --git a/tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py b/tests/e2e/long_term/spec_decode_v0/e2e/test_mtp_correctness.py similarity index 100% rename from tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py rename to tests/e2e/long_term/spec_decode_v0/e2e/test_mtp_correctness.py diff --git a/tests/e2e/long_term/spec_decode/e2e/test_ngram_correctness.py b/tests/e2e/long_term/spec_decode_v0/e2e/test_ngram_correctness.py similarity index 98% rename from tests/e2e/long_term/spec_decode/e2e/test_ngram_correctness.py rename to tests/e2e/long_term/spec_decode_v0/e2e/test_ngram_correctness.py index b99187f..1cc20ab 100644 --- a/tests/e2e/long_term/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/e2e/long_term/spec_decode_v0/e2e/test_ngram_correctness.py @@ -44,9 +44,10 @@ for the target model outputs. 
import pytest -from tests.e2e.long_term.spec_decode.e2e.conftest import \ +from tests.e2e.long_term.spec_decode_v0.e2e.conftest import \ run_equality_correctness_test -from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill +from tests.e2e.long_term.spec_decode_v0.utils import \ + maybe_enable_chunked_prefill @pytest.mark.parametrize( diff --git a/tests/e2e/long_term/spec_decode/test_dynamic_spec_decode.py b/tests/e2e/long_term/spec_decode_v0/test_dynamic_spec_decode.py similarity index 96% rename from tests/e2e/long_term/spec_decode/test_dynamic_spec_decode.py rename to tests/e2e/long_term/spec_decode_v0/test_dynamic_spec_decode.py index 8e9480e..63e4e1d 100644 --- a/tests/e2e/long_term/spec_decode/test_dynamic_spec_decode.py +++ b/tests/e2e/long_term/spec_decode_v0/test_dynamic_spec_decode.py @@ -27,8 +27,9 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from tests.e2e.long_term.spec_decode.test_utils import mock_spec_decode_sampler -from tests.e2e.long_term.spec_decode.utils import create_batch, mock_worker +from tests.e2e.long_term.spec_decode_v0.test_utils import \ + mock_spec_decode_sampler +from tests.e2e.long_term.spec_decode_v0.utils import create_batch, mock_worker @pytest.mark.parametrize('queue_size', [4]) diff --git a/tests/e2e/long_term/spec_decode/test_multi_step_worker.py b/tests/e2e/long_term/spec_decode_v0/test_multi_step_worker.py similarity index 99% rename from tests/e2e/long_term/spec_decode/test_multi_step_worker.py rename to tests/e2e/long_term/spec_decode_v0/test_multi_step_worker.py index b3017a9..1dc50dd 100644 --- a/tests/e2e/long_term/spec_decode/test_multi_step_worker.py +++ b/tests/e2e/long_term/spec_decode_v0/test_multi_step_worker.py @@ -29,7 +29,7 @@ from vllm.sequence import (ExecuteModelRequest, HiddenStates, Logprob, from vllm.spec_decode.multi_step_worker import 
MultiStepWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from tests.e2e.long_term.spec_decode.utils import ( +from tests.e2e.long_term.spec_decode_v0.utils import ( assert_logprobs_dict_allclose, create_batch, create_seq_group_metadata_from_prompts, create_worker, patch_execute_model_with_seeds, zero_kv_cache) diff --git a/tests/e2e/long_term/spec_decode/test_ngram_worker.py b/tests/e2e/long_term/spec_decode_v0/test_ngram_worker.py similarity index 99% rename from tests/e2e/long_term/spec_decode/test_ngram_worker.py rename to tests/e2e/long_term/spec_decode_v0/test_ngram_worker.py index 078a4d2..30177b6 100644 --- a/tests/e2e/long_term/spec_decode/test_ngram_worker.py +++ b/tests/e2e/long_term/spec_decode_v0/test_ngram_worker.py @@ -22,7 +22,7 @@ from vllm.sequence import ExecuteModelRequest from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from tests.e2e.long_term.spec_decode.utils import ( +from tests.e2e.long_term.spec_decode_v0.utils import ( create_seq_group_metadata_from_prompts, create_worker) diff --git a/tests/e2e/long_term/spec_decode/test_spec_decode_worker.py b/tests/e2e/long_term/spec_decode_v0/test_spec_decode_worker.py similarity index 99% rename from tests/e2e/long_term/spec_decode/test_spec_decode_worker.py rename to tests/e2e/long_term/spec_decode_v0/test_spec_decode_worker.py index 94a1bcf..ffcb2f6 100644 --- a/tests/e2e/long_term/spec_decode/test_spec_decode_worker.py +++ b/tests/e2e/long_term/spec_decode_v0/test_spec_decode_worker.py @@ -35,10 +35,10 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, split_num_cache_blocks_evenly) -from tests.e2e.long_term.spec_decode.test_utils import mock_spec_decode_sampler -from tests.e2e.long_term.spec_decode.utils import (create_batch, - create_sampler_output_list, - create_worker, mock_worker) +from tests.e2e.long_term.spec_decode_v0.test_utils 
import \ + mock_spec_decode_sampler +from tests.e2e.long_term.spec_decode_v0.utils import ( + create_batch, create_sampler_output_list, create_worker, mock_worker) from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner from vllm_ascend.worker.worker import NPUWorker diff --git a/tests/e2e/long_term/spec_decode/test_utils.py b/tests/e2e/long_term/spec_decode_v0/test_utils.py similarity index 100% rename from tests/e2e/long_term/spec_decode/test_utils.py rename to tests/e2e/long_term/spec_decode_v0/test_utils.py diff --git a/tests/e2e/long_term/spec_decode/utils.py b/tests/e2e/long_term/spec_decode_v0/utils.py similarity index 100% rename from tests/e2e/long_term/spec_decode/utils.py rename to tests/e2e/long_term/spec_decode_v0/utils.py diff --git a/tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py b/tests/e2e/long_term/spec_decode_v1/test_v1_mtp_correctness.py similarity index 100% rename from tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py rename to tests/e2e/long_term/spec_decode_v1/test_v1_mtp_correctness.py diff --git a/tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py b/tests/e2e/long_term/spec_decode_v1/test_v1_spec_decode.py similarity index 100% rename from tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py rename to tests/e2e/long_term/spec_decode_v1/test_v1_spec_decode.py diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index d817f90..59d6035 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -100,18 +100,6 @@ # Future Plan: # Revert it when the related pr is merged in vllm and vllm-ascend. # -# 2. `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_include_gpu_probs_tensor` and -# `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_should_modify_greedy_probs_inplace` -# Why: -# vLLM `Remove Sampler from Model Code` so vllm-ascend needs adapt to this change. -# How: -# Use vLLM 0.8.4 method to patch it. 
-# Related PR (if no, explain why): -# - https://github.com/vllm-project/vllm/pull/15195 -# - https://github.com/vllm-project/vllm-ascend/pull/395 -# Future Plan: -# Remove it when we identify the reasons clearly. -# # ** File: worker/patch_common/patch_spec_decode_worker.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.spec_decode.spec_decode_worker.SpecDecodeWorker.create_worker` diff --git a/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py b/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py index ca87729..53ce312 100644 --- a/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py +++ b/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py @@ -88,20 +88,4 @@ def sampler_output( return filtered_model_outputs, True -def set_include_gpu_probs_tensor(self) -> None: - # Need include_gpu_probs_tensor for MultiSteoWorker - if hasattr(self.model_runner.model, "sampler"): - self.model_runner.model.sampler.include_gpu_probs_tensor = True - self.model_runner.sampler.include_gpu_probs_tensor = True - - -def set_should_modify_greedy_probs_inplace(self) -> None: - if hasattr(self.model_runner.model, "sampler"): - self.model_runner.model.sampler.should_modify_greedy_probs_inplace = ( - True) - self.model_runner.sampler.should_modify_greedy_probs_inplace = True - - MultiStepWorker.sampler_output = torch.inference_mode()(sampler_output) -MultiStepWorker.set_include_gpu_probs_tensor = set_include_gpu_probs_tensor -MultiStepWorker.set_should_modify_greedy_probs_inplace = set_should_modify_greedy_probs_inplace diff --git a/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py b/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py index 66e7aa5..d271e65 100644 --- a/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py +++ b/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py @@ -57,11 +57,6 @@ def create_worker( ngram_prompt_lookup_min = ( 
draft_worker_kwargs.pop("ngram_prompt_lookup_min")) - # TODO(Yizhou): A quick fix, must be refactored ASAP - draft_worker_kwargs["vllm_config"].parallel_config.expert_parallel_size = 1 - draft_worker_kwargs[ - "vllm_config"].parallel_config.expert_tensor_parallel_size = 1 - draft_model_config = draft_worker_kwargs["vllm_config"].model_config draft_parallel_config: ParallelConfig = draft_worker_kwargs[ 'vllm_config'].parallel_config @@ -72,6 +67,13 @@ def create_worker( proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min, ngram_prompt_lookup_max) else: + # TODO(Yizhou): A quick fix, must be refactored ASAP + # ngram need not this fix. + draft_worker_kwargs[ + "vllm_config"].parallel_config.expert_parallel_size = 1 + draft_worker_kwargs[ + "vllm_config"].parallel_config.expert_tensor_parallel_size = 1 + draft_tp = draft_parallel_config.tensor_parallel_size target_tp = scorer_worker.parallel_config.tensor_parallel_size diff --git a/vllm_ascend/worker/draft_model_runner.py b/vllm_ascend/worker/draft_model_runner.py index 1306b1e..b070da1 100644 --- a/vllm_ascend/worker/draft_model_runner.py +++ b/vllm_ascend/worker/draft_model_runner.py @@ -51,12 +51,6 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase): """ def __init__(self, model_runner: ModelRunnerBase): - if hasattr( - model_runner, - "return_hidden_states") and model_runner.return_hidden_states: - raise ValueError( - "return_hidden_states is not supported for TP1DraftModelRunner." 
- ) super().__init__(model_runner) self.indices_of_seq_with_bonus_tokens = None @@ -211,6 +205,9 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase): if self.prompt_adapter_config is not None: raise ValueError("TP1DraftModelRunner has no support for " "prompt_adapter_config") + if model_input.inputs_embeds is not None: + raise ValueError("TP1DraftModelRunner has no support for " + "inputs_embeds") if model_input.multi_modal_kwargs: raise ValueError( "TP1DraftModelRunner has no support for multi_modal_kwargs" @@ -272,6 +269,7 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase): hidden_states = model_executable( input_ids=model_input.input_tokens, + inputs_embeds=None, positions=model_input.input_positions, intermediate_tensors=intermediate_tensors, **MultiModalKwargs.as_kwargs(multi_modal_kwargs, @@ -293,6 +291,9 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase): ) outputs.append(output) + if self.return_hidden_states and is_fallback: + output.hidden_states = hidden_states + if model_input.attn_metadata.num_prefills == 0 \ and self.indices_of_seq_with_bonus_tokens is not None: assert output.sampled_token_ids is not None