[CI/UT][bugfix] fix v0 spec decode (#1321)
### What this PR does / why we need it?
1. [PR913](https://github.com/vllm-project/vllm-ascend/pull/913) introduced an error that caused V0's spec decode function to fail. [PR1109](https://github.com/vllm-project/vllm-ascend/pull/1109) attempted to fix this problem. Unfortunately, that fix broke the ngram function. I fixed the ngram function in this PR. **PS**: Q: Why was the ngram problem not found when PR1109 was merged? A: The newly introduced problem only appears when tp>1, and the use cases on CI all use tp=1.
2. In versions after 0.7.3, vllm-ascend deleted some spec decode UTs, including the eagle speculative UTs, to avoid CI taking too long, which left CI unable to cover the eagle function. I added them (`test_eagle_correctness.py`) back in this PR.
3. For the reason mentioned in 2, the current version of Eagle has a problem. I located and fixed it. The cause was that vllm's `draft_model_runner.py` had changed and vllm-ascend was not synchronized in time.
4. Currently, the UTs of v0 and v1 are mixed in the spec_decode directory. I split them into two directories: spec_decode_v0 and spec_decode_v1.
5. I found that `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_include_gpu_probs_tensor` and `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_should_modify_greedy_probs_inplace` have changed in vllm, so I removed the vllm-ascend patches for them in this PR.

### Does this PR introduce _any_ user-facing change?
This PR fixes the ngram and eagle spec decode functions in the v0 engine.

### How was this patch tested?
Tested by CI.

Signed-off-by: mengwei805 <mengwei25@huawei.com>
This commit is contained in:
@@ -97,13 +97,16 @@ jobs:
|
|||||||
- name: Run vllm-project/vllm-ascend long term test
|
- name: Run vllm-project/vllm-ascend long term test
|
||||||
run: |
|
run: |
|
||||||
if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
|
if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
|
||||||
# spec decode test
|
# v0 spec decode test
|
||||||
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
|
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode_v0/e2e/test_mtp_correctness.py # it needs a clean process
|
||||||
|
pytest -sv tests/e2e/long_term/spec_decode_v0 --ignore=tests/e2e/long_term/spec_decode_v0/e2e/test_mtp_correctness.py
|
||||||
|
# v1 spec decode test
|
||||||
|
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode_v1/test_v1_mtp_correctness.py
|
||||||
# TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
|
# TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
|
||||||
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py
|
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode_v1/test_v1_spec_decode.py
|
||||||
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process
|
# accuracy test single card
|
||||||
pytest -sv tests/e2e/long_term/spec_decode --ignore=tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
|
|
||||||
pytest -sv tests/e2e/long_term/test_accuracy.py
|
pytest -sv tests/e2e/long_term/test_accuracy.py
|
||||||
else
|
else
|
||||||
|
# accuracy test multi card
|
||||||
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py
|
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py
|
||||||
fi
|
fi
|
||||||
|
|||||||
344
tests/e2e/long_term/spec_decode_v0/e2e/test_eagle_correctness.py
Normal file
344
tests/e2e/long_term/spec_decode_v0/e2e/test_eagle_correctness.py
Normal file
@@ -0,0 +1,344 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||||
|
# This file is a part of the vllm-ascend project.
|
||||||
|
# Adapted from vllm-project/vllm/tests/spec_decode/e2e/test_eagle_correctness.py
|
||||||
|
# Copyright 2023 The vLLM team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
"""This docstring details important information on the testing methodology.
|
||||||
|
|
||||||
|
Most of the tests rely on "greedy equality", where we expect the output of
|
||||||
|
speculative decoding on a sequence to exactly match the output of normal non-
|
||||||
|
speculative decoding.
|
||||||
|
|
||||||
|
Since speculative decoding with rejection sampling guarantees that the output
|
||||||
|
distribution matches the target model's output distribution (up to hardware
|
||||||
|
numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
|
||||||
|
equality.
|
||||||
|
|
||||||
|
However, we still need to verify that the scenarios below pass:
|
||||||
|
* Batch size 1 greedy equality
|
||||||
|
* Batch size >1 greedy equality
|
||||||
|
* Test greedy equality under preemption
|
||||||
|
* Test greedy equality under various number of speculative tokens.
|
||||||
|
|
||||||
|
With those tests, we can at least say that EAGLE does not break the
|
||||||
|
correctness of the target model outputs.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from tests.e2e.long_term.spec_decode_v0.e2e.conftest import \
|
||||||
|
run_equality_correctness_test
|
||||||
|
|
||||||
|
# main model
|
||||||
|
MAIN_MODEL = "JackFram/llama-68m"
|
||||||
|
|
||||||
|
# speculative model
|
||||||
|
SPEC_MODEL = "abhigoyal/vllm-eagle-llama-68m-random"
|
||||||
|
|
||||||
|
# max. number of speculative tokens: this corresponds to
|
||||||
|
# num_heads in the config.json of the speculator model.
|
||||||
|
MAX_SPEC_TOKENS = 4
|
||||||
|
|
||||||
|
# precision
|
||||||
|
# TODO: vLLM uses float32 here, but some ops in vllm-ascend (such as
|
||||||
|
# RoPE) do not support float32. Once that is fixed, it is
|
||||||
|
# recommended to change this to float32.
|
||||||
|
PRECISION = "float16"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"common_llm_kwargs",
|
||||||
|
[{
|
||||||
|
# Skip cuda graph recording for fast test.
|
||||||
|
"enforce_eager": True,
|
||||||
|
|
||||||
|
# Print spec metrics.
|
||||||
|
"disable_log_stats": False,
|
||||||
|
|
||||||
|
# Precision
|
||||||
|
"dtype": PRECISION,
|
||||||
|
|
||||||
|
# Main model
|
||||||
|
"model_name": MAIN_MODEL,
|
||||||
|
}])
|
||||||
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
|
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||||
|
{
|
||||||
|
"speculative_config": {
|
||||||
|
"model": SPEC_MODEL,
|
||||||
|
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
])
|
||||||
|
@pytest.mark.parametrize("output_len", [
|
||||||
|
128,
|
||||||
|
])
|
||||||
|
@pytest.mark.parametrize("batch_size", [1, 32])
|
||||||
|
@pytest.mark.parametrize("seed", [1])
|
||||||
|
def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
|
batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
|
|
||||||
|
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
|
batch_size, output_len, seed)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"common_llm_kwargs",
|
||||||
|
[{
|
||||||
|
# Skip cuda graph recording for fast test.
|
||||||
|
"enforce_eager": True,
|
||||||
|
|
||||||
|
# Print spec metrics.
|
||||||
|
"disable_log_stats": False,
|
||||||
|
|
||||||
|
# Precision
|
||||||
|
"dtype": PRECISION,
|
||||||
|
|
||||||
|
# Main model
|
||||||
|
"model_name": MAIN_MODEL,
|
||||||
|
}])
|
||||||
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
|
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||||
|
"speculative_config": {
|
||||||
|
"model": SPEC_MODEL,
|
||||||
|
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||||
|
"disable_logprobs": False,
|
||||||
|
},
|
||||||
|
}, {
|
||||||
|
"speculative_config": {
|
||||||
|
"model": SPEC_MODEL,
|
||||||
|
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||||
|
"disable_logprobs": True,
|
||||||
|
},
|
||||||
|
}])
|
||||||
|
@pytest.mark.parametrize("output_len", [
|
||||||
|
128,
|
||||||
|
])
|
||||||
|
@pytest.mark.parametrize("batch_size", [8])
|
||||||
|
@pytest.mark.parametrize("seed", [1])
|
||||||
|
@pytest.mark.parametrize("logprobs", [1, 6])
|
||||||
|
def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
|
batch_size: int, output_len: int, seed: int,
|
||||||
|
logprobs: int):
|
||||||
|
|
||||||
|
run_equality_correctness_test(
|
||||||
|
vllm_runner,
|
||||||
|
common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
output_len,
|
||||||
|
seed,
|
||||||
|
logprobs=logprobs,
|
||||||
|
prompt_logprobs=logprobs,
|
||||||
|
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||||
|
["disable_logprobs"])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(True, reason="Open it when graph mode ready.")
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"common_llm_kwargs",
|
||||||
|
[{
|
||||||
|
"enforce_eager": False,
|
||||||
|
|
||||||
|
# Print spec metrics.
|
||||||
|
"disable_log_stats": False,
|
||||||
|
|
||||||
|
# Precision
|
||||||
|
"dtype": PRECISION,
|
||||||
|
|
||||||
|
# Main model
|
||||||
|
"model_name": MAIN_MODEL,
|
||||||
|
}])
|
||||||
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
|
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||||
|
{
|
||||||
|
"speculative_config": {
|
||||||
|
"model": SPEC_MODEL,
|
||||||
|
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
])
|
||||||
|
@pytest.mark.parametrize("output_len", [
|
||||||
|
128,
|
||||||
|
])
|
||||||
|
@pytest.mark.parametrize("batch_size", [1, 32])
|
||||||
|
@pytest.mark.parametrize("seed", [1])
|
||||||
|
def test_eagle_e2e_greedy_correctness_cuda_graph(
|
||||||
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
|
"""Verify greedy equality with cuda graph enabled and different
|
||||||
|
batch sizes."""
|
||||||
|
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
|
batch_size, output_len, seed)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(True, reason="Open it when preempt ready.")
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"common_llm_kwargs",
|
||||||
|
[{
|
||||||
|
"block_size": 8,
|
||||||
|
# 2 for small prompt, 256//8 for generated.
|
||||||
|
"num_gpu_blocks_override": 2 + 256 // 8,
|
||||||
|
"max_model_len": (2 + 256 // 8) * 8,
|
||||||
|
|
||||||
|
# Skip cuda graph recording for fast test.
|
||||||
|
"enforce_eager": True,
|
||||||
|
|
||||||
|
# Precision
|
||||||
|
"dtype": PRECISION,
|
||||||
|
|
||||||
|
# Main model
|
||||||
|
"model_name": MAIN_MODEL,
|
||||||
|
}])
|
||||||
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
|
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||||
|
{
|
||||||
|
"speculative_config": {
|
||||||
|
"model": SPEC_MODEL,
|
||||||
|
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
])
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"output_len",
|
||||||
|
[
|
||||||
|
# Use small output len for fast test.
|
||||||
|
128,
|
||||||
|
])
|
||||||
|
@pytest.mark.parametrize("batch_size", [4])
|
||||||
|
@pytest.mark.parametrize("seed", [1])
|
||||||
|
def test_eagle_e2e_greedy_correctness_with_preemption(
|
||||||
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
|
"""Verify greedy equality, even when some sequences are preempted mid-
|
||||||
|
generation.
|
||||||
|
"""
|
||||||
|
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
|
batch_size, output_len, seed)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"common_llm_kwargs",
|
||||||
|
[{
|
||||||
|
# Skip cuda graph recording for fast test.
|
||||||
|
"enforce_eager": True,
|
||||||
|
|
||||||
|
# Precision
|
||||||
|
"dtype": PRECISION,
|
||||||
|
|
||||||
|
# Main model
|
||||||
|
"model_name": MAIN_MODEL,
|
||||||
|
}])
|
||||||
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"test_llm_kwargs",
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"speculative_config": {
|
||||||
|
"model": SPEC_MODEL,
|
||||||
|
"num_speculative_tokens": k,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
# Try a range of num. speculative tokens
|
||||||
|
for k in range(1, 1 + MAX_SPEC_TOKENS)
|
||||||
|
])
|
||||||
|
@pytest.mark.parametrize("batch_size", [2])
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"output_len",
|
||||||
|
[
|
||||||
|
# Use smaller output len for fast test.
|
||||||
|
32,
|
||||||
|
])
|
||||||
|
@pytest.mark.parametrize("seed", [1])
|
||||||
|
def test_eagle_different_k(vllm_runner, common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
|
"""Verify that eagle speculative decoding produces exact equality
|
||||||
|
to without spec decode with different values of num_speculative_tokens.
|
||||||
|
"""
|
||||||
|
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
|
batch_size, output_len, seed)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"common_llm_kwargs",
|
||||||
|
[{
|
||||||
|
# Skip cuda graph recording for fast test.
|
||||||
|
"enforce_eager": True,
|
||||||
|
|
||||||
|
# Precision
|
||||||
|
"dtype": PRECISION,
|
||||||
|
|
||||||
|
# Main model
|
||||||
|
"model_name": MAIN_MODEL,
|
||||||
|
}])
|
||||||
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
|
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||||
|
"speculative_config": {
|
||||||
|
"model": SPEC_MODEL,
|
||||||
|
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||||
|
"disable_by_batch_size": 4,
|
||||||
|
},
|
||||||
|
}])
|
||||||
|
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"output_len",
|
||||||
|
[
|
||||||
|
# Use smaller output len for fast test.
|
||||||
|
32,
|
||||||
|
])
|
||||||
|
@pytest.mark.parametrize("seed", [1])
|
||||||
|
def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
|
"""Verify that eagle speculative decoding produces exact equality
|
||||||
|
to without spec decode when speculation is disabled for large
|
||||||
|
batch sizes.
|
||||||
|
"""
|
||||||
|
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
|
batch_size, output_len, seed)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import pytest
|
||||||
|
pytest.main([__file__])
|
||||||
@@ -41,9 +41,10 @@ import os
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from tests.e2e.long_term.spec_decode.e2e.conftest import \
|
from tests.e2e.long_term.spec_decode_v0.e2e.conftest import \
|
||||||
run_equality_correctness_test
|
run_equality_correctness_test
|
||||||
from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill
|
from tests.e2e.long_term.spec_decode_v0.utils import \
|
||||||
|
maybe_enable_chunked_prefill
|
||||||
|
|
||||||
# main model
|
# main model
|
||||||
# lmsys/vicuna-7b-v1.3 was to be used but it's causing
|
# lmsys/vicuna-7b-v1.3 was to be used but it's causing
|
||||||
@@ -41,9 +41,10 @@ import pytest
|
|||||||
from vllm.model_executor.layers.vocab_parallel_embedding import \
|
from vllm.model_executor.layers.vocab_parallel_embedding import \
|
||||||
pad_vocab_size # noqa: F401
|
pad_vocab_size # noqa: F401
|
||||||
|
|
||||||
from tests.e2e.long_term.spec_decode.e2e.conftest import \
|
from tests.e2e.long_term.spec_decode_v0.e2e.conftest import \
|
||||||
run_equality_correctness_test
|
run_equality_correctness_test
|
||||||
from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill
|
from tests.e2e.long_term.spec_decode_v0.utils import \
|
||||||
|
maybe_enable_chunked_prefill
|
||||||
|
|
||||||
# main model
|
# main model
|
||||||
MAIN_MODEL = "JackFram/llama-160m"
|
MAIN_MODEL = "JackFram/llama-160m"
|
||||||
@@ -44,9 +44,10 @@ for the target model outputs.
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from tests.e2e.long_term.spec_decode.e2e.conftest import \
|
from tests.e2e.long_term.spec_decode_v0.e2e.conftest import \
|
||||||
run_equality_correctness_test
|
run_equality_correctness_test
|
||||||
from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill
|
from tests.e2e.long_term.spec_decode_v0.utils import \
|
||||||
|
maybe_enable_chunked_prefill
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -27,8 +27,9 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker
|
|||||||
from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker
|
from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker
|
||||||
from vllm.spec_decode.top1_proposer import Top1Proposer
|
from vllm.spec_decode.top1_proposer import Top1Proposer
|
||||||
|
|
||||||
from tests.e2e.long_term.spec_decode.test_utils import mock_spec_decode_sampler
|
from tests.e2e.long_term.spec_decode_v0.test_utils import \
|
||||||
from tests.e2e.long_term.spec_decode.utils import create_batch, mock_worker
|
mock_spec_decode_sampler
|
||||||
|
from tests.e2e.long_term.spec_decode_v0.utils import create_batch, mock_worker
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('queue_size', [4])
|
@pytest.mark.parametrize('queue_size', [4])
|
||||||
@@ -29,7 +29,7 @@ from vllm.sequence import (ExecuteModelRequest, HiddenStates, Logprob,
|
|||||||
from vllm.spec_decode.multi_step_worker import MultiStepWorker
|
from vllm.spec_decode.multi_step_worker import MultiStepWorker
|
||||||
from vllm.spec_decode.top1_proposer import Top1Proposer
|
from vllm.spec_decode.top1_proposer import Top1Proposer
|
||||||
|
|
||||||
from tests.e2e.long_term.spec_decode.utils import (
|
from tests.e2e.long_term.spec_decode_v0.utils import (
|
||||||
assert_logprobs_dict_allclose, create_batch,
|
assert_logprobs_dict_allclose, create_batch,
|
||||||
create_seq_group_metadata_from_prompts, create_worker,
|
create_seq_group_metadata_from_prompts, create_worker,
|
||||||
patch_execute_model_with_seeds, zero_kv_cache)
|
patch_execute_model_with_seeds, zero_kv_cache)
|
||||||
@@ -22,7 +22,7 @@ from vllm.sequence import ExecuteModelRequest
|
|||||||
from vllm.spec_decode.ngram_worker import NGramWorker
|
from vllm.spec_decode.ngram_worker import NGramWorker
|
||||||
from vllm.spec_decode.top1_proposer import Top1Proposer
|
from vllm.spec_decode.top1_proposer import Top1Proposer
|
||||||
|
|
||||||
from tests.e2e.long_term.spec_decode.utils import (
|
from tests.e2e.long_term.spec_decode_v0.utils import (
|
||||||
create_seq_group_metadata_from_prompts, create_worker)
|
create_seq_group_metadata_from_prompts, create_worker)
|
||||||
|
|
||||||
|
|
||||||
@@ -35,10 +35,10 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker
|
|||||||
from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker,
|
from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker,
|
||||||
split_num_cache_blocks_evenly)
|
split_num_cache_blocks_evenly)
|
||||||
|
|
||||||
from tests.e2e.long_term.spec_decode.test_utils import mock_spec_decode_sampler
|
from tests.e2e.long_term.spec_decode_v0.test_utils import \
|
||||||
from tests.e2e.long_term.spec_decode.utils import (create_batch,
|
mock_spec_decode_sampler
|
||||||
create_sampler_output_list,
|
from tests.e2e.long_term.spec_decode_v0.utils import (
|
||||||
create_worker, mock_worker)
|
create_batch, create_sampler_output_list, create_worker, mock_worker)
|
||||||
from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner
|
from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner
|
||||||
from vllm_ascend.worker.worker import NPUWorker
|
from vllm_ascend.worker.worker import NPUWorker
|
||||||
|
|
||||||
@@ -100,18 +100,6 @@
|
|||||||
# Future Plan:
|
# Future Plan:
|
||||||
# Revert it when the related pr is merged in vllm and vllm-ascend.
|
# Revert it when the related pr is merged in vllm and vllm-ascend.
|
||||||
#
|
#
|
||||||
# 2. `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_include_gpu_probs_tensor` and
|
|
||||||
# `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_should_modify_greedy_probs_inplace`
|
|
||||||
# Why:
|
|
||||||
# vLLM `Remove Sampler from Model Code` so vllm-ascend needs adapt to this change.
|
|
||||||
# How:
|
|
||||||
# Use vLLM 0.8.4 method to patch it.
|
|
||||||
# Related PR (if no, explain why):
|
|
||||||
# - https://github.com/vllm-project/vllm/pull/15195
|
|
||||||
# - https://github.com/vllm-project/vllm-ascend/pull/395
|
|
||||||
# Future Plan:
|
|
||||||
# Remove it when we identify the reasons clearly.
|
|
||||||
#
|
|
||||||
# ** File: worker/patch_common/patch_spec_decode_worker.py **
|
# ** File: worker/patch_common/patch_spec_decode_worker.py **
|
||||||
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
# 1. `vllm.spec_decode.spec_decode_worker.SpecDecodeWorker.create_worker`
|
# 1. `vllm.spec_decode.spec_decode_worker.SpecDecodeWorker.create_worker`
|
||||||
|
|||||||
@@ -88,20 +88,4 @@ def sampler_output(
|
|||||||
return filtered_model_outputs, True
|
return filtered_model_outputs, True
|
||||||
|
|
||||||
|
|
||||||
def set_include_gpu_probs_tensor(self) -> None:
|
|
||||||
# Need include_gpu_probs_tensor for MultiSteoWorker
|
|
||||||
if hasattr(self.model_runner.model, "sampler"):
|
|
||||||
self.model_runner.model.sampler.include_gpu_probs_tensor = True
|
|
||||||
self.model_runner.sampler.include_gpu_probs_tensor = True
|
|
||||||
|
|
||||||
|
|
||||||
def set_should_modify_greedy_probs_inplace(self) -> None:
|
|
||||||
if hasattr(self.model_runner.model, "sampler"):
|
|
||||||
self.model_runner.model.sampler.should_modify_greedy_probs_inplace = (
|
|
||||||
True)
|
|
||||||
self.model_runner.sampler.should_modify_greedy_probs_inplace = True
|
|
||||||
|
|
||||||
|
|
||||||
MultiStepWorker.sampler_output = torch.inference_mode()(sampler_output)
|
MultiStepWorker.sampler_output = torch.inference_mode()(sampler_output)
|
||||||
MultiStepWorker.set_include_gpu_probs_tensor = set_include_gpu_probs_tensor
|
|
||||||
MultiStepWorker.set_should_modify_greedy_probs_inplace = set_should_modify_greedy_probs_inplace
|
|
||||||
|
|||||||
@@ -57,11 +57,6 @@ def create_worker(
|
|||||||
ngram_prompt_lookup_min = (
|
ngram_prompt_lookup_min = (
|
||||||
draft_worker_kwargs.pop("ngram_prompt_lookup_min"))
|
draft_worker_kwargs.pop("ngram_prompt_lookup_min"))
|
||||||
|
|
||||||
# TODO(Yizhou): A quick fix, must be refactored ASAP
|
|
||||||
draft_worker_kwargs["vllm_config"].parallel_config.expert_parallel_size = 1
|
|
||||||
draft_worker_kwargs[
|
|
||||||
"vllm_config"].parallel_config.expert_tensor_parallel_size = 1
|
|
||||||
|
|
||||||
draft_model_config = draft_worker_kwargs["vllm_config"].model_config
|
draft_model_config = draft_worker_kwargs["vllm_config"].model_config
|
||||||
draft_parallel_config: ParallelConfig = draft_worker_kwargs[
|
draft_parallel_config: ParallelConfig = draft_worker_kwargs[
|
||||||
'vllm_config'].parallel_config
|
'vllm_config'].parallel_config
|
||||||
@@ -72,6 +67,13 @@ def create_worker(
|
|||||||
proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min,
|
proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min,
|
||||||
ngram_prompt_lookup_max)
|
ngram_prompt_lookup_max)
|
||||||
else:
|
else:
|
||||||
|
# TODO(Yizhou): A quick fix, must be refactored ASAP
|
||||||
|
# ngram does not need this fix.
|
||||||
|
draft_worker_kwargs[
|
||||||
|
"vllm_config"].parallel_config.expert_parallel_size = 1
|
||||||
|
draft_worker_kwargs[
|
||||||
|
"vllm_config"].parallel_config.expert_tensor_parallel_size = 1
|
||||||
|
|
||||||
draft_tp = draft_parallel_config.tensor_parallel_size
|
draft_tp = draft_parallel_config.tensor_parallel_size
|
||||||
target_tp = scorer_worker.parallel_config.tensor_parallel_size
|
target_tp = scorer_worker.parallel_config.tensor_parallel_size
|
||||||
|
|
||||||
|
|||||||
@@ -51,12 +51,6 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, model_runner: ModelRunnerBase):
|
def __init__(self, model_runner: ModelRunnerBase):
|
||||||
if hasattr(
|
|
||||||
model_runner,
|
|
||||||
"return_hidden_states") and model_runner.return_hidden_states:
|
|
||||||
raise ValueError(
|
|
||||||
"return_hidden_states is not supported for TP1DraftModelRunner."
|
|
||||||
)
|
|
||||||
super().__init__(model_runner)
|
super().__init__(model_runner)
|
||||||
|
|
||||||
self.indices_of_seq_with_bonus_tokens = None
|
self.indices_of_seq_with_bonus_tokens = None
|
||||||
@@ -211,6 +205,9 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase):
|
|||||||
if self.prompt_adapter_config is not None:
|
if self.prompt_adapter_config is not None:
|
||||||
raise ValueError("TP1DraftModelRunner has no support for "
|
raise ValueError("TP1DraftModelRunner has no support for "
|
||||||
"prompt_adapter_config")
|
"prompt_adapter_config")
|
||||||
|
if model_input.inputs_embeds is not None:
|
||||||
|
raise ValueError("TP1DraftModelRunner has no support for "
|
||||||
|
"inputs_embeds")
|
||||||
if model_input.multi_modal_kwargs:
|
if model_input.multi_modal_kwargs:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"TP1DraftModelRunner has no support for multi_modal_kwargs"
|
"TP1DraftModelRunner has no support for multi_modal_kwargs"
|
||||||
@@ -272,6 +269,7 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase):
|
|||||||
|
|
||||||
hidden_states = model_executable(
|
hidden_states = model_executable(
|
||||||
input_ids=model_input.input_tokens,
|
input_ids=model_input.input_tokens,
|
||||||
|
inputs_embeds=None,
|
||||||
positions=model_input.input_positions,
|
positions=model_input.input_positions,
|
||||||
intermediate_tensors=intermediate_tensors,
|
intermediate_tensors=intermediate_tensors,
|
||||||
**MultiModalKwargs.as_kwargs(multi_modal_kwargs,
|
**MultiModalKwargs.as_kwargs(multi_modal_kwargs,
|
||||||
@@ -293,6 +291,9 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase):
|
|||||||
)
|
)
|
||||||
outputs.append(output)
|
outputs.append(output)
|
||||||
|
|
||||||
|
if self.return_hidden_states and is_fallback:
|
||||||
|
output.hidden_states = hidden_states
|
||||||
|
|
||||||
if model_input.attn_metadata.num_prefills == 0 \
|
if model_input.attn_metadata.num_prefills == 0 \
|
||||||
and self.indices_of_seq_with_bonus_tokens is not None:
|
and self.indices_of_seq_with_bonus_tokens is not None:
|
||||||
assert output.sampled_token_ids is not None
|
assert output.sampled_token_ids is not None
|
||||||
|
|||||||
Reference in New Issue
Block a user