From 339d6894f649b92f57675583ec235f10dd858152 Mon Sep 17 00:00:00 2001 From: wemaster <54620334+mengwei805@users.noreply.github.com> Date: Mon, 23 Jun 2025 09:05:13 +0800 Subject: [PATCH] [CI/UT][bugfix] fix v0 spec decode (#1321) ### What this PR does / why we need it? 1. [PR913](https://github.com/vllm-project/vllm-ascend/pull/913) introduced an error that caused V0's spec decode function to fail. [PR1109](https://github.com/vllm-project/vllm-ascend/pull/1109) wanted to fix this problem. Unfortunately, the fix broke the ngram function. I fixed the ngram function in this PR. **PS**: Q: Why was the ngram regression not caught when PR1109 was merged? A: The newly introduced problem only appears when tp>1, and the use cases on CI are all tp=1. 2. In versions after 0.7.3, vllm-ascend deleted some spec decode UTs to avoid CI taking too long, including the eagle speculative UTs, which left CI unable to cover the eagle function. I added it (`test_eagle_correctness.py`) back in this PR. 3. Because of the reason mentioned in 2, the current version of Eagle has a problem. I located and fixed this problem. It was because vllm's `draft_model_runner.py` was changed and vllm-ascend was not synchronized in time. 4. Currently, the UTs of v0 and v1 are mixed in the spec_decode directory. I split them into two directories: spec_decode_v0 and spec_decode_v1. 5. I found `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_include_gpu_probs_tensor` and `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_should_modify_greedy_probs_inplace` have changed in vllm, so I removed them in this PR. ### Does this PR introduce _any_ user-facing change? This PR fixes the functions of ngram and eagle spec decode in the v0 engine. ### How was this patch tested? 
tested by CI Signed-off-by: mengwei805 --- .../workflows/vllm_ascend_test_long_term.yaml | 13 +- .../__init__.py | 0 .../conftest.py | 0 .../e2e/__init__.py | 0 .../e2e/conftest.py | 0 .../e2e/test_eagle_correctness.py | 344 ++++++++++++++++++ .../e2e/test_medusa_correctness.py | 5 +- .../e2e/test_mlp_correctness.py | 5 +- .../e2e/test_mtp_correctness.py | 0 .../e2e/test_ngram_correctness.py | 5 +- .../test_dynamic_spec_decode.py | 5 +- .../test_multi_step_worker.py | 2 +- .../test_ngram_worker.py | 2 +- .../test_spec_decode_worker.py | 8 +- .../test_utils.py | 0 .../{spec_decode => spec_decode_v0}/utils.py | 0 .../test_v1_mtp_correctness.py | 0 .../test_v1_spec_decode.py | 0 vllm_ascend/patch/__init__.py | 12 - .../patch_common/patch_multi_step_worker.py | 16 - .../patch_common/patch_spec_decode_worker.py | 12 +- vllm_ascend/worker/draft_model_runner.py | 13 +- 22 files changed, 384 insertions(+), 58 deletions(-) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/__init__.py (100%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/conftest.py (100%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/e2e/__init__.py (100%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/e2e/conftest.py (100%) create mode 100644 tests/e2e/long_term/spec_decode_v0/e2e/test_eagle_correctness.py rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/e2e/test_medusa_correctness.py (99%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/e2e/test_mlp_correctness.py (99%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/e2e/test_mtp_correctness.py (100%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/e2e/test_ngram_correctness.py (98%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/test_dynamic_spec_decode.py (96%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/test_multi_step_worker.py (99%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/test_ngram_worker.py (99%) rename 
tests/e2e/long_term/{spec_decode => spec_decode_v0}/test_spec_decode_worker.py (99%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/test_utils.py (100%) rename tests/e2e/long_term/{spec_decode => spec_decode_v0}/utils.py (100%) rename tests/e2e/long_term/{spec_decode/e2e => spec_decode_v1}/test_v1_mtp_correctness.py (100%) rename tests/e2e/long_term/{spec_decode/e2e => spec_decode_v1}/test_v1_spec_decode.py (100%) diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml index b413896..dc26ed9 100644 --- a/.github/workflows/vllm_ascend_test_long_term.yaml +++ b/.github/workflows/vllm_ascend_test_long_term.yaml @@ -97,13 +97,16 @@ jobs: - name: Run vllm-project/vllm-ascend long term test run: | if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then - # spec decode test - VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py + # v0 spec decode test + VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode_v0/e2e/test_mtp_correctness.py # it needs a clean process + pytest -sv tests/e2e/long_term/spec_decode_v0 --ignore=tests/e2e/long_term/spec_decode_v0/e2e/test_mtp_correctness.py + # v1 spec decode test + VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode_v1/test_v1_mtp_correctness.py # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed - VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py - VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process - pytest -sv tests/e2e/long_term/spec_decode --ignore=tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py + VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode_v1/test_v1_spec_decode.py + 
# accuracy test single card pytest -sv tests/e2e/long_term/test_accuracy.py else + # accuracy test multi card VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py fi diff --git a/tests/e2e/long_term/spec_decode/__init__.py b/tests/e2e/long_term/spec_decode_v0/__init__.py similarity index 100% rename from tests/e2e/long_term/spec_decode/__init__.py rename to tests/e2e/long_term/spec_decode_v0/__init__.py diff --git a/tests/e2e/long_term/spec_decode/conftest.py b/tests/e2e/long_term/spec_decode_v0/conftest.py similarity index 100% rename from tests/e2e/long_term/spec_decode/conftest.py rename to tests/e2e/long_term/spec_decode_v0/conftest.py diff --git a/tests/e2e/long_term/spec_decode/e2e/__init__.py b/tests/e2e/long_term/spec_decode_v0/e2e/__init__.py similarity index 100% rename from tests/e2e/long_term/spec_decode/e2e/__init__.py rename to tests/e2e/long_term/spec_decode_v0/e2e/__init__.py diff --git a/tests/e2e/long_term/spec_decode/e2e/conftest.py b/tests/e2e/long_term/spec_decode_v0/e2e/conftest.py similarity index 100% rename from tests/e2e/long_term/spec_decode/e2e/conftest.py rename to tests/e2e/long_term/spec_decode_v0/e2e/conftest.py diff --git a/tests/e2e/long_term/spec_decode_v0/e2e/test_eagle_correctness.py b/tests/e2e/long_term/spec_decode_v0/e2e/test_eagle_correctness.py new file mode 100644 index 0000000..b44dc3c --- /dev/null +++ b/tests/e2e/long_term/spec_decode_v0/e2e/test_eagle_correctness.py @@ -0,0 +1,344 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/tests/spec_decode/e2e/test_eagle_correctness.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""This docstring details important information on the testing methodology. + +Most of the tests rely on "greedy equality", where we expect the output of +speculative decoding on a sequence to exactly match the output of normal non- +speculative decoding. + +Since speculative decoding with rejection sampling guarantees that the output +distribution matches the target model's output distribution (up to hardware +numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy +equality. + +However, we still need to verify below scenario could be passed: + * Batch size 1 greedy equality + * Batch size >1 greedy equality + * Test greedy equality under preemption + * Test greedy equality under various number of speculative tokens. + +With those tests, we can say at least, EAGLE would not break the +correctness for the target model outputs. +""" + +import pytest + +from tests.e2e.long_term.spec_decode_v0.e2e.conftest import \ + run_equality_correctness_test + +# main model +MAIN_MODEL = "JackFram/llama-68m" + +# speculative model +SPEC_MODEL = "abhigoyal/vllm-eagle-llama-68m-random" + +# max. number of speculative tokens: this corresponds to +# num_heads in the config.json of the speculator model. +MAX_SPEC_TOKENS = 4 + +# precision +# TODO The vLLM here uses float32, but some op on the vllm-ascend +# do not support float32, such as ROPE, When it is fixed, it is +# recommended to change this to float32. +PRECISION = "float16" + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. 
+ "enforce_eager": True, + + # Print spec metrics. + "disable_log_stats": False, + + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + }, + }, +]) +@pytest.mark.parametrize("output_len", [ + 128, +]) +@pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("seed", [1]) +def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size: int, output_len: int, + seed: int): + + run_equality_correctness_test(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size, output_len, seed) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Print spec metrics. 
+ "disable_log_stats": False, + + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [{ + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + "disable_logprobs": False, + }, +}, { + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + "disable_logprobs": True, + }, +}]) +@pytest.mark.parametrize("output_len", [ + 128, +]) +@pytest.mark.parametrize("batch_size", [8]) +@pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("logprobs", [1, 6]) +def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size: int, output_len: int, seed: int, + logprobs: int): + + run_equality_correctness_test( + vllm_runner, + common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, + test_llm_kwargs, + batch_size, + output_len, + seed, + logprobs=logprobs, + prompt_logprobs=logprobs, + disable_logprobs=test_llm_kwargs["speculative_config"] + ["disable_logprobs"]) + + +@pytest.mark.skipif(True, reason="Open it when graph mode ready.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "enforce_eager": False, + + # Print spec metrics. 
+ "disable_log_stats": False, + + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + }, + }, +]) +@pytest.mark.parametrize("output_len", [ + 128, +]) +@pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("seed", [1]) +def test_eagle_e2e_greedy_correctness_cuda_graph( + vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, + seed: int): + """Verify greedy equality with cuda graph enabled and different + batch sizes.""" + run_equality_correctness_test(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size, output_len, seed) + + +@pytest.mark.skipif(True, reason="Open it when preempt ready.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "block_size": 8, + # 2 for small prompt, 256//8 for generated. + "num_gpu_blocks_override": 2 + 256 // 8, + "max_model_len": (2 + 256 // 8) * 8, + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + }, + }, +]) +@pytest.mark.parametrize( + "output_len", + [ + # Use small output len for fast test. 
+ 128, + ]) +@pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("seed", [1]) +def test_eagle_e2e_greedy_correctness_with_preemption( + vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, + seed: int): + """Verify greedy equality, even when some sequences are preempted mid- + generation. + """ + run_equality_correctness_test(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size, output_len, seed) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + { + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": k, + }, + } + # Try a range of num. speculative tokens + for k in range(1, 1 + MAX_SPEC_TOKENS) + ]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_eagle_different_k(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, baseline_llm_kwargs, + test_llm_kwargs, batch_size: int, output_len: int, + seed: int): + """Verify that eagle speculative decoding produces exact equality + to without spec decode with different values of num_speculative_tokens. + """ + run_equality_correctness_test(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size, output_len, seed) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. 
+ "enforce_eager": True, + + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [{ + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + "disable_by_batch_size": 4, + }, +}]) +@pytest.mark.parametrize("batch_size", [1, 5]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_eagle_disable_queue(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, baseline_llm_kwargs, + test_llm_kwargs, batch_size: int, output_len: int, + seed: int): + """Verify that eagle speculative decoding produces exact equality + to without spec decode when speculation is disabled for large + batch sizes. + """ + run_equality_correctness_test(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size, output_len, seed) + + +if __name__ == "__main__": + import pytest + pytest.main([__file__]) \ No newline at end of file diff --git a/tests/e2e/long_term/spec_decode/e2e/test_medusa_correctness.py b/tests/e2e/long_term/spec_decode_v0/e2e/test_medusa_correctness.py similarity index 99% rename from tests/e2e/long_term/spec_decode/e2e/test_medusa_correctness.py rename to tests/e2e/long_term/spec_decode_v0/e2e/test_medusa_correctness.py index e0c2efd..26398e2 100644 --- a/tests/e2e/long_term/spec_decode/e2e/test_medusa_correctness.py +++ b/tests/e2e/long_term/spec_decode_v0/e2e/test_medusa_correctness.py @@ -41,9 +41,10 @@ import os import pytest -from tests.e2e.long_term.spec_decode.e2e.conftest import \ +from tests.e2e.long_term.spec_decode_v0.e2e.conftest import \ run_equality_correctness_test -from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill +from 
tests.e2e.long_term.spec_decode_v0.utils import \ + maybe_enable_chunked_prefill # main model # lmsys/vicuna-7b-v1.3 was to be used but it's causing diff --git a/tests/e2e/long_term/spec_decode/e2e/test_mlp_correctness.py b/tests/e2e/long_term/spec_decode_v0/e2e/test_mlp_correctness.py similarity index 99% rename from tests/e2e/long_term/spec_decode/e2e/test_mlp_correctness.py rename to tests/e2e/long_term/spec_decode_v0/e2e/test_mlp_correctness.py index 56db617..37003e4 100644 --- a/tests/e2e/long_term/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/e2e/long_term/spec_decode_v0/e2e/test_mlp_correctness.py @@ -41,9 +41,10 @@ import pytest from vllm.model_executor.layers.vocab_parallel_embedding import \ pad_vocab_size # noqa: F401 -from tests.e2e.long_term.spec_decode.e2e.conftest import \ +from tests.e2e.long_term.spec_decode_v0.e2e.conftest import \ run_equality_correctness_test -from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill +from tests.e2e.long_term.spec_decode_v0.utils import \ + maybe_enable_chunked_prefill # main model MAIN_MODEL = "JackFram/llama-160m" diff --git a/tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py b/tests/e2e/long_term/spec_decode_v0/e2e/test_mtp_correctness.py similarity index 100% rename from tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py rename to tests/e2e/long_term/spec_decode_v0/e2e/test_mtp_correctness.py diff --git a/tests/e2e/long_term/spec_decode/e2e/test_ngram_correctness.py b/tests/e2e/long_term/spec_decode_v0/e2e/test_ngram_correctness.py similarity index 98% rename from tests/e2e/long_term/spec_decode/e2e/test_ngram_correctness.py rename to tests/e2e/long_term/spec_decode_v0/e2e/test_ngram_correctness.py index b99187f..1cc20ab 100644 --- a/tests/e2e/long_term/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/e2e/long_term/spec_decode_v0/e2e/test_ngram_correctness.py @@ -44,9 +44,10 @@ for the target model outputs. 
import pytest -from tests.e2e.long_term.spec_decode.e2e.conftest import \ +from tests.e2e.long_term.spec_decode_v0.e2e.conftest import \ run_equality_correctness_test -from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill +from tests.e2e.long_term.spec_decode_v0.utils import \ + maybe_enable_chunked_prefill @pytest.mark.parametrize( diff --git a/tests/e2e/long_term/spec_decode/test_dynamic_spec_decode.py b/tests/e2e/long_term/spec_decode_v0/test_dynamic_spec_decode.py similarity index 96% rename from tests/e2e/long_term/spec_decode/test_dynamic_spec_decode.py rename to tests/e2e/long_term/spec_decode_v0/test_dynamic_spec_decode.py index 8e9480e..63e4e1d 100644 --- a/tests/e2e/long_term/spec_decode/test_dynamic_spec_decode.py +++ b/tests/e2e/long_term/spec_decode_v0/test_dynamic_spec_decode.py @@ -27,8 +27,9 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from tests.e2e.long_term.spec_decode.test_utils import mock_spec_decode_sampler -from tests.e2e.long_term.spec_decode.utils import create_batch, mock_worker +from tests.e2e.long_term.spec_decode_v0.test_utils import \ + mock_spec_decode_sampler +from tests.e2e.long_term.spec_decode_v0.utils import create_batch, mock_worker @pytest.mark.parametrize('queue_size', [4]) diff --git a/tests/e2e/long_term/spec_decode/test_multi_step_worker.py b/tests/e2e/long_term/spec_decode_v0/test_multi_step_worker.py similarity index 99% rename from tests/e2e/long_term/spec_decode/test_multi_step_worker.py rename to tests/e2e/long_term/spec_decode_v0/test_multi_step_worker.py index b3017a9..1dc50dd 100644 --- a/tests/e2e/long_term/spec_decode/test_multi_step_worker.py +++ b/tests/e2e/long_term/spec_decode_v0/test_multi_step_worker.py @@ -29,7 +29,7 @@ from vllm.sequence import (ExecuteModelRequest, HiddenStates, Logprob, from vllm.spec_decode.multi_step_worker import 
MultiStepWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from tests.e2e.long_term.spec_decode.utils import ( +from tests.e2e.long_term.spec_decode_v0.utils import ( assert_logprobs_dict_allclose, create_batch, create_seq_group_metadata_from_prompts, create_worker, patch_execute_model_with_seeds, zero_kv_cache) diff --git a/tests/e2e/long_term/spec_decode/test_ngram_worker.py b/tests/e2e/long_term/spec_decode_v0/test_ngram_worker.py similarity index 99% rename from tests/e2e/long_term/spec_decode/test_ngram_worker.py rename to tests/e2e/long_term/spec_decode_v0/test_ngram_worker.py index 078a4d2..30177b6 100644 --- a/tests/e2e/long_term/spec_decode/test_ngram_worker.py +++ b/tests/e2e/long_term/spec_decode_v0/test_ngram_worker.py @@ -22,7 +22,7 @@ from vllm.sequence import ExecuteModelRequest from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from tests.e2e.long_term.spec_decode.utils import ( +from tests.e2e.long_term.spec_decode_v0.utils import ( create_seq_group_metadata_from_prompts, create_worker) diff --git a/tests/e2e/long_term/spec_decode/test_spec_decode_worker.py b/tests/e2e/long_term/spec_decode_v0/test_spec_decode_worker.py similarity index 99% rename from tests/e2e/long_term/spec_decode/test_spec_decode_worker.py rename to tests/e2e/long_term/spec_decode_v0/test_spec_decode_worker.py index 94a1bcf..ffcb2f6 100644 --- a/tests/e2e/long_term/spec_decode/test_spec_decode_worker.py +++ b/tests/e2e/long_term/spec_decode_v0/test_spec_decode_worker.py @@ -35,10 +35,10 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, split_num_cache_blocks_evenly) -from tests.e2e.long_term.spec_decode.test_utils import mock_spec_decode_sampler -from tests.e2e.long_term.spec_decode.utils import (create_batch, - create_sampler_output_list, - create_worker, mock_worker) +from tests.e2e.long_term.spec_decode_v0.test_utils 
import \ + mock_spec_decode_sampler +from tests.e2e.long_term.spec_decode_v0.utils import ( + create_batch, create_sampler_output_list, create_worker, mock_worker) from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner from vllm_ascend.worker.worker import NPUWorker diff --git a/tests/e2e/long_term/spec_decode/test_utils.py b/tests/e2e/long_term/spec_decode_v0/test_utils.py similarity index 100% rename from tests/e2e/long_term/spec_decode/test_utils.py rename to tests/e2e/long_term/spec_decode_v0/test_utils.py diff --git a/tests/e2e/long_term/spec_decode/utils.py b/tests/e2e/long_term/spec_decode_v0/utils.py similarity index 100% rename from tests/e2e/long_term/spec_decode/utils.py rename to tests/e2e/long_term/spec_decode_v0/utils.py diff --git a/tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py b/tests/e2e/long_term/spec_decode_v1/test_v1_mtp_correctness.py similarity index 100% rename from tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py rename to tests/e2e/long_term/spec_decode_v1/test_v1_mtp_correctness.py diff --git a/tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py b/tests/e2e/long_term/spec_decode_v1/test_v1_spec_decode.py similarity index 100% rename from tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py rename to tests/e2e/long_term/spec_decode_v1/test_v1_spec_decode.py diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index d817f90..59d6035 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -100,18 +100,6 @@ # Future Plan: # Revert it when the related pr is merged in vllm and vllm-ascend. # -# 2. `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_include_gpu_probs_tensor` and -# `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_should_modify_greedy_probs_inplace` -# Why: -# vLLM `Remove Sampler from Model Code` so vllm-ascend needs adapt to this change. -# How: -# Use vLLM 0.8.4 method to patch it. 
-# Related PR (if no, explain why): -# - https://github.com/vllm-project/vllm/pull/15195 -# - https://github.com/vllm-project/vllm-ascend/pull/395 -# Future Plan: -# Remove it when we identify the reasons clearly. -# # ** File: worker/patch_common/patch_spec_decode_worker.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.spec_decode.spec_decode_worker.SpecDecodeWorker.create_worker` diff --git a/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py b/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py index ca87729..53ce312 100644 --- a/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py +++ b/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py @@ -88,20 +88,4 @@ def sampler_output( return filtered_model_outputs, True -def set_include_gpu_probs_tensor(self) -> None: - # Need include_gpu_probs_tensor for MultiSteoWorker - if hasattr(self.model_runner.model, "sampler"): - self.model_runner.model.sampler.include_gpu_probs_tensor = True - self.model_runner.sampler.include_gpu_probs_tensor = True - - -def set_should_modify_greedy_probs_inplace(self) -> None: - if hasattr(self.model_runner.model, "sampler"): - self.model_runner.model.sampler.should_modify_greedy_probs_inplace = ( - True) - self.model_runner.sampler.should_modify_greedy_probs_inplace = True - - MultiStepWorker.sampler_output = torch.inference_mode()(sampler_output) -MultiStepWorker.set_include_gpu_probs_tensor = set_include_gpu_probs_tensor -MultiStepWorker.set_should_modify_greedy_probs_inplace = set_should_modify_greedy_probs_inplace diff --git a/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py b/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py index 66e7aa5..d271e65 100644 --- a/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py +++ b/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py @@ -57,11 +57,6 @@ def create_worker( ngram_prompt_lookup_min = ( 
draft_worker_kwargs.pop("ngram_prompt_lookup_min")) - # TODO(Yizhou): A quick fix, must be refactored ASAP - draft_worker_kwargs["vllm_config"].parallel_config.expert_parallel_size = 1 - draft_worker_kwargs[ - "vllm_config"].parallel_config.expert_tensor_parallel_size = 1 - draft_model_config = draft_worker_kwargs["vllm_config"].model_config draft_parallel_config: ParallelConfig = draft_worker_kwargs[ 'vllm_config'].parallel_config @@ -72,6 +67,13 @@ def create_worker( proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min, ngram_prompt_lookup_max) else: + # TODO(Yizhou): A quick fix, must be refactored ASAP + # ngram need not this fix. + draft_worker_kwargs[ + "vllm_config"].parallel_config.expert_parallel_size = 1 + draft_worker_kwargs[ + "vllm_config"].parallel_config.expert_tensor_parallel_size = 1 + draft_tp = draft_parallel_config.tensor_parallel_size target_tp = scorer_worker.parallel_config.tensor_parallel_size diff --git a/vllm_ascend/worker/draft_model_runner.py b/vllm_ascend/worker/draft_model_runner.py index 1306b1e..b070da1 100644 --- a/vllm_ascend/worker/draft_model_runner.py +++ b/vllm_ascend/worker/draft_model_runner.py @@ -51,12 +51,6 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase): """ def __init__(self, model_runner: ModelRunnerBase): - if hasattr( - model_runner, - "return_hidden_states") and model_runner.return_hidden_states: - raise ValueError( - "return_hidden_states is not supported for TP1DraftModelRunner." 
- ) super().__init__(model_runner) self.indices_of_seq_with_bonus_tokens = None @@ -211,6 +205,9 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase): if self.prompt_adapter_config is not None: raise ValueError("TP1DraftModelRunner has no support for " "prompt_adapter_config") + if model_input.inputs_embeds is not None: + raise ValueError("TP1DraftModelRunner has no support for " + "inputs_embeds") if model_input.multi_modal_kwargs: raise ValueError( "TP1DraftModelRunner has no support for multi_modal_kwargs" @@ -272,6 +269,7 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase): hidden_states = model_executable( input_ids=model_input.input_tokens, + inputs_embeds=None, positions=model_input.input_positions, intermediate_tensors=intermediate_tensors, **MultiModalKwargs.as_kwargs(multi_modal_kwargs, @@ -293,6 +291,9 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase): ) outputs.append(output) + if self.return_hidden_states and is_fallback: + output.hidden_states = hidden_states + if model_input.attn_metadata.num_prefills == 0 \ and self.indices_of_seq_with_bonus_tokens is not None: assert output.sampled_token_ids is not None