[CI/UT] fix spec ut in vllm-ascend main and vllm main (#759)

### What this PR does / why we need it? #### 1. fix spec ut in vllm-ascend main and vllm main As https://github.com/vllm-project/vllm-ascend/pull/694 and https://github.com/vllm-project/vllm-ascend/pull/749 verify, Now, vllm-ascend main and vllm 0.8.5, spec UT is happy, but vllm-ascend main and vllm main, CI is fail. I found the reason is a triton bug https://github.com/triton-lang/triton/issues/2266, but i I didn't figure it out that why the bug did not effect vllm-ascend main and vllm 0.8.5, maybe the usage of triton have changed when vllm 0.8.5 to latest main As the bug describe, I changed the minimum block_size in UT from 8 to 16, and the modification is verified locally to be effective. #### 2. modify some case skip form. I modified some commented out cases to skipif form, which is more standardized. ### Does this PR introduce _any_ user-facing change? None ### How was this patch tested? CI Signed-off-by: mengwei805 <mengwei25@huawei.com>
2025-05-10 09:45:56 +08:00
parent 58d2f85c4a
commit 19c8e134e4
5 changed files with 265 additions and 273 deletions
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -153,8 +153,7 @@ jobs:
              - "vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py"

      - name: Run vllm-project/vllm-ascend Speculative Decode test
-        # speculative decode seems will cause oom issue, only disable it now on ci test with vLLM main
-        if: matrix.vllm_verison == 'v0.8.5.post1' && steps.filter_spec_decode.outputs.speculative_tests_changed == 'true' || github.event_name == 'schedule'
+        if: steps.filter_spec_decode.outputs.speculative_tests_changed == 'true' || github.event_name == 'schedule'
        run: |
          if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
            pytest -sv tests/singlecard/spec_decode/e2e/test_mtp_correctness.py  # it needs a clean process
--- a/tests/singlecard/spec_decode/e2e/test_medusa_correctness.py
+++ b/tests/singlecard/spec_decode/e2e/test_medusa_correctness.py
@@ -186,110 +186,109 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
        ["disable_logprobs"])


-# TODO: Open it when vllm-ascend support graph mode and
-# @pytest.mark.parametrize(
-#     "common_llm_kwargs",
-#     [{
-#         "enforce_eager": False,
+@pytest.mark.skipif(True, reason="Open it when graph mode ready.")
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "enforce_eager": False,

-#         # Print spec metrics.
-#         "disable_log_stats": False,
+        # Print spec metrics.
+        "disable_log_stats": False,

-#         # Precision
-#         "dtype": PRECISION,
+        # Precision
+        "dtype": PRECISION,

-#         # Main model
-#         "model_name": MAIN_MODEL,
-#     }])
-# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-# @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-# @pytest.mark.parametrize("test_llm_kwargs", [
-#     {
-#         "speculative_config": {
-#             "model": SPEC_MODEL,
-#             "num_speculative_tokens": MAX_SPEC_TOKENS,
-#         },
-#     },
-# ])
-# @pytest.mark.parametrize("output_len", [
-#     128,
-# ])
-# @pytest.mark.parametrize("batch_size", [1, 32])
-# @pytest.mark.parametrize("seed", [1])
-# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
-# def test_medusa_e2e_greedy_correctness_cuda_graph(
-#         vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-#         baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-#         seed: int, prefill_chunk_size: int):
-#     """Verify greedy equality with cuda graph enabled and different
-#     batch sizes."""
-#     maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-#     run_equality_correctness_test(vllm_runner,
-#                                   common_llm_kwargs,
-#                                   per_test_common_llm_kwargs,
-#                                   baseline_llm_kwargs,
-#                                   test_llm_kwargs,
-#                                   batch_size,
-#                                   max_output_len=output_len,
-#                                   seed=seed,
-#                                   temperature=0.0)
+        # Main model
+        "model_name": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_config": {
+            "model": SPEC_MODEL,
+            "num_speculative_tokens": MAX_SPEC_TOKENS,
+        },
+    },
+])
+@pytest.mark.parametrize("output_len", [
+    128,
+])
+@pytest.mark.parametrize("batch_size", [1, 32])
+@pytest.mark.parametrize("seed", [1])
+@pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
+def test_medusa_e2e_greedy_correctness_cuda_graph(
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int, prefill_chunk_size: int):
+    """Verify greedy equality with cuda graph enabled and different
+    batch sizes."""
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)

-# TODO: There is a problem with the preemptive scheduling in the current
-# version, which makes this case fail. Please release this case after the
-# preemptive scheduling problem is solved.
-# @pytest.mark.parametrize(
-#     "common_llm_kwargs",
-#     [{
-#         "block_size": 8,
-#         # 2 for small prompt, 256//8 for generated.
-#         "num_gpu_blocks_override": 2 + 256 // 8,
-#         "max_model_len": (2 + 256 // 8) * 8,

-#         # Skip cuda graph recording for fast test.
-#         "enforce_eager": True,
+@pytest.mark.skipif(True, reason="Open it when preempt ready.")
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "block_size": 16,
+        # 2 for small prompt, 256//8 for generated.
+        "num_gpu_blocks_override": 2 + 256 // 8,
+        "max_model_len": (2 + 256 // 8) * 8,

-#         # Precision
-#         "dtype": PRECISION,
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,

-#         # Main model
-#         "model_name": MAIN_MODEL,
-#     }])
-# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-# @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-# @pytest.mark.parametrize("test_llm_kwargs", [
-#     {
-#         "speculative_config": {
-#             "model": SPEC_MODEL,
-#             "num_speculative_tokens": MAX_SPEC_TOKENS,
-#         },
-#     },
-# ])
-# @pytest.mark.parametrize(
-#     "output_len",
-#     [
-#         # Use small output len for fast test.
-#         128,
-#     ])
-# @pytest.mark.parametrize("batch_size", [4])
-# @pytest.mark.parametrize("seed", [1])
-# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
-# def test_medusa_e2e_greedy_correctness_with_preemption(
-#         vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-#         baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-#         seed: int, prefill_chunk_size: int):
-#     """Verify greedy equality, even when some sequences are preempted mid-
-#     generation.
-#     """
-#     maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-#     run_equality_correctness_test(vllm_runner,
-#                                   common_llm_kwargs,
-#                                   per_test_common_llm_kwargs,
-#                                   baseline_llm_kwargs,
-#                                   test_llm_kwargs,
-#                                   batch_size,
-#                                   max_output_len=output_len,
-#                                   seed=seed,
-#                                   temperature=0.0)
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model_name": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_config": {
+            "model": SPEC_MODEL,
+            "num_speculative_tokens": MAX_SPEC_TOKENS,
+        },
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use small output len for fast test.
+        128,
+    ])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+@pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
+def test_medusa_e2e_greedy_correctness_with_preemption(
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int, prefill_chunk_size: int):
+    """Verify greedy equality, even when some sequences are preempted mid-
+    generation.
+    """
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)


@pytest.mark.parametrize(
--- a/tests/singlecard/spec_decode/e2e/test_mlp_correctness.py
+++ b/tests/singlecard/spec_decode/e2e/test_mlp_correctness.py
@@ -294,124 +294,120 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
                                      disable_seed=True)


-# TODO: There is a problem with the preemptive scheduling in the current
-# version, which makes this case fail. Please release this case after the
-# preemptive scheduling problem is solved.
-# @pytest.mark.parametrize(
-#     "common_llm_kwargs",
-#     [{
-#         "block_size": 8,
-#         # 2 for small prompt, 256//8 for generated.
-#         "num_gpu_blocks_override": 2 + 256 // 8,
-#         "max_model_len": (2 + 256 // 8) * 8,
+@pytest.mark.skipif(True, reason="Open it when preempt ready.")
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "block_size": 16,
+        # 2 for small prompt, 256//8 for generated.
+        "num_gpu_blocks_override": 2 + 256 // 8,
+        "max_model_len": (2 + 256 // 8) * 8,

-#         # Skip cuda graph recording for fast test.
-#         "enforce_eager": True,
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,

-#         # Precision
-#         "dtype": PRECISION,
+        # Precision
+        "dtype": PRECISION,

-#         # Main model
-#         "model_name": MAIN_MODEL,
-#     }])
-# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-# @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-# @pytest.mark.parametrize("test_llm_kwargs", [
-#     {
-#         "speculative_config": {
-#             "model": SPEC_MODEL,
-#         },
-#     },
-# ])
-# @pytest.mark.parametrize(
-#     "output_len",
-#     [
-#         # Use small output len for fast test.
-#         128,
-#     ])
-# @pytest.mark.parametrize("batch_size", [4])
-# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1)
-# @pytest.mark.parametrize("seed", [1])
-# def test_mlp_e2e_greedy_correctness_with_preemption(
-#         vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-#         baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-#         prefill_chunk_size: int, seed: int):
-#     """Verify greedy equality, even when some sequences are preempted mid-
-#     generation.
-#     """
-#     maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-#     run_equality_correctness_test(vllm_runner,
-#                                   common_llm_kwargs,
-#                                   per_test_common_llm_kwargs,
-#                                   baseline_llm_kwargs,
-#                                   test_llm_kwargs,
-#                                   batch_size,
-#                                   max_output_len=output_len,
-#                                   seed=seed,
-#                                   temperature=0.0)
+        # Main model
+        "model_name": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_config": {
+            "model": SPEC_MODEL,
+        },
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use small output len for fast test.
+        128,
+    ])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1)
+@pytest.mark.parametrize("seed", [1])
+def test_mlp_e2e_greedy_correctness_with_preemption(
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        prefill_chunk_size: int, seed: int):
+    """Verify greedy equality, even when some sequences are preempted mid-
+    generation.
+    """
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)

-# TODO: There is a problem with the preemptive scheduling in the current
-# version, which makes this case fail. Please release this case after the
-# preemptive scheduling problem is solved.
-# @pytest.mark.parametrize(
-#     "common_llm_kwargs",
-#     [{
-#         "block_size": 8,
-#         # 2 for small prompt, 256//8 for generated.
-#         "num_gpu_blocks_override": 2 + 256 // 8,
-#         "max_model_len": (2 + 256 // 8) * 8,

-#         # Skip cuda graph recording for fast test.
-#         "enforce_eager": True,
+@pytest.mark.skipif(True, reason="Open it when preempt ready.")
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "block_size": 16,
+        # 2 for small prompt, 256//8 for generated.
+        "num_gpu_blocks_override": 2 + 256 // 8,
+        "max_model_len": (2 + 256 // 8) * 8,

-#         # Precision
-#         "dtype": PRECISION,
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,

-#         # Main model
-#         "model_name": MAIN_MODEL,
-#     }])
-# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-# @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-# @pytest.mark.parametrize("test_llm_kwargs", [
-#     {
-#         "speculative_config": {
-#             "model": SPEC_MODEL,
-#         },
-#     },
-# ])
-# @pytest.mark.parametrize(
-#     "output_len",
-#     [
-#         # Use small output len for fast test.
-#         128,
-#     ])
-# @pytest.mark.parametrize("batch_size", [4])
-# @pytest.mark.parametrize("seed", [1])
-# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1)
-# def test_mlp_e2e_greedy_correctness_with_padding(
-#         vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-#         baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-#         prefill_chunk_size: int, seed: int):
-#     """Verify greedy equality when the vocab dimension is padded
-#     """
-#     maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
+        # Precision
+        "dtype": PRECISION,

-#     # Default pad_to is 64, test model has vocab_size of 32000
-#     def patched_pad_vocab_size(vocab_size, pad_to=None):
-#         return pad_vocab_size(vocab_size, pad_to=32064)
+        # Main model
+        "model_name": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_config": {
+            "model": SPEC_MODEL,
+        },
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use small output len for fast test.
+        128,
+    ])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+@pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1)
+def test_mlp_e2e_greedy_correctness_with_padding(
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        prefill_chunk_size: int, seed: int):
+    """Verify greedy equality when the vocab dimension is padded
+    """
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)

-#     # NOTE: Compared with vLLM, the patch method has been modified
-#     from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size
-#     pad_vocab_size = patched_pad_vocab_size
-#     run_equality_correctness_test(vllm_runner,
-#                                   common_llm_kwargs,
-#                                   per_test_common_llm_kwargs,
-#                                   baseline_llm_kwargs,
-#                                   test_llm_kwargs,
-#                                   batch_size,
-#                                   max_output_len=output_len,
-#                                   seed=seed,
-#                                   temperature=0.0)
+    # Default pad_to is 64, test model has vocab_size of 32000
+    def patched_pad_vocab_size(vocab_size, pad_to=None):
+        return pad_vocab_size(vocab_size, pad_to=32064)
+
+    # NOTE: Compared with vLLM, the patch method has been modified
+    pad_vocab_size = patched_pad_vocab_size  # noqa: F811
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)


@pytest.mark.parametrize(
--- a/tests/singlecard/spec_decode/e2e/test_mtp_correctness.py
+++ b/tests/singlecard/spec_decode/e2e/test_mtp_correctness.py
@@ -307,7 +307,7 @@ def test_mtp_e2e_quant_greedy_correctness_torchair_graph(
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
-        "block_size": 8,
+        "block_size": 16,
        # 2 for small prompt, 256//8 for generated.
        "num_gpu_blocks_override": 2 + 256 // 8,
        "max_model_len": (2 + 256 // 8) * 8,
--- a/tests/singlecard/spec_decode/e2e/test_ngram_correctness.py
+++ b/tests/singlecard/spec_decode/e2e/test_ngram_correctness.py
@@ -173,71 +173,69 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
        ["disable_logprobs"])


-# TODO: There is a problem with the preemptive scheduling in the current
-# version, which makes this case fail. Please release this case after the
-# preemptive scheduling problem is solved.
-# @pytest.mark.parametrize(
-#     "common_llm_kwargs",
-#     [{
-#         "block_size": 8,
-#         # 2 for small prompt, 256//8 for generated.
-#         "num_gpu_blocks_override": 2 + 256 // 8,
-#         "max_model_len": (2 + 256 // 8) * 8,
+@pytest.mark.skipif(True, reason="Open it when preempt ready.")
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "block_size": 16,
+        # 2 for small prompt, 256//8 for generated.
+        "num_gpu_blocks_override": 2 + 256 // 8,
+        "max_model_len": (2 + 256 // 8) * 8,

-#         # Skip cuda graph recording for fast test.
-#         "enforce_eager": True,
-#     }])
-# @pytest.mark.parametrize("per_test_common_llm_kwargs", [
-#     {
-#         "model_name": "JackFram/llama-160m",
-#     },
-# ])
-# @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-# @pytest.mark.parametrize("test_llm_kwargs", [
-#     {
-#         "speculative_config": {
-#             "method": "ngram",
-#             "num_speculative_tokens": 5,
-#             "prompt_lookup_max": 3,
-#         },
-#         "enable_chunked_prefill": False,
-#     },
-#     {
-#         "speculative_config": {
-#             "method": "ngram",
-#             "num_speculative_tokens": 5,
-#             "prompt_lookup_max": 3,
-#             "disable_mqa_scorer": True,
-#         },
-#         "enable_chunked_prefill": True,
-#         "max_num_batched_tokens": 4,
-#         "max_num_seqs": 4
-#     },
-# ])
-# @pytest.mark.parametrize(
-#     "output_len",
-#     [
-#         # Use small output len for fast test.
-#         256,
-#     ])
-# @pytest.mark.parametrize("batch_size", [4])
-# @pytest.mark.parametrize("seed", [1])
-# def test_ngram_e2e_greedy_correctness_with_preemption(
-#         vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-#         baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-#         seed: int):
-#     """Verify greedy equality, even when some sequences are preempted mid-
-#     generation.
-#     """
-#     run_equality_correctness_test(vllm_runner,
-#                                   common_llm_kwargs,
-#                                   per_test_common_llm_kwargs,
-#                                   baseline_llm_kwargs,
-#                                   test_llm_kwargs,
-#                                   batch_size,
-#                                   max_output_len=output_len,
-#                                   temperature=0,
-#                                   seed=seed)
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [
+    {
+        "model_name": "JackFram/llama-160m",
+    },
+])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_config": {
+            "method": "ngram",
+            "num_speculative_tokens": 5,
+            "prompt_lookup_max": 3,
+        },
+        "enable_chunked_prefill": False,
+    },
+    {
+        "speculative_config": {
+            "method": "ngram",
+            "num_speculative_tokens": 5,
+            "prompt_lookup_max": 3,
+            "disable_mqa_scorer": True,
+        },
+        "enable_chunked_prefill": True,
+        "max_num_batched_tokens": 4,
+        "max_num_seqs": 4
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use small output len for fast test.
+        256,
+    ])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+def test_ngram_e2e_greedy_correctness_with_preemption(
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
+    """Verify greedy equality, even when some sequences are preempted mid-
+    generation.
+    """
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  temperature=0,
+                                  seed=seed)


@pytest.mark.parametrize(