[CI/UT] fix spec ut in vllm-ascend main and vllm main (#759)

### What this PR does / why we need it?
#### 1. fix spec ut in vllm-ascend main and vllm main
As https://github.com/vllm-project/vllm-ascend/pull/694 and
https://github.com/vllm-project/vllm-ascend/pull/749 verified, the spec UT
currently passes with vllm-ascend main and vllm 0.8.5, but CI fails with
vllm-ascend main and vllm main.

I found the cause is a triton bug
(https://github.com/triton-lang/triton/issues/2266), but I didn't figure
out why the bug did not affect vllm-ascend main with vllm 0.8.5;
perhaps the usage of triton changed between vllm 0.8.5 and the latest main.

As the bug report describes, I changed the minimum block_size in the UT from
8 to 16, and the modification was verified locally to be effective.

#### 2. Modify how some cases are skipped.
I changed some commented-out cases to use the `skipif` form, which is more
standardized.

### Does this PR introduce _any_ user-facing change?
None

### How was this patch tested?
CI

Signed-off-by: mengwei805 <mengwei25@huawei.com>
This commit is contained in:
wemaster
2025-05-10 09:45:56 +08:00
committed by GitHub
parent 58d2f85c4a
commit 19c8e134e4
5 changed files with 265 additions and 273 deletions

View File

@@ -153,8 +153,7 @@ jobs:
- "vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py" - "vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py"
- name: Run vllm-project/vllm-ascend Speculative Decode test - name: Run vllm-project/vllm-ascend Speculative Decode test
# speculative decode seems will cause oom issue, only disable it now on ci test with vLLM main if: steps.filter_spec_decode.outputs.speculative_tests_changed == 'true' || github.event_name == 'schedule'
if: matrix.vllm_verison == 'v0.8.5.post1' && steps.filter_spec_decode.outputs.speculative_tests_changed == 'true' || github.event_name == 'schedule'
run: | run: |
if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
pytest -sv tests/singlecard/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process pytest -sv tests/singlecard/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process

View File

@@ -186,110 +186,109 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
["disable_logprobs"]) ["disable_logprobs"])
# TODO: Open it when vllm-ascend support graph mode and @pytest.mark.skipif(True, reason="Open it when graph mode ready.")
# @pytest.mark.parametrize( @pytest.mark.parametrize(
# "common_llm_kwargs", "common_llm_kwargs",
# [{ [{
# "enforce_eager": False, "enforce_eager": False,
# # Print spec metrics. # Print spec metrics.
# "disable_log_stats": False, "disable_log_stats": False,
# # Precision # Precision
# "dtype": PRECISION, "dtype": PRECISION,
# # Main model # Main model
# "model_name": MAIN_MODEL, "model_name": MAIN_MODEL,
# }]) }])
# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
# @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
# @pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
# { {
# "speculative_config": { "speculative_config": {
# "model": SPEC_MODEL, "model": SPEC_MODEL,
# "num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
# }, },
# }, },
# ]) ])
# @pytest.mark.parametrize("output_len", [ @pytest.mark.parametrize("output_len", [
# 128, 128,
# ]) ])
# @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("batch_size", [1, 32])
# @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE) @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
# def test_medusa_e2e_greedy_correctness_cuda_graph( def test_medusa_e2e_greedy_correctness_cuda_graph(
# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
# seed: int, prefill_chunk_size: int): seed: int, prefill_chunk_size: int):
# """Verify greedy equality with cuda graph enabled and different """Verify greedy equality with cuda graph enabled and different
# batch sizes.""" batch sizes."""
# maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
# run_equality_correctness_test(vllm_runner, run_equality_correctness_test(vllm_runner,
# common_llm_kwargs, common_llm_kwargs,
# per_test_common_llm_kwargs, per_test_common_llm_kwargs,
# baseline_llm_kwargs, baseline_llm_kwargs,
# test_llm_kwargs, test_llm_kwargs,
# batch_size, batch_size,
# max_output_len=output_len, max_output_len=output_len,
# seed=seed, seed=seed,
# temperature=0.0) temperature=0.0)
# TODO: There is a problem with the preemptive scheduling in the current
# version, which makes this case fail. Please release this case after the
# preemptive scheduling problem is solved.
# @pytest.mark.parametrize(
# "common_llm_kwargs",
# [{
# "block_size": 8,
# # 2 for small prompt, 256//8 for generated.
# "num_gpu_blocks_override": 2 + 256 // 8,
# "max_model_len": (2 + 256 // 8) * 8,
# # Skip cuda graph recording for fast test. @pytest.mark.skipif(True, reason="Open it when preempt ready.")
# "enforce_eager": True, @pytest.mark.parametrize(
"common_llm_kwargs",
[{
"block_size": 16,
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override": 2 + 256 // 8,
"max_model_len": (2 + 256 // 8) * 8,
# # Precision # Skip cuda graph recording for fast test.
# "dtype": PRECISION, "enforce_eager": True,
# # Main model # Precision
# "model_name": MAIN_MODEL, "dtype": PRECISION,
# }])
# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) # Main model
# @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) "model_name": MAIN_MODEL,
# @pytest.mark.parametrize("test_llm_kwargs", [ }])
# { @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
# "speculative_config": { @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
# "model": SPEC_MODEL, @pytest.mark.parametrize("test_llm_kwargs", [
# "num_speculative_tokens": MAX_SPEC_TOKENS, {
# }, "speculative_config": {
# }, "model": SPEC_MODEL,
# ]) "num_speculative_tokens": MAX_SPEC_TOKENS,
# @pytest.mark.parametrize( },
# "output_len", },
# [ ])
# # Use small output len for fast test. @pytest.mark.parametrize(
# 128, "output_len",
# ]) [
# @pytest.mark.parametrize("batch_size", [4]) # Use small output len for fast test.
# @pytest.mark.parametrize("seed", [1]) 128,
# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE) ])
# def test_medusa_e2e_greedy_correctness_with_preemption( @pytest.mark.parametrize("batch_size", [4])
# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @pytest.mark.parametrize("seed", [1])
# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
# seed: int, prefill_chunk_size: int): def test_medusa_e2e_greedy_correctness_with_preemption(
# """Verify greedy equality, even when some sequences are preempted mid- vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
# generation. baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
# """ seed: int, prefill_chunk_size: int):
# maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) """Verify greedy equality, even when some sequences are preempted mid-
# run_equality_correctness_test(vllm_runner, generation.
# common_llm_kwargs, """
# per_test_common_llm_kwargs, maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
# baseline_llm_kwargs, run_equality_correctness_test(vllm_runner,
# test_llm_kwargs, common_llm_kwargs,
# batch_size, per_test_common_llm_kwargs,
# max_output_len=output_len, baseline_llm_kwargs,
# seed=seed, test_llm_kwargs,
# temperature=0.0) batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(

View File

@@ -294,124 +294,120 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
disable_seed=True) disable_seed=True)
# TODO: There is a problem with the preemptive scheduling in the current @pytest.mark.skipif(True, reason="Open it when preempt ready.")
# version, which makes this case fail. Please release this case after the @pytest.mark.parametrize(
# preemptive scheduling problem is solved. "common_llm_kwargs",
# @pytest.mark.parametrize( [{
# "common_llm_kwargs", "block_size": 16,
# [{ # 2 for small prompt, 256//8 for generated.
# "block_size": 8, "num_gpu_blocks_override": 2 + 256 // 8,
# # 2 for small prompt, 256//8 for generated. "max_model_len": (2 + 256 // 8) * 8,
# "num_gpu_blocks_override": 2 + 256 // 8,
# "max_model_len": (2 + 256 // 8) * 8,
# # Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
# "enforce_eager": True, "enforce_eager": True,
# # Precision # Precision
# "dtype": PRECISION, "dtype": PRECISION,
# # Main model # Main model
# "model_name": MAIN_MODEL, "model_name": MAIN_MODEL,
# }]) }])
# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
# @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
# @pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
# { {
# "speculative_config": { "speculative_config": {
# "model": SPEC_MODEL, "model": SPEC_MODEL,
# }, },
# }, },
# ]) ])
# @pytest.mark.parametrize( @pytest.mark.parametrize(
# "output_len", "output_len",
# [ [
# # Use small output len for fast test. # Use small output len for fast test.
# 128, 128,
# ]) ])
# @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("batch_size", [4])
# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1) @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1)
# @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
# def test_mlp_e2e_greedy_correctness_with_preemption( def test_mlp_e2e_greedy_correctness_with_preemption(
# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
# prefill_chunk_size: int, seed: int): prefill_chunk_size: int, seed: int):
# """Verify greedy equality, even when some sequences are preempted mid- """Verify greedy equality, even when some sequences are preempted mid-
# generation. generation.
# """ """
# maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
# run_equality_correctness_test(vllm_runner, run_equality_correctness_test(vllm_runner,
# common_llm_kwargs, common_llm_kwargs,
# per_test_common_llm_kwargs, per_test_common_llm_kwargs,
# baseline_llm_kwargs, baseline_llm_kwargs,
# test_llm_kwargs, test_llm_kwargs,
# batch_size, batch_size,
# max_output_len=output_len, max_output_len=output_len,
# seed=seed, seed=seed,
# temperature=0.0) temperature=0.0)
# TODO: There is a problem with the preemptive scheduling in the current
# version, which makes this case fail. Please release this case after the
# preemptive scheduling problem is solved.
# @pytest.mark.parametrize(
# "common_llm_kwargs",
# [{
# "block_size": 8,
# # 2 for small prompt, 256//8 for generated.
# "num_gpu_blocks_override": 2 + 256 // 8,
# "max_model_len": (2 + 256 // 8) * 8,
# # Skip cuda graph recording for fast test. @pytest.mark.skipif(True, reason="Open it when preempt ready.")
# "enforce_eager": True, @pytest.mark.parametrize(
"common_llm_kwargs",
[{
"block_size": 16,
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override": 2 + 256 // 8,
"max_model_len": (2 + 256 // 8) * 8,
# # Precision # Skip cuda graph recording for fast test.
# "dtype": PRECISION, "enforce_eager": True,
# # Main model # Precision
# "model_name": MAIN_MODEL, "dtype": PRECISION,
# }])
# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
# @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
# @pytest.mark.parametrize("test_llm_kwargs", [
# {
# "speculative_config": {
# "model": SPEC_MODEL,
# },
# },
# ])
# @pytest.mark.parametrize(
# "output_len",
# [
# # Use small output len for fast test.
# 128,
# ])
# @pytest.mark.parametrize("batch_size", [4])
# @pytest.mark.parametrize("seed", [1])
# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1)
# def test_mlp_e2e_greedy_correctness_with_padding(
# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
# prefill_chunk_size: int, seed: int):
# """Verify greedy equality when the vocab dimension is padded
# """
# maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
# # Default pad_to is 64, test model has vocab_size of 32000 # Main model
# def patched_pad_vocab_size(vocab_size, pad_to=None): "model_name": MAIN_MODEL,
# return pad_vocab_size(vocab_size, pad_to=32064) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_config": {
"model": SPEC_MODEL,
},
},
])
@pytest.mark.parametrize(
"output_len",
[
# Use small output len for fast test.
128,
])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1)
def test_mlp_e2e_greedy_correctness_with_padding(
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
prefill_chunk_size: int, seed: int):
"""Verify greedy equality when the vocab dimension is padded
"""
maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
# # NOTE: Compared with vLLM, the patch method has been modified # Default pad_to is 64, test model has vocab_size of 32000
# from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size def patched_pad_vocab_size(vocab_size, pad_to=None):
# pad_vocab_size = patched_pad_vocab_size return pad_vocab_size(vocab_size, pad_to=32064)
# run_equality_correctness_test(vllm_runner,
# common_llm_kwargs, # NOTE: Compared with vLLM, the patch method has been modified
# per_test_common_llm_kwargs, pad_vocab_size = patched_pad_vocab_size # noqa: F811
# baseline_llm_kwargs, run_equality_correctness_test(vllm_runner,
# test_llm_kwargs, common_llm_kwargs,
# batch_size, per_test_common_llm_kwargs,
# max_output_len=output_len, baseline_llm_kwargs,
# seed=seed, test_llm_kwargs,
# temperature=0.0) batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(

View File

@@ -307,7 +307,7 @@ def test_mtp_e2e_quant_greedy_correctness_torchair_graph(
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"block_size": 8, "block_size": 16,
# 2 for small prompt, 256//8 for generated. # 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override": 2 + 256 // 8, "num_gpu_blocks_override": 2 + 256 // 8,
"max_model_len": (2 + 256 // 8) * 8, "max_model_len": (2 + 256 // 8) * 8,

View File

@@ -173,71 +173,69 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
["disable_logprobs"]) ["disable_logprobs"])
# TODO: There is a problem with the preemptive scheduling in the current @pytest.mark.skipif(True, reason="Open it when preempt ready.")
# version, which makes this case fail. Please release this case after the @pytest.mark.parametrize(
# preemptive scheduling problem is solved. "common_llm_kwargs",
# @pytest.mark.parametrize( [{
# "common_llm_kwargs", "block_size": 16,
# [{ # 2 for small prompt, 256//8 for generated.
# "block_size": 8, "num_gpu_blocks_override": 2 + 256 // 8,
# # 2 for small prompt, 256//8 for generated. "max_model_len": (2 + 256 // 8) * 8,
# "num_gpu_blocks_override": 2 + 256 // 8,
# "max_model_len": (2 + 256 // 8) * 8,
# # Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
# "enforce_eager": True, "enforce_eager": True,
# }]) }])
# @pytest.mark.parametrize("per_test_common_llm_kwargs", [ @pytest.mark.parametrize("per_test_common_llm_kwargs", [
# { {
# "model_name": "JackFram/llama-160m", "model_name": "JackFram/llama-160m",
# }, },
# ]) ])
# @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
# @pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
# { {
# "speculative_config": { "speculative_config": {
# "method": "ngram", "method": "ngram",
# "num_speculative_tokens": 5, "num_speculative_tokens": 5,
# "prompt_lookup_max": 3, "prompt_lookup_max": 3,
# }, },
# "enable_chunked_prefill": False, "enable_chunked_prefill": False,
# }, },
# { {
# "speculative_config": { "speculative_config": {
# "method": "ngram", "method": "ngram",
# "num_speculative_tokens": 5, "num_speculative_tokens": 5,
# "prompt_lookup_max": 3, "prompt_lookup_max": 3,
# "disable_mqa_scorer": True, "disable_mqa_scorer": True,
# }, },
# "enable_chunked_prefill": True, "enable_chunked_prefill": True,
# "max_num_batched_tokens": 4, "max_num_batched_tokens": 4,
# "max_num_seqs": 4 "max_num_seqs": 4
# }, },
# ]) ])
# @pytest.mark.parametrize( @pytest.mark.parametrize(
# "output_len", "output_len",
# [ [
# # Use small output len for fast test. # Use small output len for fast test.
# 256, 256,
# ]) ])
# @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("batch_size", [4])
# @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
# def test_ngram_e2e_greedy_correctness_with_preemption( def test_ngram_e2e_greedy_correctness_with_preemption(
# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
# seed: int): seed: int):
# """Verify greedy equality, even when some sequences are preempted mid- """Verify greedy equality, even when some sequences are preempted mid-
# generation. generation.
# """ """
# run_equality_correctness_test(vllm_runner, run_equality_correctness_test(vllm_runner,
# common_llm_kwargs, common_llm_kwargs,
# per_test_common_llm_kwargs, per_test_common_llm_kwargs,
# baseline_llm_kwargs, baseline_llm_kwargs,
# test_llm_kwargs, test_llm_kwargs,
# batch_size, batch_size,
# max_output_len=output_len, max_output_len=output_len,
# temperature=0, temperature=0,
# seed=seed) seed=seed)
@pytest.mark.parametrize( @pytest.mark.parametrize(