[Misc] Refactor aclgraph accuracy test to use logprob-based comparison (#7455)

### What this PR does / why we need it?

Replace text-match assertions with a two-tier logprob accuracy check:

- Prefill (token 0): assert the token ID is identical between the eager
baseline and compiled mode, then verify the logprob matches within `atol`.
- Decode (tokens 1-2): if the chosen tokens match, compare logprobs
directly; if they differ, look up the baseline token in the compiled
model's top-20 distribution and assert its assigned logprob is within
`decode_atol` (defaults to 2x `atol`). This tolerates minor argmax drift
caused by floating-point differences while still catching distribution
divergence (see the sketch below).
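
A condensed, self-contained sketch of the two-tier check (the full helper, `compare_logprobs`, is added in `tests/e2e/singlecard/utils.py` in the diff below; the stand-in `Logprob` dataclass here only mirrors the `.logprob` attribute of vLLM's per-token logprob entries, and the default `atol=0.0689` matches the helper's default):

```python
from dataclasses import dataclass

@dataclass
class Logprob:
    logprob: float

def check_two_tier(base_ids, base_lps, comp_ids, comp_lps,
                   atol=0.0689, decode_atol=None):
    """base_lps / comp_lps: one dict per token, {token_id: Logprob} (top-20)."""
    decode_atol = 2 * atol if decode_atol is None else decode_atol
    # Tier 1 -- prefill (token 0): both runs see identical input, so the
    # argmax must match exactly and the logprob must agree within atol.
    assert base_ids[0] == comp_ids[0]
    assert abs(base_lps[0][base_ids[0]].logprob
               - comp_lps[0][comp_ids[0]].logprob) <= atol
    # Tier 2 -- decode (tokens 1-2): tolerate small argmax drift.
    for i in (1, 2):
        b, c = base_ids[i], comp_ids[i]
        if b == c:
            # Same token: compare its logprob directly.
            diff = abs(base_lps[i][b].logprob - comp_lps[i][c].logprob)
        else:
            # Different tokens: the baseline token must still appear in the
            # compiled run's top-20, with a logprob close to the baseline's.
            assert b in comp_lps[i]
            diff = abs(base_lps[i][b].logprob - comp_lps[i][b].logprob)
        assert diff <= decode_atol
```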

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.17.0
- vLLM main:
8a680463fa

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
Li Wang
2026-03-23 09:08:21 +08:00
committed by GitHub
parent 9bf9b4b267
commit 75fae619d5
5 changed files with 228 additions and 145 deletions


@@ -5,6 +5,8 @@ e2e-singlecard:
estimated_time: 69
- name: tests/e2e/singlecard/test_auto_fit_max_mode_len.py
estimated_time: 70
- name: tests/e2e/singlecard/test_eager_mode_acc.py
estimated_time: 255
- name: tests/e2e/singlecard/test_aclgraph_accuracy.py
estimated_time: 839
- name: tests/e2e/singlecard/test_aclgraph_batch_invariant.py


@@ -21,97 +21,61 @@ import os
import pytest
from tests.e2e.singlecard.utils import PROMPTS_LONG, PROMPTS_SHORT, LLMTestCase, gen_and_valid
from tests.e2e.conftest import wait_until_npu_memory_free
from tests.e2e.singlecard.utils import PROMPTS_LONG, PROMPTS_SHORT, LLMTestCase, compare_logprobs
# ---------------------------------------------------------------------------
# Test cases: no golden_answers needed; accuracy is verified via logprob
# comparison against an eager-mode baseline. Token 0 covers the prefill
# forward pass; tokens 1-2 cover decode forward passes.
# ---------------------------------------------------------------------------
CASE_QWEN_ACLGRAPH = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=PROMPTS_SHORT,
golden_answers=[
" Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
" the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president",
" Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of",
" not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and",
],
)
CASE_DS_ACLGRAPH = LLMTestCase(
model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
quantization="ascend",
prompts=PROMPTS_SHORT,
golden_answers=[
"\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2",
" a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the",
" Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art",
" here, and its not what you think.\nThe future of AI is here, and its not what you think.\nThe future of",
],
)
CASE_QWEN_FULL = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=PROMPTS_SHORT,
golden_answers=[
" Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
" the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president",
" Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of",
" not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and",
],
)
CASE_DS_FULL = LLMTestCase(
model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
quantization="ascend",
prompts=PROMPTS_SHORT,
golden_answers=[
"\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2",
" a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the",
" Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art",
" here, and its not what you think.\nThe future of AI is here, and its not what you think.\nThe future of",
],
)
CASE_QWEN_FULL_DECODE_ONLY = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=PROMPTS_LONG,
golden_answers=[
" \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the",
" \n\nTo solve this problem, we can use the following approach: Let $P$ be the perimeter of the square. Then, the expected value of the area",
" \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can",
],
)
CASE_DS_FULL_DECODE_ONLY = LLMTestCase(
model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
quantization="ascend",
prompts=PROMPTS_LONG,
golden_answers=[
"\n\nSelect an assignment template",
"\n\nI'm not sure how to approach this problem. I'm thinking that the area of the triangle is $1/2$ times the area",
"\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x = \\alpha$ be the common root",
],
)
CASE_QWEN_EX = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=PROMPTS_LONG,
golden_answers=[
" \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the",
" \n\nTo solve this problem, we can use the following approach: Let $P$ be the perimeter of the square. Then, the expected value of the area",
" \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can",
],
)
CASE_DS_EX = LLMTestCase(
model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
quantization="ascend",
prompts=PROMPTS_LONG,
golden_answers=[
"\n\nSelect an assignment template",
"\n\nI'm not sure how to approach this problem. I'm thinking that the area of the triangle is $1/2$ times the area",
"\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x = \\alpha$ be the common root",
],
)
@wait_until_npu_memory_free(0.7)
@pytest.mark.parametrize("cur_case", [CASE_QWEN_ACLGRAPH, CASE_DS_ACLGRAPH])
def test_piecewise_res_consistency(cur_case: LLMTestCase):
runner_kwargs = {
@@ -120,14 +84,10 @@ def test_piecewise_res_consistency(cur_case: LLMTestCase):
"cudagraph_capture_sizes": [1, 2, 4, 8],
"quantization": cur_case.quantization,
}
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts)
@wait_until_npu_memory_free(0.7)
@pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL, CASE_DS_FULL])
def test_full_res_consistency(cur_case: LLMTestCase, monkeypatch):
monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
@@ -137,14 +97,10 @@ def test_full_res_consistency(cur_case: LLMTestCase, monkeypatch):
"compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"},
"quantization": cur_case.quantization,
}
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts)
@wait_until_npu_memory_free(0.7)
@pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY])
def test_full_decode_only_res_consistency(cur_case: LLMTestCase, monkeypatch):
monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
@@ -155,14 +111,10 @@ def test_full_decode_only_res_consistency(cur_case: LLMTestCase, monkeypatch):
"quantization": cur_case.quantization,
"additional_config": {"ascend_compilation_config": {"enable_npugraph_ex": False}},
}
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts)
@wait_until_npu_memory_free(0.7)
@pytest.mark.parametrize("cur_case", [CASE_QWEN_EX, CASE_DS_EX])
def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch):
monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
@@ -173,17 +125,13 @@ def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch):
"compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"},
"additional_config": {"ascend_compilation_config": {"enable_npugraph_ex": True}},
}
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts)
# The accuracy has already been verified in the previous test case.
# This test case is used to check whether the functionality works properly
# after enabling the static kernel and whether it is uninstalled as expected.
@wait_until_npu_memory_free(0.7)
@pytest.mark.parametrize("cur_case", [CASE_QWEN_EX])
def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch):
monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
@@ -199,14 +147,9 @@ def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch):
}
},
}
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts)
# Check whether the static kernel is properly uninstall
# Check whether the static kernel is properly uninstalled
ascend_home_path = os.environ["ASCEND_HOME_PATH"]
static_kernel_install_path = os.path.join(ascend_home_path, "opp/static_kernel/ai_core")
assert not os.path.exists(static_kernel_install_path)


@@ -0,0 +1,68 @@
#
# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
This file test accuracy via LMEval.
It uses local-completions, which interacts with vLLM
through the OAI API with N concurrent connections.
This simulates real work usage of the API and makes
sure that the zmq frontend mp RPC message passing and
AsyncLLMEngine are working correctly.
"""
import lm_eval
import pytest
MODEL_NAMES = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
NUM_CONCURRENT = 500
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
RTOL = 0.03
EXPECTED_VALUES = {"Qwen/Qwen3-0.6B": 0.414, "vllm-ascend/DeepSeek-V2-Lite-W8A8": 0.34}
def run_test(model_name, more_args=None):
"""Run the end to end accuracy test."""
# NOTE: Do not add any spaces to the string below, as this will cause parameter parsing errors.
model_args = f"pretrained={model_name},max_model_len=4096,enforce_eager=True"
if more_args is not None:
model_args = "{},{}".format(model_args, more_args)
results = lm_eval.simple_evaluate(
model="vllm",
model_args=model_args,
tasks="gsm8k",
batch_size="auto",
)
measured_value = results["results"][TASK][FILTER]
assert model_name in EXPECTED_VALUES, f"Cannot find the expected value for the model {model_name=}"
expected_value = EXPECTED_VALUES[model_name]
assert measured_value - RTOL < expected_value and measured_value + RTOL > expected_value, (
f"Expected: {expected_value} | Measured: {measured_value}"
)
@pytest.mark.parametrize("model", MODEL_NAMES)
def test_lm_eval_accuracy(model):
"""Run with the V1 Engine."""
more_args = None
run_test(model, more_args)


@@ -15,7 +15,8 @@
# limitations under the License.
#
"""
Compare the outputs of vLLM with and without xlite.
Compare the outputs of vLLM with and without xlite via logprob-based accuracy
check (3 tokens: 1 prefill + 2 decode).
Run `pytest tests/e2e/singlecard/test_xlite.py`.
"""
@@ -25,51 +26,19 @@ Run `pytest tests/e2e/singlecard/test_xlite.py`.
import os
import pytest
from vllm import SamplingParams
from tests.e2e.singlecard.utils import PROMPTS_SHORT, LLMTestCase, gen_and_valid
from tests.e2e.singlecard.utils import PROMPTS_SHORT, LLMTestCase, compare_logprobs
os.environ["VLLM_ASCEND_ENABLE_NZ"] = "2"
CASE_DECODE_ONLY = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=PROMPTS_SHORT,
golden_answers=[
"Hello, my name is Lina. I'm a 22-year-old student from China.",
"The president of the United States is the same as the president of the United Nations. This is because the president",
"The capital of France is Paris. The capital of France is also the capital of the French Republic.",
"The future of AI is not just a technological challenge but a profound transformation of how we live, work",
],
sampling_params=SamplingParams(
max_tokens=15,
temperature=0.0,
top_p=1.0,
top_k=0,
n=1,
),
)
CASE_FULL = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=[
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
],
golden_answers=[
" Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
" the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president",
" Paris. The capital of Italy is Rome. The capital of Spain is Madrid. The capital of China is Beijing. The capital of Japan is Tokyo. The capital",
" not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and",
],
sampling_params=SamplingParams(
max_tokens=32,
temperature=0.0,
top_p=1.0,
top_k=0,
n=1,
),
prompts=PROMPTS_SHORT,
)
@@ -82,12 +51,7 @@ def test_models_with_xlite_decode_only(cur_case: LLMTestCase):
"block_size": 128,
"additional_config": {"xlite_graph_config": {"enabled": True}},
}
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts)
@pytest.mark.parametrize("cur_case", [CASE_FULL])
@@ -98,9 +62,4 @@ def test_models_with_xlite_full_mode(cur_case: LLMTestCase):
"block_size": 128,
"additional_config": {"xlite_graph_config": {"enabled": True, "full_mode": True}},
}
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts)


@@ -1,9 +1,8 @@
from dataclasses import dataclass, field
from dataclasses import dataclass
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
PROMPTS_SHORT = [
"Hello, my name is",
@@ -51,31 +50,143 @@ PROMPTS_LONG = [
class LLMTestCase:
model: str
prompts: list[str]
golden_answers: list[str]
golden_answers: list[str] | None = None
quantization: str | None = None
sampling_params: SamplingParams = field(
default_factory=lambda: SamplingParams(
max_tokens=32,
temperature=0.0,
top_p=1.0,
top_k=0,
n=1,
# Keys that are specific to compilation/graph capture and should not be passed
# to the eager baseline runner.
_COMPILATION_KEYS = {"compilation_config", "additional_config", "cudagraph_capture_sizes"}
# Top-K logprobs to fetch per token; used for decode-phase cross-lookup.
_DECODE_TOPK = 20
_LOGPROB_SAMPLING_PARAMS = SamplingParams(
max_tokens=3,
temperature=0.0,
top_p=1.0,
top_k=0,
logprobs=_DECODE_TOPK,
)
def _check_prefill_token(
base_seq,
comp_seq,
prompt_idx: int,
atol: float,
) -> None:
"""Token 0 is produced by the prefill pass; both models see identical input,
so the chosen token *must* be the same and its logprob must match within atol."""
base_token_id = base_seq.token_ids[0]
comp_token_id = comp_seq.token_ids[0]
assert base_token_id == comp_token_id, (
f"Prefill token mismatch at prompt {prompt_idx}: baseline={base_token_id}, compiled={comp_token_id}"
)
base_logprob = base_seq.logprobs[0][base_token_id].logprob
comp_logprob = comp_seq.logprobs[0][comp_token_id].logprob
assert abs(base_logprob - comp_logprob) <= atol, (
f"Prefill logprob mismatch at prompt {prompt_idx}: "
f"baseline={base_logprob:.4f}, compiled={comp_logprob:.4f}, "
f"diff={abs(base_logprob - comp_logprob):.4f} > atol={atol}"
)
def _check_decode_token(
base_seq,
comp_seq,
token_idx: int,
prompt_idx: int,
decode_atol: float,
) -> None:
"""Tokens 1-2 come from decode passes. When the two models pick different
tokens the context has already diverged, so we cannot compare logprobs of
the chosen tokens directly. Instead we do a cross-lookup: find the
baseline's chosen token inside the compiled model's top-K distribution
and assert that the assigned log-probability is close. This
confirms that the compiled model's distribution is numerically consistent
with the baseline's even when the argmax differs by a tiny margin.
"""
base_token_id = base_seq.token_ids[token_idx]
comp_token_id = comp_seq.token_ids[token_idx]
base_topk = base_seq.logprobs[token_idx] # dict[token_id, Logprob]
comp_topk = comp_seq.logprobs[token_idx]
if base_token_id == comp_token_id:
# Happy path: same token, direct logprob comparison.
diff = abs(base_topk[base_token_id].logprob - comp_topk[comp_token_id].logprob)
assert diff <= decode_atol, (
f"Decode logprob mismatch at prompt {prompt_idx}, token {token_idx}: "
f"baseline={base_topk[base_token_id].logprob:.4f}, "
f"compiled={comp_topk[comp_token_id].logprob:.4f}, "
f"diff={diff:.4f} > decode_atol={decode_atol}"
)
return
# Tokens differ: cross-lookup in the compiled model's top-K distribution.
base_logprob = base_topk[base_token_id].logprob
comp_logprob = comp_topk[comp_token_id].logprob
# Check: what log-probability did compiled assign to baseline's token?
assert base_token_id in comp_topk, (
f"Decode token mismatch at prompt {prompt_idx}, token {token_idx}: "
f"baseline chose token {base_token_id} (logprob={base_logprob:.4f}) but "
f"compiled chose token {comp_token_id} (logprob={comp_logprob:.4f}) and "
f"baseline's token does not appear in compiled's top-{_DECODE_TOPK} distribution"
)
comp_logprob_of_base_token = comp_topk[base_token_id].logprob
diff = abs(base_logprob - comp_logprob_of_base_token)
assert diff <= decode_atol, (
f"Decode distribution mismatch at prompt {prompt_idx}, token {token_idx}: "
f"baseline chose token {base_token_id} with logprob={base_logprob:.4f}; "
f"compiled assigned logprob={comp_logprob_of_base_token:.4f} to that token, "
f"diff={diff:.4f} > decode_atol={decode_atol} "
f"(compiled chose token {comp_token_id} with logprob={comp_logprob:.4f})"
)
def gen_and_valid(runner_kwargs: dict, prompts: list[str], sampling_params: SamplingParams, golden_answers: list[str]):
def compare_logprobs(
runner_kwargs: dict,
prompts: list[str],
atol: float = 0.0689,
decode_atol: float | None = None,
) -> None:
"""Run the model in eager baseline mode and in the configured compilation
mode, generate 3 tokens per prompt, then verify numerical accuracy:
* Token 0 (prefill pass): chosen token must be identical; logprob must
match within *atol*.
* Tokens 1-2 (decode passes): if chosen tokens match, logprob must be
within *decode_atol*; if they differ, the baseline token must appear in
the compiled model's top-K distribution with a logprob within
*decode_atol* of the baseline value.
*decode_atol* defaults to ``2 * atol`` when not supplied.
"""
if decode_atol is None:
decode_atol = 2 * atol
baseline_kwargs = {k: v for k, v in runner_kwargs.items() if k not in _COMPILATION_KEYS}
baseline_kwargs["enforce_eager"] = True
with VllmRunner(**baseline_kwargs) as runner:
baseline_outputs = runner.model.generate(prompts=prompts, sampling_params=_LOGPROB_SAMPLING_PARAMS)
with VllmRunner(**runner_kwargs) as runner:
vllm_aclgraph_outputs = runner.model.generate(prompts=prompts, sampling_params=sampling_params)
outputs_gen = []
for output in vllm_aclgraph_outputs:
outputs_gen.append(([output.outputs[0].index], output.outputs[0].text))
compiled_outputs = runner.model.generate(prompts=prompts, sampling_params=_LOGPROB_SAMPLING_PARAMS)
output_origin = [([0], answer) for answer in golden_answers]
for prompt_idx, (base_out, comp_out) in enumerate(zip(baseline_outputs, compiled_outputs)):
base_seq = base_out.outputs[0]
comp_seq = comp_out.outputs[0]
check_outputs_equal(
outputs_0_lst=output_origin,
outputs_1_lst=outputs_gen,
name_0="output_origin",
name_1="outputs_gen",
)
assert base_seq.logprobs is not None and comp_seq.logprobs is not None, (
f"logprobs not returned for prompt {prompt_idx}"
)
assert len(base_seq.token_ids) == len(comp_seq.token_ids) == 3, (
f"Expected 3 tokens for prompt {prompt_idx}, "
f"got baseline={len(base_seq.token_ids)}, compiled={len(comp_seq.token_ids)}"
)
_check_prefill_token(base_seq, comp_seq, prompt_idx, atol)
for token_idx in range(1, 3):
_check_decode_token(base_seq, comp_seq, token_idx, prompt_idx, decode_atol)