[Misc] Refactor aclgraph accuracy test to use logprob-based comparison (#7455)

### What this PR does / why we need it?

Replace text-match assertions with a two-tier logprob accuracy check:

- Prefill (token 0): assert token ID is identical between eager baseline and compiled mode, then verify logprob matches within `atol`.
- Decode (tokens 1-2): if chosen tokens match, compare logprobs directly; if they differ, cross-lookup the baseline token in the compiled model's top-20 distribution and assert the assigned logprob is within `decode_atol` (defaults to 2x `atol`).

This tolerates minor argmax drift caused by floating-point differences while still catching distribution divergence.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.17.0
- vLLM main: 8a680463fa

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2026-03-23 09:08:21 +08:00
parent 9bf9b4b267
commit 75fae619d5
5 changed files with 228 additions and 145 deletions
--- a/tests/e2e/singlecard/test_xlite.py
+++ b/tests/e2e/singlecard/test_xlite.py
@@ -15,7 +15,8 @@
 # limitations under the License.
 #
 """
-Compare the outputs of vLLM with and without xlite.
+Compare the outputs of vLLM with and without xlite via logprob-based accuracy
+check (3 tokens: 1 prefill + 2 decode).

 Run `pytest tests/e2e/singlecard/test_xlite.py`.
 """
@@ -25,51 +26,19 @@ Run `pytest tests/e2e/singlecard/test_xlite.py`.
 import os

 import pytest
-from vllm import SamplingParams

-from tests.e2e.singlecard.utils import PROMPTS_SHORT, LLMTestCase, gen_and_valid
+from tests.e2e.singlecard.utils import PROMPTS_SHORT, LLMTestCase, compare_logprobs

 os.environ["VLLM_ASCEND_ENABLE_NZ"] = "2"

 CASE_DECODE_ONLY = LLMTestCase(
    model="Qwen/Qwen3-0.6B",
    prompts=PROMPTS_SHORT,
-    golden_answers=[
-        "Hello, my name is Lina. I'm a 22-year-old student from China.",
-        "The president of the United States is the same as the president of the United Nations. This is because the president",
-        "The capital of France is Paris. The capital of France is also the capital of the French Republic.",
-        "The future of AI is not just a technological challenge but a profound transformation of how we live, work",
-    ],
-    sampling_params=SamplingParams(
-        max_tokens=15,
-        temperature=0.0,
-        top_p=1.0,
-        top_k=0,
-        n=1,
-    ),
 )

 CASE_FULL = LLMTestCase(
    model="Qwen/Qwen3-0.6B",
-    prompts=[
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ],
-    golden_answers=[
-        " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
-        " the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president",
-        " Paris. The capital of Italy is Rome. The capital of Spain is Madrid. The capital of China is Beijing. The capital of Japan is Tokyo. The capital",
-        " not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and",
-    ],
-    sampling_params=SamplingParams(
-        max_tokens=32,
-        temperature=0.0,
-        top_p=1.0,
-        top_k=0,
-        n=1,
-    ),
+    prompts=PROMPTS_SHORT,
 )


@@ -82,12 +51,7 @@ def test_models_with_xlite_decode_only(cur_case: LLMTestCase):
        "block_size": 128,
        "additional_config": {"xlite_graph_config": {"enabled": True}},
    }
-    gen_and_valid(
-        runner_kwargs=runner_kwargs,
-        prompts=cur_case.prompts,
-        sampling_params=cur_case.sampling_params,
-        golden_answers=cur_case.golden_answers,
-    )
+    compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts)


@pytest.mark.parametrize("cur_case", [CASE_FULL])
@@ -98,9 +62,4 @@ def test_models_with_xlite_full_mode(cur_case: LLMTestCase):
        "block_size": 128,
        "additional_config": {"xlite_graph_config": {"enabled": True, "full_mode": True}},
    }
-    gen_and_valid(
-        runner_kwargs=runner_kwargs,
-        prompts=cur_case.prompts,
-        sampling_params=cur_case.sampling_params,
-        golden_answers=cur_case.golden_answers,
-    )
+    compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts)