[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager mode against graph mode; instead, each case checks the graph-mode output directly against a recorded golden result (the implicit purpose of these cases is to verify that new changes do not alter existing results). See the sketch below.
2. Next step: finer-grained supervision of logits/sampler results.
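
For illustration, the golden-answer flow reduces to one helper call per case. The sketch below is a minimal, hypothetical version of the `gen_and_valid` helper this PR adds in `tests/e2e/singlecard/utils.py`: the function name and the `runner_kwargs` keys follow the diff below, but the body is an assumption (the real helper wraps the test suite's `VllmRunner`; plain `vllm.LLM` is used here only to keep the sketch self-contained).

```python
# Hypothetical sketch of the golden-answer check; the actual
# gen_and_valid in tests/e2e/singlecard/utils.py may differ.
from vllm import LLM, SamplingParams


def gen_and_valid(runner_kwargs: dict, prompts: list[str],
                  sampling_params: SamplingParams,
                  golden_answers: list[str]) -> None:
    # Build the engine once, under the exact configuration of the test
    # case (e.g. graph mode enabled via additional_config).
    llm = LLM(model=runner_kwargs["model_name"],
              max_model_len=runner_kwargs["max_model_len"],
              block_size=runner_kwargs["block_size"],
              additional_config=runner_kwargs.get("additional_config"))
    outputs = llm.generate(prompts, sampling_params)
    for prompt, output, golden in zip(prompts, outputs, golden_answers):
        generated = output.outputs[0].text
        # With temperature=0.0 decoding is greedy and deterministic, so
        # the text is compared exactly against the recorded golden answer.
        assert generated == golden, (
            f"Mismatch for prompt {prompt!r}: {generated!r} != {golden!r}")
```

The key property is that each case needs only one engine run; the eager-mode baseline run is replaced by the stored golden text.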
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main: 254f6b9867

Signed-off-by: wangli <wangli858794774@gmail.com>
Author: Li Wang
Date: 2026-01-07 20:58:15 +08:00
Committed by: GitHub
Commit: 1165b2c863 (parent: b94fc13d3f)
4 changed files with 231 additions and 353 deletions


```diff
@@ -23,106 +23,85 @@ Run `pytest tests/e2e/singlecard/test_xlite.py`.
 import os
 import pytest
+from vllm import SamplingParams
-from tests.e2e.conftest import VllmRunner
-from tests.e2e.model_utils import check_outputs_equal
+from tests.e2e.singlecard.utils import (PROMPTS_SHORT, LLMTestCase,
+                                        gen_and_valid)
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 os.environ["VLLM_ASCEND_ENABLE_NZ"] = "2"
-MODELS = [
-    "Qwen/Qwen3-0.6B",
-]
-@pytest.mark.skip(
-    reason="TODO: Re-enable xlite_decode_only e2e test when stable.")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("max_tokens", [15])
-def test_models_with_xlite_decode_only(
-    model: str,
-    max_tokens: int,
-) -> None:
-    prompts = [
-        "Hello, my name is", "The president of the United States is",
-        "The capital of France is", "The future of AI is"
-    ]
-    with VllmRunner(
-            model,
-            block_size=128,
-            max_model_len=1024,
-            additional_config={"xlite_graph_config": {
-                "enabled": True
-            }},
-    ) as runner:
-        vllm_xlite_outputs_list = runner.generate_greedy(prompts,
-                                                         max_tokens=max_tokens)
-    for idx in range(len(vllm_xlite_outputs_list)):
-        vllm_xlite_outputs_list[idx] = ([0],
-                                        vllm_xlite_outputs_list[idx][1])
-    vllm_xlite_answers = [
+CASE_DECODE_ONLY = LLMTestCase(
+    model="Qwen/Qwen3-0.6B",
+    prompts=PROMPTS_SHORT,
+    golden_answers=[
         "Hello, my name is Lina. I'm a 22-year-old student from China.",
         'The president of the United States is the same as the president of the United Nations. This is because the president',
         'The capital of France is Paris. The capital of France is also the capital of the French Republic.',
         'The future of AI is not just a technological challenge but a profound transformation of how we live, work'
-    ]
+    ],
+    sampling_params=SamplingParams(
+        max_tokens=15,
+        temperature=0.0,
+        top_p=1.0,
+        top_k=0,
+        n=1,
+    ))
-    vllm_eager_outputs_list = []
-    vllm_eager_outputs_list = ([([0], answer)
-                                for answer in vllm_xlite_answers])
-    check_outputs_equal(
-        outputs_0_lst=vllm_eager_outputs_list,
-        outputs_1_lst=vllm_xlite_outputs_list,
-        name_0="vllm_eager_outputs",
-        name_1="vllm_xlite_outputs",
-    )
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("max_tokens", [32])
-def test_models_with_xlite_full_mode(
-    model: str,
-    max_tokens: int,
-) -> None:
-    prompts = [
+CASE_FULL_DECODE_ONLY = LLMTestCase(
+    model="Qwen/Qwen3-0.6B",
+    prompts=[
         "Hello, my name is", "The president of the United States is",
         "The capital of France is", "The future of AI is"
-    ]
+    ],
+    golden_answers=[
+        " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
+        ' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
+        ' Paris. The capital of Italy is Rome. The capital of Spain is Madrid. The capital of China is Beijing. The capital of Japan is Tokyo. The capital',
+        " not just about the technology itself, but about how we use it to solve real-world problems. As AI continues to evolve, it's important to consider the ethical"
+    ],
+    sampling_params=SamplingParams(
+        max_tokens=32,
+        temperature=0.0,
+        top_p=1.0,
+        top_k=0,
+        n=1,
+    ))
-    with VllmRunner(
-            model,
-            block_size=128,
-            max_model_len=1024,
-            additional_config={
-                "xlite_graph_config": {
-                    "enabled": True,
-                    "full_mode": True
-                }
-            },
-    ) as runner:
-        vllm_xlite_outputs_list = runner.generate_greedy(prompts,
-                                                         max_tokens=max_tokens)
-    for idx in range(len(vllm_xlite_outputs_list)):
-        vllm_xlite_outputs_list[idx] = ([0],
-                                        vllm_xlite_outputs_list[idx][1])
-    vllm_xlite_answers = [
-        "Hello, my name is Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
-        'The president of the United States is the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
-        'The capital of France is Paris. The capital of Italy is Rome. The capital of Spain is Madrid. The capital of China is Beijing. The capital of Japan is Tokyo. The capital',
-        "The future of AI is not just about the technology itself, but about how we use it to solve real-world problems. As AI continues to evolve, it's important to consider the ethical"
-    ]
+@pytest.mark.skip(
+    reason="TODO: Re-enable xlite_decode_only e2e test when stable.")
+@pytest.mark.parametrize("cur_case", [CASE_DECODE_ONLY])
+def test_models_with_xlite_decode_only(cur_case: LLMTestCase):
+    runner_kwargs = {
+        "model_name": cur_case.model,
+        "max_model_len": 1024,
+        "block_size": 128,
+        "additional_config": {
+            "xlite_graph_config": {
+                "enabled": True
+            }
+        },
+    }
+    gen_and_valid(runner_kwargs=runner_kwargs,
+                  prompts=cur_case.prompts,
+                  sampling_params=cur_case.sampling_params,
+                  golden_answers=cur_case.golden_answers)
-    vllm_eager_outputs_list = []
-    vllm_eager_outputs_list = ([([0], answer)
-                                for answer in vllm_xlite_answers])
-    check_outputs_equal(
-        outputs_0_lst=vllm_eager_outputs_list,
-        outputs_1_lst=vllm_xlite_outputs_list,
-        name_0="vllm_eager_outputs",
-        name_1="vllm_xlite_outputs",
-    )
+@pytest.mark.parametrize("cur_case", [CASE_FULL_DECODE_ONLY])
+def test_models_with_xlite_full_mode(cur_case: LLMTestCase):
+    runner_kwargs = {
+        "model_name": cur_case.model,
+        "max_model_len": 1024,
+        "block_size": 128,
+        "additional_config": {
+            "xlite_graph_config": {
+                "enabled": True,
+                "full_mode": True
+            }
+        },
+    }
+    gen_and_valid(runner_kwargs=runner_kwargs,
+                  prompts=cur_case.prompts,
+                  sampling_params=cur_case.sampling_params,
+                  golden_answers=cur_case.golden_answers)
```
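
For context, `PROMPTS_SHORT`, `LLMTestCase`, and `gen_and_valid` are imported from `tests/e2e/singlecard/utils.py`, which is outside this hunk. Below is a minimal sketch of what the case container could look like, assuming a plain dataclass and assuming `PROMPTS_SHORT` matches the four prompts the full-mode case declares inline; neither is shown in this diff and the real definitions may differ.

```python
# Hypothetical reconstruction of the helpers imported by the new test;
# the actual definitions in tests/e2e/singlecard/utils.py may differ.
from dataclasses import dataclass

from vllm import SamplingParams

# Assumed to be the same four prompts the full-mode case declares inline.
PROMPTS_SHORT = [
    "Hello, my name is", "The president of the United States is",
    "The capital of France is", "The future of AI is"
]


@dataclass
class LLMTestCase:
    """One accuracy case: a model, its prompts, and the expected outputs."""
    model: str
    prompts: list[str]
    golden_answers: list[str]
    sampling_params: SamplingParams
```

Declared this way, covering a new configuration only requires recording its golden output once and registering another `LLMTestCase`; no per-test eager-mode baseline run is needed.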