diff --git a/.github/workflows/scripts/config.yaml b/.github/workflows/scripts/config.yaml index 22e7113c..bb098aa5 100644 --- a/.github/workflows/scripts/config.yaml +++ b/.github/workflows/scripts/config.yaml @@ -5,6 +5,8 @@ e2e-singlecard: estimated_time: 69 - name: tests/e2e/singlecard/test_auto_fit_max_mode_len.py estimated_time: 70 +- name: tests/e2e/singlecard/test_eager_mode_acc.py + estimated_time: 255 - name: tests/e2e/singlecard/test_aclgraph_accuracy.py estimated_time: 839 - name: tests/e2e/singlecard/test_aclgraph_batch_invariant.py diff --git a/tests/e2e/singlecard/test_aclgraph_accuracy.py b/tests/e2e/singlecard/test_aclgraph_accuracy.py index 71711bf2..08365cb8 100644 --- a/tests/e2e/singlecard/test_aclgraph_accuracy.py +++ b/tests/e2e/singlecard/test_aclgraph_accuracy.py @@ -21,97 +21,61 @@ import os import pytest -from tests.e2e.singlecard.utils import PROMPTS_LONG, PROMPTS_SHORT, LLMTestCase, gen_and_valid +from tests.e2e.conftest import wait_until_npu_memory_free +from tests.e2e.singlecard.utils import PROMPTS_LONG, PROMPTS_SHORT, LLMTestCase, compare_logprobs + +# --------------------------------------------------------------------------- +# Test cases – no golden_answers needed; accuracy is verified via logprob +# comparison against an eager-mode baseline. Token 0 covers the prefill +# forward pass; tokens 1-2 cover decode forward passes. +# --------------------------------------------------------------------------- CASE_QWEN_ACLGRAPH = LLMTestCase( model="Qwen/Qwen3-0.6B", prompts=PROMPTS_SHORT, - golden_answers=[ - " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the", - " the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president", - " Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of", - " not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and", - ], ) CASE_DS_ACLGRAPH = LLMTestCase( model="vllm-ascend/DeepSeek-V2-Lite-W8A8", quantization="ascend", prompts=PROMPTS_SHORT, - golden_answers=[ - "\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2", - " a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the", - " Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art", - " here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of", - ], ) CASE_QWEN_FULL = LLMTestCase( model="Qwen/Qwen3-0.6B", prompts=PROMPTS_SHORT, - golden_answers=[ - " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the", - " the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president", - " Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. 
The capital of", - " not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and", - ], ) CASE_DS_FULL = LLMTestCase( model="vllm-ascend/DeepSeek-V2-Lite-W8A8", quantization="ascend", prompts=PROMPTS_SHORT, - golden_answers=[ - "\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2", - " a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the", - " Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art", - " here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of", - ], ) CASE_QWEN_FULL_DECODE_ONLY = LLMTestCase( model="Qwen/Qwen3-0.6B", prompts=PROMPTS_LONG, - golden_answers=[ - " \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the", - " \n\nTo solve this problem, we can use the following approach: Let $P$ be the perimeter of the square. Then, the expected value of the area", - " \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can", - ], ) CASE_DS_FULL_DECODE_ONLY = LLMTestCase( model="vllm-ascend/DeepSeek-V2-Lite-W8A8", quantization="ascend", prompts=PROMPTS_LONG, - golden_answers=[ - "\n\nSelect an assignment template", - "\n\nI'm not sure how to approach this problem. I'm thinking that the area of the triangle is $1/2$ times the area", - "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x = \\alpha$ be the common root", - ], ) CASE_QWEN_EX = LLMTestCase( model="Qwen/Qwen3-0.6B", prompts=PROMPTS_LONG, - golden_answers=[ - " \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the", - " \n\nTo solve this problem, we can use the following approach: Let $P$ be the perimeter of the square. Then, the expected value of the area", - " \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can", - ], ) CASE_DS_EX = LLMTestCase( model="vllm-ascend/DeepSeek-V2-Lite-W8A8", quantization="ascend", prompts=PROMPTS_LONG, - golden_answers=[ - "\n\nSelect an assignment template", - "\n\nI'm not sure how to approach this problem. 
I'm thinking that the area of the triangle is $1/2$ times the area", - "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x = \\alpha$ be the common root", - ], ) +@wait_until_npu_memory_free(0.7) @pytest.mark.parametrize("cur_case", [CASE_QWEN_ACLGRAPH, CASE_DS_ACLGRAPH]) def test_piecewise_res_consistency(cur_case: LLMTestCase): runner_kwargs = { @@ -120,14 +84,10 @@ def test_piecewise_res_consistency(cur_case: LLMTestCase): "cudagraph_capture_sizes": [1, 2, 4, 8], "quantization": cur_case.quantization, } - gen_and_valid( - runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers, - ) + compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts) +@wait_until_npu_memory_free(0.7) @pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL, CASE_DS_FULL]) def test_full_res_consistency(cur_case: LLMTestCase, monkeypatch): monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False) @@ -137,14 +97,10 @@ def test_full_res_consistency(cur_case: LLMTestCase, monkeypatch): "compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"}, "quantization": cur_case.quantization, } - gen_and_valid( - runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers, - ) + compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts) +@wait_until_npu_memory_free(0.7) @pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY]) def test_full_decode_only_res_consistency(cur_case: LLMTestCase, monkeypatch): monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False) @@ -155,14 +111,10 @@ def test_full_decode_only_res_consistency(cur_case: LLMTestCase, monkeypatch): "quantization": cur_case.quantization, "additional_config": {"ascend_compilation_config": {"enable_npugraph_ex": False}}, } - gen_and_valid( - runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers, - ) + compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts) +@wait_until_npu_memory_free(0.7) @pytest.mark.parametrize("cur_case", [CASE_QWEN_EX, CASE_DS_EX]) def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch): monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False) @@ -173,17 +125,13 @@ def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch): "compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"}, "additional_config": {"ascend_compilation_config": {"enable_npugraph_ex": True}}, } - gen_and_valid( - runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers, - ) + compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts) # The accuracy has already been verified in the previous test case. # This test case is used to check whether the functionality works properly # after enabling the static kernel and whether it is uninstalled as expected. 
+@wait_until_npu_memory_free(0.7) @pytest.mark.parametrize("cur_case", [CASE_QWEN_EX]) def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch): monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False) @@ -199,14 +147,9 @@ def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch): } }, } - gen_and_valid( - runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers, - ) + compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts) - # Check whether the static kernel is properly uninstall + # Check whether the static kernel is properly uninstalled ascend_home_path = os.environ["ASCEND_HOME_PATH"] static_kernel_install_path = os.path.join(ascend_home_path, "opp/static_kernel/ai_core") assert not os.path.exists(static_kernel_install_path) diff --git a/tests/e2e/singlecard/test_eager_mode_acc.py b/tests/e2e/singlecard/test_eager_mode_acc.py new file mode 100644 index 00000000..18b6e73d --- /dev/null +++ b/tests/e2e/singlecard/test_eager_mode_acc.py @@ -0,0 +1,68 @@ +# +# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This file tests accuracy via lm_eval (the LM Evaluation Harness). +It runs the GSM8K task against lm_eval's offline vLLM backend with +enforce_eager=True, so the models are exercised in pure eager mode +without any graph capture. Each model's measured exact-match score +is compared against a per-model expected value within a small +absolute tolerance. +""" + +import lm_eval +import pytest + +MODEL_NAMES = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"] +NUM_CONCURRENT = 500 +TASK = "gsm8k" +FILTER = "exact_match,strict-match" +RTOL = 0.03 +EXPECTED_VALUES = {"Qwen/Qwen3-0.6B": 0.414, "vllm-ascend/DeepSeek-V2-Lite-W8A8": 0.34} + + +def run_test(model_name, more_args=None): + """Run the end-to-end accuracy test.""" + + # NOTE: Do not add any spaces to the string below, as this will cause parameter parsing errors.
+ model_args = f"pretrained={model_name},max_model_len=4096,enforce_eager=True" + + if more_args is not None: + model_args = "{},{}".format(model_args, more_args) + + results = lm_eval.simple_evaluate( + model="vllm", + model_args=model_args, + tasks="gsm8k", + batch_size="auto", + ) + + measured_value = results["results"][TASK][FILTER] + assert model_name in EXPECTED_VALUES, f"Cannot find the expected value for the model {model_name=}" + expected_value = EXPECTED_VALUES[model_name] + assert measured_value - RTOL < expected_value and measured_value + RTOL > expected_value, ( + f"Expected: {expected_value} | Measured: {measured_value}" + ) + + +@pytest.mark.parametrize("model", MODEL_NAMES) +def test_lm_eval_accuracy(model): + """Run with the V1 Engine.""" + more_args = None + run_test(model, more_args) diff --git a/tests/e2e/singlecard/test_xlite.py b/tests/e2e/singlecard/test_xlite.py index 231cb408..8a2cc8cd 100644 --- a/tests/e2e/singlecard/test_xlite.py +++ b/tests/e2e/singlecard/test_xlite.py @@ -15,7 +15,8 @@ # limitations under the License. # """ -Compare the outputs of vLLM with and without xlite. +Compare the outputs of vLLM with and without xlite via logprob-based accuracy +check (3 tokens: 1 prefill + 2 decode). Run `pytest tests/e2e/singlecard/test_xlite.py`. """ @@ -25,51 +26,19 @@ Run `pytest tests/e2e/singlecard/test_xlite.py`. import os import pytest -from vllm import SamplingParams -from tests.e2e.singlecard.utils import PROMPTS_SHORT, LLMTestCase, gen_and_valid +from tests.e2e.singlecard.utils import PROMPTS_SHORT, LLMTestCase, compare_logprobs os.environ["VLLM_ASCEND_ENABLE_NZ"] = "2" CASE_DECODE_ONLY = LLMTestCase( model="Qwen/Qwen3-0.6B", prompts=PROMPTS_SHORT, - golden_answers=[ - "Hello, my name is Lina. I'm a 22-year-old student from China.", - "The president of the United States is the same as the president of the United Nations. This is because the president", - "The capital of France is Paris. The capital of France is also the capital of the French Republic.", - "The future of AI is not just a technological challenge but a profound transformation of how we live, work", - ], - sampling_params=SamplingParams( - max_tokens=15, - temperature=0.0, - top_p=1.0, - top_k=0, - n=1, - ), ) CASE_FULL = LLMTestCase( model="Qwen/Qwen3-0.6B", - prompts=[ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ], - golden_answers=[ - " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the", - " the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president", - " Paris. The capital of Italy is Rome. The capital of Spain is Madrid. The capital of China is Beijing. The capital of Japan is Tokyo. The capital", - " not just a technological challenge but a profound transformation of how we live, work, and interact with the world. 
As we stand at the intersection of artificial intelligence and", - ], - sampling_params=SamplingParams( - max_tokens=32, - temperature=0.0, - top_p=1.0, - top_k=0, - n=1, - ), + prompts=PROMPTS_SHORT, ) @@ -82,12 +51,7 @@ def test_models_with_xlite_decode_only(cur_case: LLMTestCase): "block_size": 128, "additional_config": {"xlite_graph_config": {"enabled": True}}, } - gen_and_valid( - runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers, - ) + compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts) @pytest.mark.parametrize("cur_case", [CASE_FULL]) @@ -98,9 +62,4 @@ def test_models_with_xlite_full_mode(cur_case: LLMTestCase): "block_size": 128, "additional_config": {"xlite_graph_config": {"enabled": True, "full_mode": True}}, } - gen_and_valid( - runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers, - ) + compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts) diff --git a/tests/e2e/singlecard/utils.py b/tests/e2e/singlecard/utils.py index 1ac30acb..649415f5 100644 --- a/tests/e2e/singlecard/utils.py +++ b/tests/e2e/singlecard/utils.py @@ -1,9 +1,8 @@ -from dataclasses import dataclass, field +from dataclasses import dataclass from vllm import SamplingParams from tests.e2e.conftest import VllmRunner -from tests.e2e.model_utils import check_outputs_equal PROMPTS_SHORT = [ "Hello, my name is", @@ -51,31 +50,143 @@ PROMPTS_LONG = [ class LLMTestCase: model: str prompts: list[str] - golden_answers: list[str] + golden_answers: list[str] | None = None quantization: str | None = None - sampling_params: SamplingParams = field( - default_factory=lambda: SamplingParams( - max_tokens=32, - temperature=0.0, - top_p=1.0, - top_k=0, - n=1, + + +# Keys that are specific to compilation/graph capture and should not be passed +# to the eager baseline runner. +_COMPILATION_KEYS = {"compilation_config", "additional_config", "cudagraph_capture_sizes"} + +# Top-K logprobs to fetch per token; used for decode-phase cross-lookup. +_DECODE_TOPK = 20 + +_LOGPROB_SAMPLING_PARAMS = SamplingParams( + max_tokens=3, + temperature=0.0, + top_p=1.0, + top_k=0, + logprobs=_DECODE_TOPK, +) + + +def _check_prefill_token( + base_seq, + comp_seq, + prompt_idx: int, + atol: float, +) -> None: + """Token 0 is produced by the prefill pass; both models see identical input, + so the chosen token *must* be the same and its logprob must match within atol.""" + base_token_id = base_seq.token_ids[0] + comp_token_id = comp_seq.token_ids[0] + assert base_token_id == comp_token_id, ( + f"Prefill token mismatch at prompt {prompt_idx}: baseline={base_token_id}, compiled={comp_token_id}" + ) + base_logprob = base_seq.logprobs[0][base_token_id].logprob + comp_logprob = comp_seq.logprobs[0][comp_token_id].logprob + assert abs(base_logprob - comp_logprob) <= atol, ( + f"Prefill logprob mismatch at prompt {prompt_idx}: " + f"baseline={base_logprob:.4f}, compiled={comp_logprob:.4f}, " + f"diff={abs(base_logprob - comp_logprob):.4f} > atol={atol}" + ) + + +def _check_decode_token( + base_seq, + comp_seq, + token_idx: int, + prompt_idx: int, + decode_atol: float, +) -> None: + """Tokens 1-2 come from decode passes. When the two models pick different + tokens the context has already diverged, so we cannot compare logprobs of + the chosen tokens directly. 
Instead we do a cross-lookup: find the + baseline's chosen token inside the compiled model's top-K distribution + and assert that the logprob it assigns to that token is close. This + confirms that the compiled model's distribution is numerically consistent + with the baseline's even when the argmax differs by a tiny margin. + """ + base_token_id = base_seq.token_ids[token_idx] + comp_token_id = comp_seq.token_ids[token_idx] + base_topk = base_seq.logprobs[token_idx] # dict[token_id, Logprob] + comp_topk = comp_seq.logprobs[token_idx] + + if base_token_id == comp_token_id: + # Happy path: same token, direct logprob comparison. + diff = abs(base_topk[base_token_id].logprob - comp_topk[comp_token_id].logprob) + assert diff <= decode_atol, ( + f"Decode logprob mismatch at prompt {prompt_idx}, token {token_idx}: " + f"baseline={base_topk[base_token_id].logprob:.4f}, " + f"compiled={comp_topk[comp_token_id].logprob:.4f}, " + f"diff={diff:.4f} > decode_atol={decode_atol}" ) + return + + # Tokens differ – look up the baseline's token in the compiled model's top-K distribution. + base_logprob = base_topk[base_token_id].logprob + comp_logprob = comp_topk[comp_token_id].logprob + + # Check: what log-probability did compiled assign to baseline's token? + assert base_token_id in comp_topk, ( + f"Decode token mismatch at prompt {prompt_idx}, token {token_idx}: " + f"baseline chose token {base_token_id} (logprob={base_logprob:.4f}) but " + f"compiled chose token {comp_token_id} (logprob={comp_logprob:.4f}) and " + f"baseline's token does not appear in compiled's top-{_DECODE_TOPK} distribution" + ) + comp_logprob_of_base_token = comp_topk[base_token_id].logprob + diff = abs(base_logprob - comp_logprob_of_base_token) + assert diff <= decode_atol, ( + f"Decode distribution mismatch at prompt {prompt_idx}, token {token_idx}: " + f"baseline chose token {base_token_id} with logprob={base_logprob:.4f}; " + f"compiled assigned logprob={comp_logprob_of_base_token:.4f} to that token, " + f"diff={diff:.4f} > decode_atol={decode_atol} " + f"(compiled chose token {comp_token_id} with logprob={comp_logprob:.4f})" ) -def gen_and_valid(runner_kwargs: dict, prompts: list[str], sampling_params: SamplingParams, golden_answers: list[str]): +def compare_logprobs( + runner_kwargs: dict, + prompts: list[str], + atol: float = 0.0689, + decode_atol: float | None = None, +) -> None: + """Run the model in eager baseline mode and in the configured compilation + mode, generate 3 tokens per prompt, then verify numerical accuracy: + + * Token 0 (prefill pass): chosen token must be identical; logprob must + match within *atol*. + * Tokens 1-2 (decode passes): if chosen tokens match, logprob must be + within *decode_atol*; if they differ, the baseline token must appear in + the compiled model's top-K distribution with a logprob within + *decode_atol* of the baseline value. + + *decode_atol* defaults to ``2 * atol`` when not supplied.
+ """ + if decode_atol is None: + decode_atol = 2 * atol + + baseline_kwargs = {k: v for k, v in runner_kwargs.items() if k not in _COMPILATION_KEYS} + baseline_kwargs["enforce_eager"] = True + + with VllmRunner(**baseline_kwargs) as runner: + baseline_outputs = runner.model.generate(prompts=prompts, sampling_params=_LOGPROB_SAMPLING_PARAMS) + with VllmRunner(**runner_kwargs) as runner: - vllm_aclgraph_outputs = runner.model.generate(prompts=prompts, sampling_params=sampling_params) - outputs_gen = [] - for output in vllm_aclgraph_outputs: - outputs_gen.append(([output.outputs[0].index], output.outputs[0].text)) + compiled_outputs = runner.model.generate(prompts=prompts, sampling_params=_LOGPROB_SAMPLING_PARAMS) - output_origin = [([0], answer) for answer in golden_answers] + for prompt_idx, (base_out, comp_out) in enumerate(zip(baseline_outputs, compiled_outputs)): + base_seq = base_out.outputs[0] + comp_seq = comp_out.outputs[0] - check_outputs_equal( - outputs_0_lst=output_origin, - outputs_1_lst=outputs_gen, - name_0="output_origin", - name_1="outputs_gen", - ) + assert base_seq.logprobs is not None and comp_seq.logprobs is not None, ( + f"logprobs not returned for prompt {prompt_idx}" + ) + assert len(base_seq.token_ids) == len(comp_seq.token_ids) == 3, ( + f"Expected 3 tokens for prompt {prompt_idx}, " + f"got baseline={len(base_seq.token_ids)}, compiled={len(comp_seq.token_ids)}" + ) + + _check_prefill_token(base_seq, comp_seq, prompt_idx, atol) + for token_idx in range(1, 3): + _check_decode_token(base_seq, comp_seq, token_idx, prompt_idx, decode_atol)
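A minimal usage sketch of the new helper (illustrative only, not part of the patch above): it shows how a future single-card graph-mode test could adopt compare_logprobs. The test name, case name, capture sizes, and the explicit decode_atol are assumptions for illustration, and the model-related runner kwargs that the hunks above elide are left as a placeholder comment.

import pytest

from tests.e2e.conftest import wait_until_npu_memory_free
from tests.e2e.singlecard.utils import PROMPTS_SHORT, LLMTestCase, compare_logprobs

# Illustrative case: no golden_answers needed, since accuracy is checked
# against the eager-mode baseline via logprobs.
CASE_QWEN_EXAMPLE = LLMTestCase(
    model="Qwen/Qwen3-0.6B",
    prompts=PROMPTS_SHORT,
)


@wait_until_npu_memory_free(0.7)
@pytest.mark.parametrize("cur_case", [CASE_QWEN_EXAMPLE])
def test_new_graph_mode_res_consistency(cur_case: LLMTestCase):
    runner_kwargs = {
        # model/memory kwargs as in the existing tests (elided in the hunks above)
        "cudagraph_capture_sizes": [1, 2, 4, 8],
        "quantization": cur_case.quantization,
    }
    # Token 0 (prefill) must match exactly within atol; tokens 1-2 (decode)
    # use decode_atol (2 * atol by default) with the top-K cross-lookup fallback.
    compare_logprobs(
        runner_kwargs=runner_kwargs,
        prompts=cur_case.prompts,
        atol=0.0689,
        decode_atol=0.15,
    )

Because compare_logprobs strips the compilation-specific keys and forces enforce_eager=True for the baseline run, the same runner_kwargs dict drives both the eager and the graph-mode run.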