From 70bef33f1351d416c5aba6af7193867863a752ac Mon Sep 17 00:00:00 2001 From: lilinsiman Date: Mon, 20 Oct 2025 20:04:04 +0800 Subject: [PATCH] add new accuracy test case for aclgraph (#3390) ### What this PR does / why we need it? Add a new accuracy test case for aclgraph using the DeepSeek-V2-Lite-W8A8 model ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? ut - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: lilinsiman --- .github/workflows/_e2e_test.yaml | 1 + tests/e2e/singlecard/test_aclgraph.py | 101 +++++++++++++++++++------- vllm_ascend/attention/mla_v1.py | 9 ++- 3 files changed, 83 insertions(+), 28 deletions(-) diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index 2761447..a330b62 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -177,6 +177,7 @@ jobs: pytest -sv tests/e2e/multicard/test_data_parallel.py pytest -sv tests/e2e/multicard/test_expert_parallel.py pytest -sv tests/e2e/multicard/test_external_launcher.py + pytest -sv tests/e2e/multicard/test_single_request_aclgraph.py pytest -sv tests/e2e/multicard/test_fused_moe_allgather_ep.py pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py diff --git a/tests/e2e/singlecard/test_aclgraph.py b/tests/e2e/singlecard/test_aclgraph.py index e1ccdc9..570dbbc 100644 --- a/tests/e2e/singlecard/test_aclgraph.py +++ b/tests/e2e/singlecard/test_aclgraph.py @@ -21,6 +21,8 @@ Run `pytest tests/compile/test_aclgraph.py`. 
""" import os +import random +import string import pytest from vllm import SamplingParams @@ -30,6 +32,7 @@ from tests.e2e.model_utils import check_outputs_equal MODELS = [ "Qwen/Qwen3-0.6B", + "vllm-ascend/DeepSeek-V2-Lite-W8A8", ] @@ -45,20 +48,40 @@ def test_models_with_aclgraph( ] sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0) - with VllmRunner( - model, - max_model_len=1024, - enforce_eager=False, - ) as runner: - vllm_aclgraph_outputs = runner.model.generate(prompts, sampling_params) + if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8": + with VllmRunner( + model, + max_model_len=1024, + enforce_eager=False, + quantization="ascend", + ) as runner: + vllm_aclgraph_outputs = runner.model.generate( + prompts, sampling_params) - with VllmRunner( - model, - max_model_len=1024, - enforce_eager=True, - ) as runner: - vllm_eager_outputs = runner.model.generate(prompts, sampling_params) + with VllmRunner( + model, + max_model_len=1024, + enforce_eager=True, + quantization="ascend", + ) as runner: + vllm_eager_outputs = runner.model.generate(prompts, + sampling_params) + else: + with VllmRunner( + model, + max_model_len=1024, + enforce_eager=False, + ) as runner: + vllm_aclgraph_outputs = runner.model.generate( + prompts, sampling_params) + with VllmRunner( + model, + max_model_len=1024, + enforce_eager=True, + ) as runner: + vllm_eager_outputs = runner.model.generate(prompts, + sampling_params) vllm_aclgraph_outputs_list = [] for output in vllm_aclgraph_outputs: vllm_aclgraph_outputs_list.append( @@ -85,6 +108,9 @@ def test_models_with_aclgraph_full_decode_only( ) -> None: if 'HCCL_OP_EXPANSION_MODE' in os.environ: del os.environ['HCCL_OP_EXPANSION_MODE'] + # NOTE: Randomly fill the prompt with the requested amount for + # the specified capture shape to prevent accuracy issues caused by padding + random_number = random.choice(list(range(6, 47, 8))) prompts = [ ('Solve the following math problem step by step.' 
'The last line of your response should be of the form Answer: ' @@ -110,6 +136,9 @@ def test_models_with_aclgraph_full_decode_only( 'and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$' 'and $x^2 + cx + b = 0$ also have a common real root.' 'Compute the sum $a + b + c$.') + ] + [ + ''.join(random.choices(string.ascii_lowercase, k=random.randint( + 1, 25))) for _ in range(random_number) ] sampling_params = SamplingParams(max_tokens=5, @@ -117,20 +146,42 @@ def test_models_with_aclgraph_full_decode_only( temperature=0.0, top_p=1.0, top_k=1) - with VllmRunner( - model, - max_model_len=1024, - enforce_eager=False, - compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"}, - ) as runner: - vllm_aclgraph_outputs = runner.model.generate(prompts, sampling_params) + if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8": + with VllmRunner( + model, + max_model_len=1024, + enforce_eager=False, + compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"}, + quantization="ascend", + ) as runner: + vllm_aclgraph_outputs = runner.model.generate( + prompts, sampling_params) - with VllmRunner( - model, - max_model_len=1024, - enforce_eager=True, - ) as runner: - vllm_eager_outputs = runner.model.generate(prompts, sampling_params) + with VllmRunner( + model, + max_model_len=1024, + enforce_eager=True, + quantization="ascend", + ) as runner: + vllm_eager_outputs = runner.model.generate(prompts, + sampling_params) + else: + with VllmRunner( + model, + max_model_len=1024, + enforce_eager=False, + compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"}, + ) as runner: + vllm_aclgraph_outputs = runner.model.generate( + prompts, sampling_params) + + with VllmRunner( + model, + max_model_len=1024, + enforce_eager=True, + ) as runner: + vllm_eager_outputs = runner.model.generate(prompts, + sampling_params) vllm_aclgraph_outputs_list = [] for output in vllm_aclgraph_outputs: diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 
f88e01a..7f6e7f8 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -976,17 +976,20 @@ class AscendMLAImpl(MLAAttentionImpl): # Use TND layout for pure SpecDecoding and SpecDecoding in ChunkedPrefill input_layout = "TND" # [bs * q_seq_len, num_heads_per_rank, dim] - q_nope = q_nope.view(num_tokens, self.num_heads, -1) + # TODO: If the driver is upgraded later, the contiguous function can be deleted. + q_nope = q_nope.view(num_tokens, self.num_heads, -1).contiguous() q_pe = q_pe.view(num_tokens, self.num_heads, -1) sparse_mode = 3 spec_attn_mask = attn_metadata.decode.attn_mask # type:ignore actual_seq_lengths = decode_meta.actual_seq_lengths_q else: if self.enable_kv_nz: - q_nope = q_nope.view(num_tokens, 1, self.num_heads, -1) + q_nope = q_nope.view(num_tokens, 1, self.num_heads, + -1).contiguous() q_pe = q_pe.view(num_tokens, 1, self.num_heads, -1) else: - q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1) + q_nope = q_nope.view(num_tokens, self.num_heads, 1, + -1).contiguous() q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1) sparse_mode = 0 spec_attn_mask = None