From 70bef33f1351d416c5aba6af7193867863a752ac Mon Sep 17 00:00:00 2001
From: lilinsiman <lilinsiman@gmail.com>
Date: Mon, 20 Oct 2025 20:04:04 +0800
Subject: [PATCH] add new accuracy test case for aclgraph (#3390)

### What this PR does / why we need it?
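Add a new aclgraph accuracy test case for the DeepSeek-V2-Lite-W8A8 model
(`vllm-ascend/DeepSeek-V2-Lite-W8A8`). Also make `q_nope` contiguous after
it is reshaped in `AscendMLAImpl` (`vllm_ascend/attention/mla_v1.py`).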

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
ut

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: lilinsiman <lilinsiman@gmail.com>
---
 .github/workflows/_e2e_test.yaml      |   1 +
 tests/e2e/singlecard/test_aclgraph.py | 101 +++++++++++++++++++-------
 vllm_ascend/attention/mla_v1.py       |   9 ++-
 3 files changed, 83 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 2761447..a330b62 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -177,6 +177,7 @@ jobs:
           pytest -sv tests/e2e/multicard/test_data_parallel.py
           pytest -sv tests/e2e/multicard/test_expert_parallel.py
           pytest -sv tests/e2e/multicard/test_external_launcher.py
+          pytest -sv tests/e2e/multicard/test_single_request_aclgraph.py
           pytest -sv tests/e2e/multicard/test_fused_moe_allgather_ep.py
           pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
 
diff --git a/tests/e2e/singlecard/test_aclgraph.py b/tests/e2e/singlecard/test_aclgraph.py
index e1ccdc9..570dbbc 100644
--- a/tests/e2e/singlecard/test_aclgraph.py
+++ b/tests/e2e/singlecard/test_aclgraph.py
@@ -21,6 +21,8 @@ Run `pytest tests/compile/test_aclgraph.py`.
 """
 
 import os
+import random
+import string
 
 import pytest
 from vllm import SamplingParams
@@ -30,6 +32,7 @@ from tests.e2e.model_utils import check_outputs_equal
 
 MODELS = [
     "Qwen/Qwen3-0.6B",
+    "vllm-ascend/DeepSeek-V2-Lite-W8A8",
 ]
 
 
@@ -45,20 +48,40 @@ def test_models_with_aclgraph(
     ]
 
     sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
-    with VllmRunner(
-            model,
-            max_model_len=1024,
-            enforce_eager=False,
-    ) as runner:
-        vllm_aclgraph_outputs = runner.model.generate(prompts, sampling_params)
+    if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
+        with VllmRunner(
+                model,
+                max_model_len=1024,
+                enforce_eager=False,
+                quantization="ascend",
+        ) as runner:
+            vllm_aclgraph_outputs = runner.model.generate(
+                prompts, sampling_params)
 
-    with VllmRunner(
-            model,
-            max_model_len=1024,
-            enforce_eager=True,
-    ) as runner:
-        vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
+        with VllmRunner(
+                model,
+                max_model_len=1024,
+                enforce_eager=True,
+                quantization="ascend",
+        ) as runner:
+            vllm_eager_outputs = runner.model.generate(prompts,
+                                                       sampling_params)
+    else:
+        with VllmRunner(
+                model,
+                max_model_len=1024,
+                enforce_eager=False,
+        ) as runner:
+            vllm_aclgraph_outputs = runner.model.generate(
+                prompts, sampling_params)
 
+        with VllmRunner(
+                model,
+                max_model_len=1024,
+                enforce_eager=True,
+        ) as runner:
+            vllm_eager_outputs = runner.model.generate(prompts,
+                                                       sampling_params)
     vllm_aclgraph_outputs_list = []
     for output in vllm_aclgraph_outputs:
         vllm_aclgraph_outputs_list.append(
@@ -85,6 +108,9 @@ def test_models_with_aclgraph_full_decode_only(
 ) -> None:
     if 'HCCL_OP_EXPANSION_MODE' in os.environ:
         del os.environ['HCCL_OP_EXPANSION_MODE']
+    # NOTE: Append a random number of filler prompts so the request count
+    # matches a capture shape, preventing accuracy issues caused by padding.
+    random_number = random.choice(list(range(6, 47, 8)))
     prompts = [
         ('Solve the following math problem step by step.'
          'The last line of your response should be of the form Answer: '
@@ -110,6 +136,9 @@ def test_models_with_aclgraph_full_decode_only(
          'and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$'
          'and $x^2 + cx + b = 0$ also have a common real root.'
          'Compute the sum $a + b + c$.')
+    ] + [
+        ''.join(random.choices(string.ascii_lowercase, k=random.randint(
+            1, 25))) for _ in range(random_number)
     ]
 
     sampling_params = SamplingParams(max_tokens=5,
@@ -117,20 +146,42 @@ def test_models_with_aclgraph_full_decode_only(
                                      temperature=0.0,
                                      top_p=1.0,
                                      top_k=1)
-    with VllmRunner(
-            model,
-            max_model_len=1024,
-            enforce_eager=False,
-            compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
-    ) as runner:
-        vllm_aclgraph_outputs = runner.model.generate(prompts, sampling_params)
+    if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
+        with VllmRunner(
+                model,
+                max_model_len=1024,
+                enforce_eager=False,
+                compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
+                quantization="ascend",
+        ) as runner:
+            vllm_aclgraph_outputs = runner.model.generate(
+                prompts, sampling_params)
 
-    with VllmRunner(
-            model,
-            max_model_len=1024,
-            enforce_eager=True,
-    ) as runner:
-        vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
+        with VllmRunner(
+                model,
+                max_model_len=1024,
+                enforce_eager=True,
+                quantization="ascend",
+        ) as runner:
+            vllm_eager_outputs = runner.model.generate(prompts,
+                                                       sampling_params)
+    else:
+        with VllmRunner(
+                model,
+                max_model_len=1024,
+                enforce_eager=False,
+                compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
+        ) as runner:
+            vllm_aclgraph_outputs = runner.model.generate(
+                prompts, sampling_params)
+
+        with VllmRunner(
+                model,
+                max_model_len=1024,
+                enforce_eager=True,
+        ) as runner:
+            vllm_eager_outputs = runner.model.generate(prompts,
+                                                       sampling_params)
 
     vllm_aclgraph_outputs_list = []
     for output in vllm_aclgraph_outputs:
diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index f88e01a..7f6e7f8 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -976,17 +976,20 @@ class AscendMLAImpl(MLAAttentionImpl):
             # Use TND layout for pure SpecDecoding and SpecDecoding in ChunkedPrefill
             input_layout = "TND"
             # [bs * q_seq_len, num_heads_per_rank, dim]
-            q_nope = q_nope.view(num_tokens, self.num_heads, -1)
+            # TODO: Drop the .contiguous() call once the driver has been upgraded.
+            q_nope = q_nope.view(num_tokens, self.num_heads, -1).contiguous()
             q_pe = q_pe.view(num_tokens, self.num_heads, -1)
             sparse_mode = 3
             spec_attn_mask = attn_metadata.decode.attn_mask  # type:ignore
             actual_seq_lengths = decode_meta.actual_seq_lengths_q
         else:
             if self.enable_kv_nz:
-                q_nope = q_nope.view(num_tokens, 1, self.num_heads, -1)
+                q_nope = q_nope.view(num_tokens, 1, self.num_heads,
+                                     -1).contiguous()
                 q_pe = q_pe.view(num_tokens, 1, self.num_heads, -1)
             else:
-                q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1)
+                q_nope = q_nope.view(num_tokens, self.num_heads, 1,
+                                     -1).contiguous()
                 q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1)
             sparse_mode = 0
             spec_attn_mask = None