add new accuracy test case for aclgraph (#3390)
### What this PR does / why we need it? Add a new accuracy test case, DeepSeek-V2-Lite-W8A8, for aclgraph. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit tests. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: lilinsiman <lilinsiman@gmail.com>
This commit is contained in:
1
.github/workflows/_e2e_test.yaml
vendored
1
.github/workflows/_e2e_test.yaml
vendored
@@ -177,6 +177,7 @@ jobs:
|
||||
pytest -sv tests/e2e/multicard/test_data_parallel.py
|
||||
pytest -sv tests/e2e/multicard/test_expert_parallel.py
|
||||
pytest -sv tests/e2e/multicard/test_external_launcher.py
|
||||
pytest -sv tests/e2e/multicard/test_single_request_aclgraph.py
|
||||
pytest -sv tests/e2e/multicard/test_fused_moe_allgather_ep.py
|
||||
pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
|
||||
|
||||
|
||||
@@ -21,6 +21,8 @@ Run `pytest tests/compile/test_aclgraph.py`.
|
||||
"""
|
||||
|
||||
import os
|
||||
import random
|
||||
import string
|
||||
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
@@ -30,6 +32,7 @@ from tests.e2e.model_utils import check_outputs_equal
|
||||
|
||||
MODELS = [
|
||||
"Qwen/Qwen3-0.6B",
|
||||
"vllm-ascend/DeepSeek-V2-Lite-W8A8",
|
||||
]
|
||||
|
||||
|
||||
@@ -45,20 +48,40 @@ def test_models_with_aclgraph(
|
||||
]
|
||||
|
||||
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
) as runner:
|
||||
vllm_aclgraph_outputs = runner.model.generate(prompts, sampling_params)
|
||||
if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
quantization="ascend",
|
||||
) as runner:
|
||||
vllm_aclgraph_outputs = runner.model.generate(
|
||||
prompts, sampling_params)
|
||||
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=True,
|
||||
) as runner:
|
||||
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=True,
|
||||
quantization="ascend",
|
||||
) as runner:
|
||||
vllm_eager_outputs = runner.model.generate(prompts,
|
||||
sampling_params)
|
||||
else:
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
) as runner:
|
||||
vllm_aclgraph_outputs = runner.model.generate(
|
||||
prompts, sampling_params)
|
||||
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=True,
|
||||
) as runner:
|
||||
vllm_eager_outputs = runner.model.generate(prompts,
|
||||
sampling_params)
|
||||
vllm_aclgraph_outputs_list = []
|
||||
for output in vllm_aclgraph_outputs:
|
||||
vllm_aclgraph_outputs_list.append(
|
||||
@@ -85,6 +108,9 @@ def test_models_with_aclgraph_full_decode_only(
|
||||
) -> None:
|
||||
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
|
||||
del os.environ['HCCL_OP_EXPANSION_MODE']
|
||||
# NOTE: Randomly fill the prompt with the requested amount for
|
||||
# the specified capture shape to prevent accuracy issues caused by padding
|
||||
random_number = random.choice(list(range(6, 47, 8)))
|
||||
prompts = [
|
||||
('Solve the following math problem step by step.'
|
||||
'The last line of your response should be of the form Answer: '
|
||||
@@ -110,6 +136,9 @@ def test_models_with_aclgraph_full_decode_only(
|
||||
'and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$'
|
||||
'and $x^2 + cx + b = 0$ also have a common real root.'
|
||||
'Compute the sum $a + b + c$.')
|
||||
] + [
|
||||
''.join(random.choices(string.ascii_lowercase, k=random.randint(
|
||||
1, 25))) for _ in range(random_number)
|
||||
]
|
||||
|
||||
sampling_params = SamplingParams(max_tokens=5,
|
||||
@@ -117,20 +146,42 @@ def test_models_with_aclgraph_full_decode_only(
|
||||
temperature=0.0,
|
||||
top_p=1.0,
|
||||
top_k=1)
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
|
||||
) as runner:
|
||||
vllm_aclgraph_outputs = runner.model.generate(prompts, sampling_params)
|
||||
if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
|
||||
quantization="ascend",
|
||||
) as runner:
|
||||
vllm_aclgraph_outputs = runner.model.generate(
|
||||
prompts, sampling_params)
|
||||
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=True,
|
||||
) as runner:
|
||||
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=True,
|
||||
quantization="ascend",
|
||||
) as runner:
|
||||
vllm_eager_outputs = runner.model.generate(prompts,
|
||||
sampling_params)
|
||||
else:
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
|
||||
) as runner:
|
||||
vllm_aclgraph_outputs = runner.model.generate(
|
||||
prompts, sampling_params)
|
||||
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=True,
|
||||
) as runner:
|
||||
vllm_eager_outputs = runner.model.generate(prompts,
|
||||
sampling_params)
|
||||
|
||||
vllm_aclgraph_outputs_list = []
|
||||
for output in vllm_aclgraph_outputs:
|
||||
|
||||
@@ -976,17 +976,20 @@ class AscendMLAImpl(MLAAttentionImpl):
|
||||
# Use TND layout for pure SpecDecoding and SpecDecoding in ChunkedPrefill
|
||||
input_layout = "TND"
|
||||
# [bs * q_seq_len, num_heads_per_rank, dim]
|
||||
q_nope = q_nope.view(num_tokens, self.num_heads, -1)
|
||||
# TODO: If the driver is upgraded later, the contiguous function can be deleted.
|
||||
q_nope = q_nope.view(num_tokens, self.num_heads, -1).contiguous()
|
||||
q_pe = q_pe.view(num_tokens, self.num_heads, -1)
|
||||
sparse_mode = 3
|
||||
spec_attn_mask = attn_metadata.decode.attn_mask # type:ignore
|
||||
actual_seq_lengths = decode_meta.actual_seq_lengths_q
|
||||
else:
|
||||
if self.enable_kv_nz:
|
||||
q_nope = q_nope.view(num_tokens, 1, self.num_heads, -1)
|
||||
q_nope = q_nope.view(num_tokens, 1, self.num_heads,
|
||||
-1).contiguous()
|
||||
q_pe = q_pe.view(num_tokens, 1, self.num_heads, -1)
|
||||
else:
|
||||
q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1)
|
||||
q_nope = q_nope.view(num_tokens, self.num_heads, 1,
|
||||
-1).contiguous()
|
||||
q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1)
|
||||
sparse_mode = 0
|
||||
spec_attn_mask = None
|
||||
|
||||
Reference in New Issue
Block a user