xc-llm-ascend/tests/e2e/singlecard/test_aclgraph_accuracy.py

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest

from tests.e2e.singlecard.utils import (PROMPTS_LONG, PROMPTS_SHORT,
                                        LLMTestCase, gen_and_valid)

CASE_QWEN_ACLGRAPH = LLMTestCase(
    model="Qwen/Qwen3-0.6B",
    prompts=PROMPTS_SHORT,
    golden_answers=[
        " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I want to know if there are any",
        ' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
        ' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of',
        ' not just a technological frontier but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and'
    ],
)

CASE_DS_ACLGRAPH = LLMTestCase(
    model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
    quantization="ascend",
    prompts=PROMPTS_SHORT,
    golden_answers=[
        '\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2',
        ' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the',
        ' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art',
        ' here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of'
    ],
)

CASE_QWEN_FULL_DECODE_ONLY = LLMTestCase(
    model="Qwen/Qwen3-0.6B",
    prompts=PROMPTS_LONG,
    golden_answers=[
        ' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
        " \n\nTo solve this problem, we can use the following approach: Let $P$ be the perimeter of the square. Then, the expected value of the area",
        ' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations $x^2 +'
    ])

CASE_DS_FULL_DECODE_ONLY = LLMTestCase(
    model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
    quantization="ascend",
    prompts=PROMPTS_LONG,
    golden_answers=[
        "\n\nSelect an assignment template",
        "\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
        "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations"
    ])

CASE_QWEN_EX = LLMTestCase(
    model="Qwen/Qwen3-0.6B",
    prompts=PROMPTS_LONG,
    golden_answers=[
        ' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
        " \n\nTo solve this problem, we can use the following approach: Let $P$ be the perimeter of the square. Then, the expected value of the area",
        ' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can'
    ])

CASE_DS_EX = LLMTestCase(model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
                         quantization="ascend",
                         prompts=PROMPTS_LONG,
                         golden_answers=[
                             "\n\nSelect an assignment template",
                             "\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
                             "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations"
                         ])

@pytest.mark.parametrize("cur_case", [CASE_QWEN_ACLGRAPH, CASE_DS_ACLGRAPH])
def test_piecewise_res_consistency(cur_case: LLMTestCase):
    runner_kwargs = {
        "model_name": cur_case.model,
        "max_model_len": 1024,
        "cudagraph_capture_sizes": [1, 2, 4, 8],
        "quantization": cur_case.quantization,
    }
    gen_and_valid(runner_kwargs=runner_kwargs,
                  prompts=cur_case.prompts,
                  sampling_params=cur_case.sampling_params,
                  golden_answers=cur_case.golden_answers)


@pytest.mark.parametrize(
    "cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY])
def test_full_decode_only_res_consistency(cur_case: LLMTestCase, monkeypatch):
    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
    runner_kwargs = {
        "model_name": cur_case.model,
        "max_model_len": 1024,
        "compilation_config": {
            "cudagraph_capture_sizes": [4, 8, 32, 64],
            "cudagraph_mode": "FULL_DECODE_ONLY"
        },
        "quantization": cur_case.quantization,
    }
    gen_and_valid(runner_kwargs=runner_kwargs,
                  prompts=cur_case.prompts,
                  sampling_params=cur_case.sampling_params,
                  golden_answers=cur_case.golden_answers)

@pytest.mark.parametrize("cur_case", [CASE_QWEN_EX, CASE_DS_EX])
def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch):
    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
    runner_kwargs = {
        "model_name": cur_case.model,
        "quantization": cur_case.quantization,
        "max_model_len": 1024,
        "compilation_config": {
            "cudagraph_capture_sizes": [4, 8, 32, 64],
            "cudagraph_mode": "FULL_DECODE_ONLY"
        },
        "additional_config": {
            "npugraph_ex_config": {
                "enable": True
            }
        },
    }
    gen_and_valid(runner_kwargs=runner_kwargs,
                  prompts=cur_case.prompts,
                  sampling_params=cur_case.sampling_params,
                  golden_answers=cur_case.golden_answers)
-												[aclgraph] implentment NPUPiecewiseBackend to enable aclgraph (#836)

### What this PR does / why we need it?
1. Implentment `NPUPiecewiseBackend` to enable aclgraph
2. Eable aclgraph by default in V1, but raise error when running
deepseek and raise warning when running models except for qwen

### How was this patch tested?
CI pass with the new ut

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
											
										
										
											2025-05-29 11:58:26 +08:00
+								#
 								# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 								# Copyright 2023 The vLLM team.
 								#
 								# Licensed under the Apache License, Version 2.0 (the "License");
 								# you may not use this file except in compliance with the License.
 								# You may obtain a copy of the License at
 								#
 								#     http://www.apache.org/licenses/LICENSE-2.0
 								#
 								# Unless required by applicable law or agreed to in writing, software
 								# distributed under the License is distributed on an "AS IS" BASIS,
 								# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								# See the License for the specific language governing permissions and
 								# limitations under the License.
 								#
-												[Feat][Graph]Support FULL_DECEDE_ONLY mode for MLA models (#3125)

### What this PR does / why we need it?
Adds support for capturing the Multi-Layer Attention (MLA) decode
operation into an ACL graph. This improves performance by compiling the
attention kernel for single-token decoding.

Key changes include:
- Implementing the graph capture logic for the MLA kernel, including
workspace management and parameter updates.
- Modifying the rotary embedding (RoPE) handling to use pre-allocated
tensors, which is a requirement for graph capture.
- Adding a `build_for_graph_capture` method to the MLA metadata builder
to create dummy metadata during the graph compilation phase.

Known issues:
- Currently, MTP is not supported in FULL_DECEDE_ONLY mode -- we're
working on a fix
- We are preparing to remove update_mla_attn_params with
auto_dispatch_capture

### Does this PR introduce _any_ user-facing change?
compilation_config={
    "cudagraph_mode": "FULL_DECODE_ONLY",
},
### How was this patch tested?


- vLLM version: v0.11.0

---------

Signed-off-by: panchao-hub <315134829@qq.com>
Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
											
										
										
											2025-10-10 16:31:20 +08:00
-												[aclgraph] implentment NPUPiecewiseBackend to enable aclgraph (#836)

### What this PR does / why we need it?
1. Implentment `NPUPiecewiseBackend` to enable aclgraph
2. Eable aclgraph by default in V1, but raise error when running
deepseek and raise warning when running models except for qwen

### How was this patch tested?
CI pass with the new ut

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
											
										
										
											2025-05-29 11:58:26 +08:00
+								import pytest
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								from tests.e2e.singlecard.utils import (PROMPTS_LONG, PROMPTS_SHORT,
 								                                        LLMTestCase, gen_and_valid)
-												[aclgraph] implentment NPUPiecewiseBackend to enable aclgraph (#836)

### What this PR does / why we need it?
1. Implentment `NPUPiecewiseBackend` to enable aclgraph
2. Eable aclgraph by default in V1, but raise error when running
deepseek and raise warning when running models except for qwen

### How was this patch tested?
CI pass with the new ut

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
											
										
										
											2025-05-29 11:58:26 +08:00
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								CASE_QWEN_ACLGRAPH = LLMTestCase(
 								    model="Qwen/Qwen3-0.6B",
 								    prompts=PROMPTS_SHORT,
 								    golden_answers=[
-												[BugFix][Fusion] Patch compile backend to make fusion available (#5308)

Currently, the vllm pr: https://github.com/vllm-project/vllm/pull/24252
is causing operator fusion to fail, which can be mitigated by patching
the backend. Once the problem is completely resolved, I will submit a
new pull request to remove the patch.

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/5fbfa8d9ef15948599631baeb91e8220b2ee9bcc
---------
Signed-off-by: wxsIcey <1790571317@qq.com>
											
										
										
											2025-12-26 09:18:16 +08:00
+								        " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I want to know if there are any",
 								        ' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
 								        ' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of',
 								        ' not just a technological frontier but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and'
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								    ],
 								)
 								CASE_DS_ACLGRAPH = LLMTestCase(
 								    model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
 								    quantization="ascend",
 								    prompts=PROMPTS_SHORT,
 								    golden_answers=[
-												[OP] Enable custom op aclnnMoeInitRoutingCustom (#5332)

### What this PR does / why we need it?
This PR enables custom op `aclnnMoeInitRoutingCustom` introduced in PR
#5251

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/bc0a5a0c089844b17cb93f3294348f411e523586

---------

Signed-off-by: QianChenxi <chenxi.qian.cq@outlook.com>
Signed-off-by: zzzzwwjj <1183291235@qq.com>
Co-authored-by: zzzzwwjj <1183291235@qq.com>
											
										
										
											2026-01-09 09:35:18 +08:00
+								        '\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2',
-												[BugFix][Fusion] Patch compile backend to make fusion available (#5308)

Currently, the vllm pr: https://github.com/vllm-project/vllm/pull/24252
is causing operator fusion to fail, which can be mitigated by patching
the backend. Once the problem is completely resolved, I will submit a
new pull request to remove the patch.

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/5fbfa8d9ef15948599631baeb91e8220b2ee9bcc
---------
Signed-off-by: wxsIcey <1790571317@qq.com>
											
										
										
											2025-12-26 09:18:16 +08:00
+								        ' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the',
 								        ' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art',
-												[OP] Enable custom op aclnnMoeInitRoutingCustom (#5332)

### What this PR does / why we need it?
This PR enables custom op `aclnnMoeInitRoutingCustom` introduced in PR
#5251

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/bc0a5a0c089844b17cb93f3294348f411e523586

---------

Signed-off-by: QianChenxi <chenxi.qian.cq@outlook.com>
Signed-off-by: zzzzwwjj <1183291235@qq.com>
Co-authored-by: zzzzwwjj <1183291235@qq.com>
											
										
										
											2026-01-09 09:35:18 +08:00
+								        ' here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of'
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								    ],
 								)
-												[BugFix][Fusion] Patch compile backend to make fusion available (#5308)

Currently, the vllm pr: https://github.com/vllm-project/vllm/pull/24252
is causing operator fusion to fail, which can be mitigated by patching
the backend. Once the problem is completely resolved, I will submit a
new pull request to remove the patch.

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/5fbfa8d9ef15948599631baeb91e8220b2ee9bcc
---------
Signed-off-by: wxsIcey <1790571317@qq.com>
											
										
										
											2025-12-26 09:18:16 +08:00
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								CASE_QWEN_FULL_DECODE_ONLY = LLMTestCase(
 								    model="Qwen/Qwen3-0.6B",
 								    prompts=PROMPTS_LONG,
 								    golden_answers=[
-												[Fix] fix aclgraph e2e test. (#4131)

### What this PR does / why we need it?
Due to the inconsistency between the attention operators used in eager
mode and graph mode, the accumulation order of the operator cannot be
guaranteed to be deterministic. Therefore, we modify the test to compare
with given outputs.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/2918c1b49c88c29783c86f78d2c4221cb9622379

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
											
										
										
											2025-11-24 17:22:03 +08:00
+								        ' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
-												[CI] Migrate e2e test runner to hk (#5344)

### What this PR does / why we need it?
This patch add new runner labels for the HK region, and e2e single-card
testing has been migrated to this runner.

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/bc0a5a0c089844b17cb93f3294348f411e523586

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-26 09:00:51 +08:00
+								        " \n\nTo solve this problem, we can use the following approach: Let $P$ be the perimeter of the square. Then, the expected value of the area",
 								        ' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations $x^2 +'
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								    ])
-												[Fix] fix aclgraph e2e test. (#4131)

### What this PR does / why we need it?
Due to the inconsistency between the attention operators used in eager
mode and graph mode, the accumulation order of the operator cannot be
guaranteed to be deterministic. Therefore, we modify the test to compare
with given outputs.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/2918c1b49c88c29783c86f78d2c4221cb9622379

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
											
										
										
											2025-11-24 17:22:03 +08:00
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								CASE_DS_FULL_DECODE_ONLY = LLMTestCase(
 								    model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
 								    quantization="ascend",
 								    prompts=PROMPTS_LONG,
 								    golden_answers=[
-												[feature] add_rms_norm support bias (#5790)

### What this PR does / why we need it?
This PR is to replace addRmsNorm and Add With addRmsNormBias. This way
can lead to a more effecient result.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Full Test Pass

- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d

Signed-off-by: Chen_HaoWen <chenhaowen12@huawei.com>
Co-authored-by: Chen_HaoWen <chenhaowen12@huawei.com>
											
										
										
											2026-01-23 21:09:54 +08:00
+								        "\n\nSelect an assignment template",
 								        "\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
 								        "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations"
-												[Fix] fix aclgraph e2e test. (#4131)

### What this PR does / why we need it?
Due to the inconsistency between the attention operators used in eager
mode and graph mode, the accumulation order of the operator cannot be
guaranteed to be deterministic. Therefore, we modify the test to compare
with given outputs.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/2918c1b49c88c29783c86f78d2c4221cb9622379

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
											
										
										
											2025-11-24 17:22:03 +08:00
+								    ])
-												[Feat][Graph]Support FULL_DECEDE_ONLY mode for MLA models (#3125)

### What this PR does / why we need it?
Adds support for capturing the Multi-Layer Attention (MLA) decode
operation into an ACL graph. This improves performance by compiling the
attention kernel for single-token decoding.

Key changes include:
- Implementing the graph capture logic for the MLA kernel, including
workspace management and parameter updates.
- Modifying the rotary embedding (RoPE) handling to use pre-allocated
tensors, which is a requirement for graph capture.
- Adding a `build_for_graph_capture` method to the MLA metadata builder
to create dummy metadata during the graph compilation phase.

Known issues:
- Currently, MTP is not supported in FULL_DECEDE_ONLY mode -- we're
working on a fix
- We are preparing to remove update_mla_attn_params with
auto_dispatch_capture

### Does this PR introduce _any_ user-facing change?
compilation_config={
    "cudagraph_mode": "FULL_DECODE_ONLY",
},
### How was this patch tested?


- vLLM version: v0.11.0

---------

Signed-off-by: panchao-hub <315134829@qq.com>
Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
											
										
										
											2025-10-10 16:31:20 +08:00
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								CASE_QWEN_EX = LLMTestCase(
 								    model="Qwen/Qwen3-0.6B",
 								    prompts=PROMPTS_LONG,
 								    golden_answers=[
-												enable npugraph_ex (#5120)

### What this PR does / why we need it?
We will expose the enabling switch for npugraph_ex to better facilitate
subsequent optimization.

### Does this PR introduce _any_ user-facing change?
Previously, the enable_npugraph_ex switch would trigger an error; now we
have removed the error reporting mechanism to better facilitate
subsequent optimization efforts.
Basic functionalities are available in CANN and torch_npu for Q3, while
advanced optimizations will depend on the Q4 release.

### How was this patch tested?
llm =LLM(
    model=model,
    enforce_eager=False ,
        additional_config={
        "enable_npugraph_ex":  True
        },
        compilation_config={
            "cudagraph_mode": "FULL_DECODE_ONLY",
            "cudagraph_capture_sizes": [16],
        },
}


- vLLM version: v0.12.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

---------

Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
Co-authored-by: weijinqian0 <1184188277@qq.com>
											
										
										
											2025-12-18 09:08:40 +08:00
+								        ' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
-												[CI] Migrate e2e test runner to hk (#5344)

### What this PR does / why we need it?
This patch add new runner labels for the HK region, and e2e single-card
testing has been migrated to this runner.

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/bc0a5a0c089844b17cb93f3294348f411e523586

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-26 09:00:51 +08:00
+								        " \n\nTo solve this problem, we can use the following approach: Let $P$ be the perimeter of the square. Then, the expected value of the area",
-												enable npugraph_ex (#5120)

### What this PR does / why we need it?
We will expose the enabling switch for npugraph_ex to better facilitate
subsequent optimization.

### Does this PR introduce _any_ user-facing change?
Previously, the enable_npugraph_ex switch would trigger an error; now we
have removed the error reporting mechanism to better facilitate
subsequent optimization efforts.
Basic functionalities are available in CANN and torch_npu for Q3, while
advanced optimizations will depend on the Q4 release.

### How was this patch tested?
llm =LLM(
    model=model,
    enforce_eager=False ,
        additional_config={
        "enable_npugraph_ex":  True
        },
        compilation_config={
            "cudagraph_mode": "FULL_DECODE_ONLY",
            "cudagraph_capture_sizes": [16],
        },
}


- vLLM version: v0.12.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

---------

Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
Co-authored-by: weijinqian0 <1184188277@qq.com>
											
										
										
											2025-12-18 09:08:40 +08:00
+								        ' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can'
 								    ])
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								CASE_DS_EX = LLMTestCase(model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
 								                         quantization="ascend",
 								                         prompts=PROMPTS_LONG,
 								                         golden_answers=[
-												[feature] add_rms_norm support bias (#5790)

### What this PR does / why we need it?
This PR is to replace addRmsNorm and Add With addRmsNormBias. This way
can lead to a more effecient result.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Full Test Pass

- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d

Signed-off-by: Chen_HaoWen <chenhaowen12@huawei.com>
Co-authored-by: Chen_HaoWen <chenhaowen12@huawei.com>
											
										
										
											2026-01-23 21:09:54 +08:00
+								                             "\n\nSelect an assignment template",
 								                             "\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
 								                             "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations"
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								                         ])
 								@pytest.mark.parametrize("cur_case", [CASE_QWEN_ACLGRAPH, CASE_DS_ACLGRAPH])
 								def test_piecewise_res_consistency(cur_case: LLMTestCase):
 								    runner_kwargs = {
 								        "model_name": cur_case.model,
 								        "max_model_len": 1024,
 								        "cudagraph_capture_sizes": [1, 2, 4, 8],
 								        "quantization": cur_case.quantization,
 								    }
 								    gen_and_valid(runner_kwargs=runner_kwargs,
 								                  prompts=cur_case.prompts,
 								                  sampling_params=cur_case.sampling_params,
 								                  golden_answers=cur_case.golden_answers)
 								@pytest.mark.parametrize(
 								    "cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY])
 								def test_full_decode_only_res_consistency(cur_case: LLMTestCase, monkeypatch):
 								    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
 								    runner_kwargs = {
 								        "model_name": cur_case.model,
 								        "max_model_len": 1024,
 								        "compilation_config": {
 								            "cudagraph_capture_sizes": [4, 8, 32, 64],
 								            "cudagraph_mode": "FULL_DECODE_ONLY"
 								        },
 								        "quantization": cur_case.quantization,
 								    }
 								    gen_and_valid(runner_kwargs=runner_kwargs,
 								                  prompts=cur_case.prompts,
 								                  sampling_params=cur_case.sampling_params,
 								                  golden_answers=cur_case.golden_answers)
-												[CI] Re-open skipped cases due to PTA upgrading and update the golden results (#6144)

### What this PR does / why we need it?
Re-open `tests/e2e/singlecard/test_aclgraph_accuracy.py` and update its
golden results to match PTA 2.9.0

- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/d68209402ddab3f54a09bc1f4de9a9495a283b60

Signed-off-by: wjunLu <wjunlu217@gmail.com>
											
										
										
											2026-01-23 10:46:31 +08:00
+								@pytest.mark.parametrize("cur_case", [CASE_QWEN_EX, CASE_DS_EX])
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch):
 								    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
 								    runner_kwargs = {
 								        "model_name": cur_case.model,
 								        "quantization": cur_case.quantization,
 								        "max_model_len": 1024,
 								        "compilation_config": {
 								            "cudagraph_capture_sizes": [4, 8, 32, 64],
 								            "cudagraph_mode": "FULL_DECODE_ONLY"
 								        },
 								        "additional_config": {
-												[Feature]refactor the npugraph_ex config, support online-infer with static kernel (#5775)

### What this PR does / why we need it?
This is a part of
https://github.com/vllm-project/vllm-ascend/issues/4715#issue-3694310762
1. refactor the npugraph_ex config，modified the default configuration of
the static kernel, new default value of static kernel is false
2. support online-infer with static kernel
3. fixed the issue where manually modifying FX graphs caused an abnormal
model return type, and removed the related redundant code.

### Does this PR introduce _any_ user-facing change?
yes，the new config of npugraph_ex is as follow:
```
additional_config={
            "npugraph_ex_config": {
                "enable": True,
                "enable_static_kernel": False
            }
        }
```
### How was this patch tested?
```
vllm serve /data/DeepSeek-V3.1-Terminus-w4a8 \
    --host 0.0.0.0 \
    --port 8004 \
    --data-parallel-size 4 \
    --tensor-parallel-size 4 \
    --quantization ascend \
    --seed 1024 \
    --served-model-name deepseek_v3 \
    --enable-expert-parallel \
    --max-num-seqs 48 \
    --max-model-len 40000 \
    --async-scheduling \
    --max-num-batched-tokens 9000 \
    --trust-remote-code \
    --no-enable-prefix-caching \
    --speculative-config '{"num_speculative_tokens": 3, "method":"deepseek_mtp","disable_padded_drafter_batch": false}' \
    --gpu-memory-utilization 0.9 \
    --compilation-config '{"cudagraph_capture_sizes":[4,32,64,112,160,176,192], "cudagraph_mode": "FULL_DECODE_ONLY"}' \
    --additional-config \
    '{"enable_shared_expert_dp": true,"multistream_overlap_shared_expert": true,"npugraph_ex_config":{"enable":true}}'
```

- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d

---------

Signed-off-by: chencangtao <chencangtao@huawei.com>
Signed-off-by: ChenCangtao <50493711+ChenCangtao@users.noreply.github.com>
Co-authored-by: chencangtao <chencangtao@huawei.com>
											
										
										
											2026-01-20 21:31:38 +08:00
+								            "npugraph_ex_config": {
 								                "enable": True
 								            }
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								        },
 								    }
 								    gen_and_valid(runner_kwargs=runner_kwargs,
 								                  prompts=cur_case.prompts,
 								                  sampling_params=cur_case.sampling_params,
 								                  golden_answers=cur_case.golden_answers)