xc-llm-ascend/tests/e2e/singlecard/test_aclgraph_accuracy.py

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest
import os

from tests.e2e.singlecard.utils import (PROMPTS_LONG, PROMPTS_SHORT,
                                        LLMTestCase, gen_and_valid)

# Golden case for Qwen/Qwen3-0.6B on PROMPTS_SHORT; used by
# test_piecewise_res_consistency. The golden_answers are handed to
# gen_and_valid unchanged -- presumably matched verbatim against the generated
# text, so keep leading whitespace intact (verify against gen_and_valid).
CASE_QWEN_ACLGRAPH = LLMTestCase(
    model="Qwen/Qwen3-0.6B",
    prompts=PROMPTS_SHORT,
    golden_answers=[
        " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
        ' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
        ' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of',
        ' not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and'
    ],
)

# Golden case for the W8A8-quantized DeepSeek-V2-Lite checkpoint on
# PROMPTS_SHORT; used by test_piecewise_res_consistency. quantization="ascend"
# is forwarded to the runner by the test.
CASE_DS_ACLGRAPH = LLMTestCase(
    model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
    quantization="ascend",
    prompts=PROMPTS_SHORT,
    golden_answers=[
        '\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2',
        ' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the',
        ' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art',
        ' here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of'
    ],
)

# Golden case for Qwen/Qwen3-0.6B on PROMPTS_SHORT; used by
# test_full_res_consistency. The golden answers are currently identical to
# those of CASE_QWEN_ACLGRAPH, but are kept as a separate case so each graph
# configuration can be re-baselined independently.
CASE_QWEN_FULL = LLMTestCase(
    model="Qwen/Qwen3-0.6B",
    prompts=PROMPTS_SHORT,
    golden_answers=[
        " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
        ' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
        ' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of',
        ' not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and'
    ],
)

# Golden case for the W8A8-quantized DeepSeek-V2-Lite checkpoint on
# PROMPTS_SHORT; used by test_full_res_consistency. The golden answers are
# currently identical to those of CASE_DS_ACLGRAPH.
CASE_DS_FULL = LLMTestCase(
    model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
    quantization="ascend",
    prompts=PROMPTS_SHORT,
    golden_answers=[
        '\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2',
        ' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the',
        ' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art',
        ' here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of'
    ],
)

# Golden case for Qwen/Qwen3-0.6B on PROMPTS_LONG (three expected
# completions); used by test_full_decode_only_res_consistency.
CASE_QWEN_FULL_DECODE_ONLY = LLMTestCase(
    model="Qwen/Qwen3-0.6B",
    prompts=PROMPTS_LONG,
    golden_answers=[
        ' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
        " \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle with vertices on a square can be calculated by integrating over",
        ' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can'
    ])

# Golden case for the W8A8-quantized DeepSeek-V2-Lite checkpoint on
# PROMPTS_LONG; used by test_full_decode_only_res_consistency. Note that the
# first expected completion is much shorter than the others.
CASE_DS_FULL_DECODE_ONLY = LLMTestCase(
    model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
    quantization="ascend",
    prompts=PROMPTS_LONG,
    golden_answers=[
        "\n\nSelect an assignment template",
        "\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
        "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations"
    ])

# Golden case for Qwen/Qwen3-0.6B on PROMPTS_LONG with npugraph_ex enabled;
# used by test_npugraph_ex_res_consistency and
# test_npugraph_ex_with_static_kernel. The golden answers are currently
# identical to those of CASE_QWEN_FULL_DECODE_ONLY.
CASE_QWEN_EX = LLMTestCase(
    model="Qwen/Qwen3-0.6B",
    prompts=PROMPTS_LONG,
    golden_answers=[
        ' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
        " \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle with vertices on a square can be calculated by integrating over",
        ' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can'
    ])

# Golden case for the W8A8-quantized DeepSeek-V2-Lite checkpoint on
# PROMPTS_LONG with npugraph_ex enabled; used by
# test_npugraph_ex_res_consistency. The golden answers are currently identical
# to those of CASE_DS_FULL_DECODE_ONLY.
CASE_DS_EX = LLMTestCase(model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
                         quantization="ascend",
                         prompts=PROMPTS_LONG,
                         golden_answers=[
                             "\n\nSelect an assignment template",
                             "\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
                             "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations"
                         ])

@pytest.mark.parametrize("cur_case", [CASE_QWEN_ACLGRAPH, CASE_DS_ACLGRAPH])
def test_piecewise_res_consistency(cur_case: LLMTestCase):
    """Generate with pinned capture sizes and check against the golden answers.

    No ``compilation_config`` is supplied, so only the capture sizes are fixed
    here; the graph mode is left to the runner's default (presumably the
    piecewise mode the test name refers to -- TODO confirm).
    """
    engine_options = dict(
        model_name=cur_case.model,
        max_model_len=1024,
        cudagraph_capture_sizes=[1, 2, 4, 8],
        quantization=cur_case.quantization,
    )
    gen_and_valid(
        runner_kwargs=engine_options,
        prompts=cur_case.prompts,
        sampling_params=cur_case.sampling_params,
        golden_answers=cur_case.golden_answers,
    )

@pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL, CASE_DS_FULL])
def test_full_res_consistency(cur_case: LLMTestCase, monkeypatch):
    """Run the short prompts under a full-graph configuration and validate them.

    NOTE(review): despite the test name, ``cudagraph_mode`` is set to
    "FULL_DECODE_ONLY" here -- confirm whether "FULL" was intended.
    """
    # Make sure HCCL_OP_EXPANSION_MODE is unset for this test; monkeypatch
    # restores the previous value afterwards.
    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)

    graph_settings = {
        "cudagraph_capture_sizes": [4, 8, 32, 64],
        "cudagraph_mode": "FULL_DECODE_ONLY",
    }
    engine_options = dict(
        model_name=cur_case.model,
        max_model_len=1024,
        compilation_config=graph_settings,
        quantization=cur_case.quantization,
    )
    gen_and_valid(
        runner_kwargs=engine_options,
        prompts=cur_case.prompts,
        sampling_params=cur_case.sampling_params,
        golden_answers=cur_case.golden_answers,
    )

@pytest.mark.parametrize(
    "cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY])
def test_full_decode_only_res_consistency(cur_case: LLMTestCase, monkeypatch):
    """Validate FULL_DECODE_ONLY output on long prompts with npugraph_ex off."""
    # Make sure HCCL_OP_EXPANSION_MODE is unset for this test; monkeypatch
    # restores the previous value afterwards.
    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)

    graph_settings = {
        "cudagraph_capture_sizes": [4, 8, 32, 64],
        "cudagraph_mode": "FULL_DECODE_ONLY",
    }
    # npugraph_ex is explicitly disabled so this case exercises the plain
    # FULL_DECODE_ONLY path.
    ascend_settings = {"npugraph_ex_config": {"enable": False}}

    engine_options = dict(
        model_name=cur_case.model,
        max_model_len=1024,
        compilation_config=graph_settings,
        quantization=cur_case.quantization,
        additional_config=ascend_settings,
    )
    gen_and_valid(
        runner_kwargs=engine_options,
        prompts=cur_case.prompts,
        sampling_params=cur_case.sampling_params,
        golden_answers=cur_case.golden_answers,
    )

@pytest.mark.parametrize("cur_case", [CASE_QWEN_EX, CASE_DS_EX])
def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch):
    """Validate FULL_DECODE_ONLY output on long prompts with npugraph_ex on."""
    # Make sure HCCL_OP_EXPANSION_MODE is unset for this test; monkeypatch
    # restores the previous value afterwards.
    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)

    graph_settings = {
        "cudagraph_capture_sizes": [4, 8, 32, 64],
        "cudagraph_mode": "FULL_DECODE_ONLY",
    }
    ascend_settings = {"npugraph_ex_config": {"enable": True}}

    engine_options = dict(
        model_name=cur_case.model,
        quantization=cur_case.quantization,
        max_model_len=1024,
        compilation_config=graph_settings,
        additional_config=ascend_settings,
    )
    gen_and_valid(
        runner_kwargs=engine_options,
        prompts=cur_case.prompts,
        sampling_params=cur_case.sampling_params,
        golden_answers=cur_case.golden_answers,
    )

# Accuracy itself is already covered by the previous test case. This one
# checks that generation still works once the static kernel is enabled, and
# that the static kernel has been uninstalled again afterwards.
@pytest.mark.parametrize("cur_case", [CASE_QWEN_EX])
def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch):
    """Run npugraph_ex with the static kernel enabled, then check its cleanup.

    Requires the ``ASCEND_HOME_PATH`` environment variable; a missing variable
    surfaces as a ``KeyError`` from ``os.environ``.
    """
    # Make sure HCCL_OP_EXPANSION_MODE is unset for this test; monkeypatch
    # restores the previous value afterwards.
    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)

    graph_settings = {
        "cudagraph_capture_sizes": [4, 8],
        "cudagraph_mode": "FULL_DECODE_ONLY",
    }
    ascend_settings = {
        "npugraph_ex_config": {
            "enable": True,
            "enable_static_kernel": True,
        },
    }
    engine_options = dict(
        model_name=cur_case.model,
        quantization=cur_case.quantization,
        max_model_len=1024,
        compilation_config=graph_settings,
        additional_config=ascend_settings,
    )
    gen_and_valid(
        runner_kwargs=engine_options,
        prompts=cur_case.prompts,
        sampling_params=cur_case.sampling_params,
        golden_answers=cur_case.golden_answers,
    )

    # The static kernel must have been uninstalled by now: its install
    # directory under the Ascend home must no longer exist.
    install_dir = os.path.join(os.environ["ASCEND_HOME_PATH"],
                               "opp/static_kernel/ai_core")
    assert not os.path.exists(install_dir)
-												[aclgraph] implentment NPUPiecewiseBackend to enable aclgraph (#836)

### What this PR does / why we need it?
1. Implement `NPUPiecewiseBackend` to enable aclgraph
2. Enable aclgraph by default in V1, but raise an error when running
DeepSeek and a warning when running models other than Qwen

### How was this patch tested?
CI pass with the new ut

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
											
										
										
											2025-05-29 11:58:26 +08:00
+								#
 								# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 								# Copyright 2023 The vLLM team.
 								#
 								# Licensed under the Apache License, Version 2.0 (the "License");
 								# you may not use this file except in compliance with the License.
 								# You may obtain a copy of the License at
 								#
 								#     http://www.apache.org/licenses/LICENSE-2.0
 								#
 								# Unless required by applicable law or agreed to in writing, software
 								# distributed under the License is distributed on an "AS IS" BASIS,
 								# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								# See the License for the specific language governing permissions and
 								# limitations under the License.
 								#
-												[Feat][Graph]Support FULL_DECEDE_ONLY mode for MLA models (#3125)

### What this PR does / why we need it?
Adds support for capturing the Multi-head Latent Attention (MLA) decode
operation into an ACL graph. This improves performance by compiling the
attention kernel for single-token decoding.

Key changes include:
- Implementing the graph capture logic for the MLA kernel, including
workspace management and parameter updates.
- Modifying the rotary embedding (RoPE) handling to use pre-allocated
tensors, which is a requirement for graph capture.
- Adding a `build_for_graph_capture` method to the MLA metadata builder
to create dummy metadata during the graph compilation phase.

Known issues:
- Currently, MTP is not supported in FULL_DECODE_ONLY mode -- we're
working on a fix
- We are preparing to remove update_mla_attn_params with
auto_dispatch_capture

### Does this PR introduce _any_ user-facing change?
compilation_config={
    "cudagraph_mode": "FULL_DECODE_ONLY",
},
### How was this patch tested?


- vLLM version: v0.11.0

---------

Signed-off-by: panchao-hub <315134829@qq.com>
Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
											
										
										
											2025-10-10 16:31:20 +08:00
-												[aclgraph] implentment NPUPiecewiseBackend to enable aclgraph (#836)

### What this PR does / why we need it?
1. Implement `NPUPiecewiseBackend` to enable aclgraph
2. Enable aclgraph by default in V1, but raise an error when running
DeepSeek and a warning when running models other than Qwen

### How was this patch tested?
CI pass with the new ut

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
											
										
										
											2025-05-29 11:58:26 +08:00
+								import pytest
-												[e2e Test][npugraph_ex]add static kernel e2e test case (#6320)

### What this PR does / why we need it?
Added an E2E test case for the scenario of enabling a static kernel for
npugraph_ex, monitoring its compilation and unloading process.
Also fixed the previously existing spelling errors

- vLLM version: v0.14.1
- vLLM main:
https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd

---------

Signed-off-by: chencangtao <chencangtao@huawei.com>
Co-authored-by: chencangtao <chencangtao@huawei.com>
											
										
										
											2026-01-30 16:24:48 +08:00
+								import os
-												[aclgraph] implentment NPUPiecewiseBackend to enable aclgraph (#836)

### What this PR does / why we need it?
1. Implement `NPUPiecewiseBackend` to enable aclgraph
2. Enable aclgraph by default in V1, but raise an error when running
DeepSeek and a warning when running models other than Qwen

### How was this patch tested?
CI pass with the new ut

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
											
										
										
											2025-05-29 11:58:26 +08:00
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								from tests.e2e.singlecard.utils import (PROMPTS_LONG, PROMPTS_SHORT,
 								                                        LLMTestCase, gen_and_valid)
-												[aclgraph] implentment NPUPiecewiseBackend to enable aclgraph (#836)

### What this PR does / why we need it?
1. Implement `NPUPiecewiseBackend` to enable aclgraph
2. Enable aclgraph by default in V1, but raise an error when running
DeepSeek and a warning when running models other than Qwen

### How was this patch tested?
CI pass with the new ut

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
											
										
										
											2025-05-29 11:58:26 +08:00
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								CASE_QWEN_ACLGRAPH = LLMTestCase(
 								    model="Qwen/Qwen3-0.6B",
 								    prompts=PROMPTS_SHORT,
 								    golden_answers=[
-												[Main][Ops] Make triton rope support index_selecting from cos_sin_cache (#5450)

### What this PR does / why we need it?

This PR extends original `rope_triton_forward` and
`split_qkv_rmsnorm_rope` to support `cos_sin_cache` && `positions` as
inputs. This fully aligns to vLLM RoPE api interface. Compared with
earlier implementation for RoPE, the benefits are:

1. avoiding pre-computation of `cos` and `sin` before model execution, which
helps to remove redundant code.
2. allowing the eagle3 draft model to have different RoPE parameters from the
main model (see #6612). This helps to recover acceptance rate and accuracy in
that case.

In addition, this kernel change only introduces very small performance
degradation. Those `index_select` or `chunk` operations are now changed
into simple memory access in triton kernel (For example,
https://github.com/vllm-project/vllm-ascend/pull/5450/changes#diff-a4c2d3071530df193b98f9bf38553874bc4d47571336711f116c26d019cfbb6aR77-R81).

**Highlights**

- **RoPE Cache Unification**: Replaced separate _sin and _cos global
tensors with a unified cos_sin_cache and explicit positions tensor for
Rotary Positional Embeddings (RoPE), streamlining data handling.
- **Triton Kernel Integration**: Updated Triton kernels
(split_qkv_rmsnorm_rope_kernel, _triton_rope) to directly consume the
cos_sin_cache and positions for more efficient and integrated RoPE
calculations.
- **Custom Operation Registration**: Registered `rope_forward_oot` as a
new custom operation, allowing its use in fused compilation passes and
providing a dedicated entry point for the new RoPE implementation.
- **Refactored RoPE Forward Pass**: Modified the rope_forward_oot
function to accept the new cos_sin_cache and positions arguments,
enabling a more flexible and integrated RoPE application within the
system.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/5326c89803566a131c928f7fdd2100b75c981a42

Additional test on Qwen3-235b accuracy:

| Aime2024 | GSM8K | Livecodebench |
| -------- | -------- | -------- |
| 83.33 | 96.26 | 70.23 |

---------

Signed-off-by: Angazenn <supperccell@163.com>
											
										
										
											2026-02-11 21:20:53 +08:00
+								        " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
-												[BugFix][Fusion] Patch compile backend to make fusion available (#5308)

Currently, the vllm pr: https://github.com/vllm-project/vllm/pull/24252
is causing operator fusion to fail, which can be mitigated by patching
the backend. Once the problem is completely resolved, I will submit a
new pull request to remove the patch.

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/5fbfa8d9ef15948599631baeb91e8220b2ee9bcc
---------
Signed-off-by: wxsIcey <1790571317@qq.com>
											
										
										
											2025-12-26 09:18:16 +08:00
+								        ' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
 								        ' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of',
-												[Main][Ops] Make triton rope support index_selecting from cos_sin_cache (#5450)

### What this PR does / why we need it?

This PR extends original `rope_triton_forward` and
`split_qkv_rmsnorm_rope` to support `cos_sin_cache` && `positions` as
inputs. This fully aligns to vLLM RoPE api interface. Compared with
earlier implementation for RoPE, the benefits are:

1. avoiding pre-computation of `cos` and `sin` before model execution, which
helps to remove redundant code.
2. allowing the eagle3 draft model to have different RoPE parameters from the
main model (see #6612). This helps to recover acceptance rate and accuracy in
that case.

In addition, this kernel change only introduces very small performance
degradation. Those `index_select` or `chunk` operations are now changed
into simple memory access in triton kernel (For example,
https://github.com/vllm-project/vllm-ascend/pull/5450/changes#diff-a4c2d3071530df193b98f9bf38553874bc4d47571336711f116c26d019cfbb6aR77-R81).

**Highlights**

- **RoPE Cache Unification**: Replaced separate _sin and _cos global
tensors with a unified cos_sin_cache and explicit positions tensor for
Rotary Positional Embeddings (RoPE), streamlining data handling.
- **Triton Kernel Integration**: Updated Triton kernels
(split_qkv_rmsnorm_rope_kernel, _triton_rope) to directly consume the
cos_sin_cache and positions for more efficient and integrated RoPE
calculations.
- **Custom Operation Registration**: Registered `rope_forward_oot` as a
new custom operation, allowing its use in fused compilation passes and
providing a dedicated entry point for the new RoPE implementation.
- **Refactored RoPE Forward Pass**: Modified the rope_forward_oot
function to accept the new cos_sin_cache and positions arguments,
enabling a more flexible and integrated RoPE application within the
system.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/5326c89803566a131c928f7fdd2100b75c981a42

Additional test on Qwen3-235b accuracy:

| Aime2024 | GSM8K | Livecodebench |
| -------- | -------- | -------- |
| 83.33 | 96.26 | 70.23 |

---------

Signed-off-by: Angazenn <supperccell@163.com>
											
										
										
											2026-02-11 21:20:53 +08:00
+								        ' not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and'
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								    ],
 								)
 								CASE_DS_ACLGRAPH = LLMTestCase(
 								    model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
 								    quantization="ascend",
 								    prompts=PROMPTS_SHORT,
 								    golden_answers=[
-												[OP] Enable custom op aclnnMoeInitRoutingCustom (#5332)

### What this PR does / why we need it?
This PR enables custom op `aclnnMoeInitRoutingCustom` introduced in PR
#5251

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/bc0a5a0c089844b17cb93f3294348f411e523586

---------

Signed-off-by: QianChenxi <chenxi.qian.cq@outlook.com>
Signed-off-by: zzzzwwjj <1183291235@qq.com>
Co-authored-by: zzzzwwjj <1183291235@qq.com>
											
										
										
											2026-01-09 09:35:18 +08:00
+								        '\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2',
-												[BugFix][Fusion] Patch compile backend to make fusion available (#5308)

Currently, the vllm pr: https://github.com/vllm-project/vllm/pull/24252
is causing operator fusion to fail, which can be mitigated by patching
the backend. Once the problem is completely resolved, I will submit a
new pull request to remove the patch.

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/5fbfa8d9ef15948599631baeb91e8220b2ee9bcc
---------
Signed-off-by: wxsIcey <1790571317@qq.com>
											
										
										
											2025-12-26 09:18:16 +08:00
+								        ' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the',
 								        ' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art',
-												[OP] Enable custom op aclnnMoeInitRoutingCustom (#5332)

### What this PR does / why we need it?
This PR enables custom op `aclnnMoeInitRoutingCustom` introduced in PR
#5251

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/bc0a5a0c089844b17cb93f3294348f411e523586

---------

Signed-off-by: QianChenxi <chenxi.qian.cq@outlook.com>
Signed-off-by: zzzzwwjj <1183291235@qq.com>
Co-authored-by: zzzzwwjj <1183291235@qq.com>
											
										
										
											2026-01-09 09:35:18 +08:00
+								        ' here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of'
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								    ],
 								)
-												[BugFix][Fusion] Patch compile backend to make fusion available (#5308)

Currently, the vllm pr: https://github.com/vllm-project/vllm/pull/24252
is causing operator fusion to fail, which can be mitigated by patching
the backend. Once the problem is completely resolved, I will submit a
new pull request to remove the patch.

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/5fbfa8d9ef15948599631baeb91e8220b2ee9bcc
---------
Signed-off-by: wxsIcey <1790571317@qq.com>
											
										
										
											2025-12-26 09:18:16 +08:00
-												[ModelRunner][Fix] Pads query_start_loc to satisfy FIA/TND constraint (#6475)

### What this PR does / why we need it?
This PR reverts "[ModelRunner] Revert [Fix] Pads query_start_loc to
satisfy FIA/TND constraint #6459 (commit
5b0a6bcfe9eca595bbcd064363596553b6bbd1fe)" and fixes a check in
`model_runner_v1`.

**A key change is that we remove the strict assertion in the latest
commit: it turns out that MLA + PIECEWISE slices during computation,
which makes the assertion unnecessary and a source of false alarms.**

This handles both uniform and mixed batches (by inserting a dummy
request for mixed batches), consolidates ad-hoc padding into a single
helper, copies the updated buffer to the device, which prevents kernel
mismatches or failures and ensure correct shapes for FIA/TND execution
in full graph modes.

We currently place this helper in `execute_model`. My original design
was to include it in `_prepare_inputs`, but that doesn’t work because it
must run after padding. While I’d prefer to minimize the impact and
reuse as much of the base class as possible in the future, it doesn’t
seem achievable at the moment.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
Test cases added.

- vLLM version: v0.14.1
- vLLM main:
https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
											
										
										
											2026-02-04 21:11:08 +08:00
+								CASE_QWEN_FULL = LLMTestCase(
 								    model="Qwen/Qwen3-0.6B",
 								    prompts=PROMPTS_SHORT,
 								    golden_answers=[
-												[Main][Ops] Make triton rope support index_selecting from cos_sin_cache (#5450)

### What this PR does / why we need it?

This PR extends original `rope_triton_forward` and
`split_qkv_rmsnorm_rope` to support `cos_sin_cache` && `positions` as
inputs. This fully aligns to vLLM RoPE api interface. Compared with
earlier implementation for RoPE, the benefits are:

1. avoiding pre-computation of `cos` and `sin` before model execution, which
helps to remove redundant code.
2. allowing the eagle3 draft model to have different RoPE parameters from the
main model (see #6612). This helps to recover acceptance rate and accuracy in
that case.

In addition, this kernel change only introduces very small performance
degradation. Those `index_select` or `chunk` operations are now changed
into simple memory access in triton kernel (For example,
https://github.com/vllm-project/vllm-ascend/pull/5450/changes#diff-a4c2d3071530df193b98f9bf38553874bc4d47571336711f116c26d019cfbb6aR77-R81).

**Highlights**

- **RoPE Cache Unification**: Replaced separate _sin and _cos global
tensors with a unified cos_sin_cache and explicit positions tensor for
Rotary Positional Embeddings (RoPE), streamlining data handling.
- **Triton Kernel Integration**: Updated Triton kernels
(split_qkv_rmsnorm_rope_kernel, _triton_rope) to directly consume the
cos_sin_cache and positions for more efficient and integrated RoPE
calculations.
- **Custom Operation Registration**: Registered `rope_forward_oot` as a
new custom operation, allowing its use in fused compilation passes and
providing a dedicated entry point for the new RoPE implementation.
- **Refactored RoPE Forward Pass**: Modified the rope_forward_oot
function to accept the new cos_sin_cache and positions arguments,
enabling a more flexible and integrated RoPE application within the
system.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/5326c89803566a131c928f7fdd2100b75c981a42

Additional test on Qwen3-235b accuracy:

| Aime2024 | GSM8K | Livecodebench |
| -------- | -------- | -------- |
| 83.33 | 96.26 | 70.23 |

---------

Signed-off-by: Angazenn <supperccell@163.com>
											
										
										
											2026-02-11 21:20:53 +08:00
+								        " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
-												[ModelRunner][Fix] Pads query_start_loc to satisfy FIA/TND constraint (#6475)

### What this PR does / why we need it?
This PR reverts "[ModelRunner] Revert [Fix] Pads query_start_loc to
satisfy FIA/TND constraint #6459 (commit
5b0a6bcfe9eca595bbcd064363596553b6bbd1fe)" and fixes a check in
`model_runner_v1`.

**A key change is that we remove the strict assertion in the latest
commit, as it turns out MLA + PIECEWISE will slice during computing,
leaving our assertion uncalled for and will only cause false alarm.**

This handles both uniform and mixed batches (by inserting a dummy
request for mixed batches), consolidates ad-hoc padding into a single
helper, and copies the updated buffer to the device, which prevents kernel
mismatches or failures and ensures correct shapes for FIA/TND execution
in full graph modes.

We currently place this helper in `execute_model`. My original design
was to include it in `_prepare_inputs`, but that doesn’t work because it
must run after padding. While I’d prefer to minimize the impact and
reuse as much of the base class as possible in the future, it doesn’t
seem achievable at the moment.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
Test cases added.

- vLLM version: v0.14.1
- vLLM main:
https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
											
										
										
											2026-02-04 21:11:08 +08:00
+								        ' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
 								        ' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of',
-												[Main][Ops] Make triton rope support index_selecting from cos_sin_cache (#5450)

### What this PR does / why we need it?

This PR extends original `rope_triton_forward` and
`split_qkv_rmsnorm_rope` to support `cos_sin_cache` && `positions` as
inputs. This fully aligns to vLLM RoPE api interface. Compared with
earlier implementation for RoPE, the benefits are:

1. avoiding pre-computation of `cos` and `sin` before model execution,
which helps remove redundant code.
2. allowing the eagle3 draft model to have different RoPE parameters from
the main model (see #6612). This helps recover acceptance rate and accuracy
in that case.

In addition, this kernel change only introduces very small performance
degradation. Those `index_select` or `chunk` operations are now changed
into simple memory access in triton kernel (For example,
https://github.com/vllm-project/vllm-ascend/pull/5450/changes#diff-a4c2d3071530df193b98f9bf38553874bc4d47571336711f116c26d019cfbb6aR77-R81).

**Highlights**

- **RoPE Cache Unification**: Replaced separate _sin and _cos global
tensors with a unified cos_sin_cache and explicit positions tensor for
Rotary Positional Embeddings (RoPE), streamlining data handling.
- **Triton Kernel Integration**: Updated Triton kernels
(split_qkv_rmsnorm_rope_kernel, _triton_rope) to directly consume the
cos_sin_cache and positions for more efficient and integrated RoPE
calculations.
- **Custom Operation Registration**: Registered `rope_forward_oot` as a
new custom operation, allowing its use in fused compilation passes and
providing a dedicated entry point for the new RoPE implementation.
- **Refactored RoPE Forward Pass**: Modified the rope_forward_oot
function to accept the new cos_sin_cache and positions arguments,
enabling a more flexible and integrated RoPE application within the
system.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/5326c89803566a131c928f7fdd2100b75c981a42

Additional test on Qwen3-235b accuracy:

| Aime2024 | GSM8K | Livecodebench |
| -------- | -------- | -------- |
| 83.33 | 96.26 | 70.23 |

---------

Signed-off-by: Angazenn <supperccell@163.com>
											
										
										
											2026-02-11 21:20:53 +08:00
+								        ' not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and'
-												[ModelRunner][Fix] Pads query_start_loc to satisfy FIA/TND constraint (#6475)

### What this PR does / why we need it?
This PR reverts "[ModelRunner] Revert [Fix] Pads query_start_loc to
satisfy FIA/TND constraint #6459 (commit
5b0a6bcfe9eca595bbcd064363596553b6bbd1fe)" and fixes a check in
`model_runner_v1`.

**A key change is that we remove the strict assertion in the latest
commit, as it turns out MLA + PIECEWISE will slice during computing,
leaving our assertion uncalled for and will only cause false alarm.**

This handles both uniform and mixed batches (by inserting a dummy
request for mixed batches), consolidates ad-hoc padding into a single
helper, and copies the updated buffer to the device, which prevents kernel
mismatches or failures and ensures correct shapes for FIA/TND execution
in full graph modes.

We currently place this helper in `execute_model`. My original design
was to include it in `_prepare_inputs`, but that doesn’t work because it
must run after padding. While I’d prefer to minimize the impact and
reuse as much of the base class as possible in the future, it doesn’t
seem achievable at the moment.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
Test cases added.

- vLLM version: v0.14.1
- vLLM main:
https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
											
										
										
											2026-02-04 21:11:08 +08:00
+								    ],
 								)
 								# Golden-answer case for the W8A8-quantized DeepSeek-V2-Lite model on the
 								# short prompt set. Each entry in golden_answers is the expected output text
 								# and must stay byte-exact (including the leading whitespace/newlines and
 								# the typographic apostrophes).
 								CASE_DS_FULL = LLMTestCase(
 								    model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
 								    quantization="ascend",
 								    prompts=PROMPTS_SHORT,
 								    golden_answers=[
 								        '\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2',
 								        ' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the',
 								        ' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art',
 								        ' here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of'
 								    ],
 								)
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								CASE_QWEN_FULL_DECODE_ONLY = LLMTestCase(
 								    model="Qwen/Qwen3-0.6B",
 								    prompts=PROMPTS_LONG,
 								    golden_answers=[
-												[Fix] fix aclgraph e2e test. (#4131)

### What this PR does / why we need it?
Due to the inconsistency between the attention operators used in eager
mode and graph mode, the accumulation order of the operator cannot be
guaranteed to be deterministic. Therefore, we modify the test to compare
with given outputs.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/2918c1b49c88c29783c86f78d2c4221cb9622379

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
											
										
										
											2025-11-24 17:22:03 +08:00
+								        ' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
-												[Main][Ops] Make triton rope support index_selecting from cos_sin_cache (#5450)

### What this PR does / why we need it?

This PR extends original `rope_triton_forward` and
`split_qkv_rmsnorm_rope` to support `cos_sin_cache` && `positions` as
inputs. This fully aligns to vLLM RoPE api interface. Compared with
earlier implementation for RoPE, the benefits are:

1. avoiding pre-computation of `cos` and `sin` before model execution,
which helps remove redundant code.
2. allowing the eagle3 draft model to have different RoPE parameters from
the main model (see #6612). This helps recover acceptance rate and accuracy
in that case.

In addition, this kernel change only introduces very small performance
degradation. Those `index_select` or `chunk` operations are now changed
into simple memory access in triton kernel (For example,
https://github.com/vllm-project/vllm-ascend/pull/5450/changes#diff-a4c2d3071530df193b98f9bf38553874bc4d47571336711f116c26d019cfbb6aR77-R81).

**Highlights**

- **RoPE Cache Unification**: Replaced separate _sin and _cos global
tensors with a unified cos_sin_cache and explicit positions tensor for
Rotary Positional Embeddings (RoPE), streamlining data handling.
- **Triton Kernel Integration**: Updated Triton kernels
(split_qkv_rmsnorm_rope_kernel, _triton_rope) to directly consume the
cos_sin_cache and positions for more efficient and integrated RoPE
calculations.
- **Custom Operation Registration**: Registered `rope_forward_oot` as a
new custom operation, allowing its use in fused compilation passes and
providing a dedicated entry point for the new RoPE implementation.
- **Refactored RoPE Forward Pass**: Modified the rope_forward_oot
function to accept the new cos_sin_cache and positions arguments,
enabling a more flexible and integrated RoPE application within the
system.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/5326c89803566a131c928f7fdd2100b75c981a42

Additional test on Qwen3-235b accuracy:

| Aime2024 | GSM8K | Livecodebench |
| -------- | -------- | -------- |
| 83.33 | 96.26 | 70.23 |

---------

Signed-off-by: Angazenn <supperccell@163.com>
											
										
										
											2026-02-11 21:20:53 +08:00
+								        " \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle with vertices on a square can be calculated by integrating over",
 								        ' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can'
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								    ])
-												[Fix] fix aclgraph e2e test. (#4131)

### What this PR does / why we need it?
Due to the inconsistency between the attention operators used in eager
mode and graph mode, the accumulation order of the operator cannot be
guaranteed to be deterministic. Therefore, we modify the test to compare
with given outputs.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/2918c1b49c88c29783c86f78d2c4221cb9622379

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
											
										
										
											2025-11-24 17:22:03 +08:00
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								CASE_DS_FULL_DECODE_ONLY = LLMTestCase(
 								    model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
 								    quantization="ascend",
 								    prompts=PROMPTS_LONG,
 								    golden_answers=[
-												[feature] add_rms_norm support bias (#5790)

### What this PR does / why we need it?
This PR replaces addRmsNorm and Add with addRmsNormBias, which leads to
a more efficient result.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Full Test Pass

- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d

Signed-off-by: Chen_HaoWen <chenhaowen12@huawei.com>
Co-authored-by: Chen_HaoWen <chenhaowen12@huawei.com>
											
										
										
											2026-01-23 21:09:54 +08:00
+								        "\n\nSelect an assignment template",
 								        "\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
 								        "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations"
-												[Fix] fix aclgraph e2e test. (#4131)

### What this PR does / why we need it?
Due to the inconsistency between the attention operators used in eager
mode and graph mode, the accumulation order of the operator cannot be
guaranteed to be deterministic. Therefore, we modify the test to compare
with given outputs.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/2918c1b49c88c29783c86f78d2c4221cb9622379

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
											
										
										
											2025-11-24 17:22:03 +08:00
+								    ])
-												[Feat][Graph]Support FULL_DECEDE_ONLY mode for MLA models (#3125)

### What this PR does / why we need it?
Adds support for capturing the Multi-head Latent Attention (MLA) decode
operation into an ACL graph. This improves performance by compiling the
attention kernel for single-token decoding.

Key changes include:
- Implementing the graph capture logic for the MLA kernel, including
workspace management and parameter updates.
- Modifying the rotary embedding (RoPE) handling to use pre-allocated
tensors, which is a requirement for graph capture.
- Adding a `build_for_graph_capture` method to the MLA metadata builder
to create dummy metadata during the graph compilation phase.

Known issues:
- Currently, MTP is not supported in FULL_DECODE_ONLY mode -- we're
working on a fix
- We are preparing to remove update_mla_attn_params with
auto_dispatch_capture

### Does this PR introduce _any_ user-facing change?
compilation_config={
    "cudagraph_mode": "FULL_DECODE_ONLY",
},
### How was this patch tested?


- vLLM version: v0.11.0

---------

Signed-off-by: panchao-hub <315134829@qq.com>
Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
											
										
										
											2025-10-10 16:31:20 +08:00
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								CASE_QWEN_EX = LLMTestCase(
 								    model="Qwen/Qwen3-0.6B",
 								    prompts=PROMPTS_LONG,
 								    golden_answers=[
-												enable npugraph_ex (#5120)

### What this PR does / why we need it?
We will expose the enabling switch for npugraph_ex to better facilitate
subsequent optimization.

### Does this PR introduce _any_ user-facing change?
Previously, the enable_npugraph_ex switch would trigger an error; now we
have removed the error reporting mechanism to better facilitate
subsequent optimization efforts.
Basic functionalities are available in CANN and torch_npu for Q3, while
advanced optimizations will depend on the Q4 release.

### How was this patch tested?
llm = LLM(
    model=model,
    enforce_eager=False,
    additional_config={
        "enable_npugraph_ex": True
    },
    compilation_config={
        "cudagraph_mode": "FULL_DECODE_ONLY",
        "cudagraph_capture_sizes": [16],
    },
)


- vLLM version: v0.12.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

---------

Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
Co-authored-by: weijinqian0 <1184188277@qq.com>
											
										
										
											2025-12-18 09:08:40 +08:00
+								        ' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
-												[Main][Ops] Make triton rope support index_selecting from cos_sin_cache (#5450)

### What this PR does / why we need it?

This PR extends original `rope_triton_forward` and
`split_qkv_rmsnorm_rope` to support `cos_sin_cache` && `positions` as
inputs. This fully aligns to vLLM RoPE api interface. Compared with
earlier implementation for RoPE, the benefits are:

1. avoiding pre-computation of `cos` and `sin` before model execution,
which helps remove redundant code.
2. allowing the eagle3 draft model to have different RoPE parameters from
the main model (see #6612). This helps recover acceptance rate and accuracy
in that case.

In addition, this kernel change only introduces very small performance
degradation. Those `index_select` or `chunk` operations are now changed
into simple memory access in triton kernel (For example,
https://github.com/vllm-project/vllm-ascend/pull/5450/changes#diff-a4c2d3071530df193b98f9bf38553874bc4d47571336711f116c26d019cfbb6aR77-R81).

**Highlights**

- **RoPE Cache Unification**: Replaced separate _sin and _cos global
tensors with a unified cos_sin_cache and explicit positions tensor for
Rotary Positional Embeddings (RoPE), streamlining data handling.
- **Triton Kernel Integration**: Updated Triton kernels
(split_qkv_rmsnorm_rope_kernel, _triton_rope) to directly consume the
cos_sin_cache and positions for more efficient and integrated RoPE
calculations.
- **Custom Operation Registration**: Registered `rope_forward_oot` as a
new custom operation, allowing its use in fused compilation passes and
providing a dedicated entry point for the new RoPE implementation.
- **Refactored RoPE Forward Pass**: Modified the rope_forward_oot
function to accept the new cos_sin_cache and positions arguments,
enabling a more flexible and integrated RoPE application within the
system.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/5326c89803566a131c928f7fdd2100b75c981a42

Additional test on Qwen3-235b accuracy:

| Aime2024 | GSM8K | Livecodebench |
| -------- | -------- | -------- |
| 83.33 | 96.26 | 70.23 |

---------

Signed-off-by: Angazenn <supperccell@163.com>
											
										
										
											2026-02-11 21:20:53 +08:00
+								        " \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle with vertices on a square can be calculated by integrating over",
 								        ' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can'
-												enable npugraph_ex (#5120)

### What this PR does / why we need it?
We will expose the enabling switch for npugraph_ex to better facilitate
subsequent optimization.

### Does this PR introduce _any_ user-facing change?
Previously, the enable_npugraph_ex switch would trigger an error; now we
have removed the error reporting mechanism to better facilitate
subsequent optimization efforts.
Basic functionalities are available in CANN and torch_npu for Q3, while
advanced optimizations will depend on the Q4 release.

### How was this patch tested?
llm = LLM(
    model=model,
    enforce_eager=False,
    additional_config={
        "enable_npugraph_ex": True
    },
    compilation_config={
        "cudagraph_mode": "FULL_DECODE_ONLY",
        "cudagraph_capture_sizes": [16],
    },
)


- vLLM version: v0.12.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

---------

Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
Co-authored-by: weijinqian0 <1184188277@qq.com>
											
										
										
											2025-12-18 09:08:40 +08:00
+								    ])
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								CASE_DS_EX = LLMTestCase(model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
 								                         quantization="ascend",
 								                         prompts=PROMPTS_LONG,
 								                         golden_answers=[
-												[feature] add_rms_norm support bias (#5790)

### What this PR does / why we need it?
This PR replaces addRmsNorm and Add with addRmsNormBias, which leads to
a more efficient result.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Full Test Pass

- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d

Signed-off-by: Chen_HaoWen <chenhaowen12@huawei.com>
Co-authored-by: Chen_HaoWen <chenhaowen12@huawei.com>
											
										
										
											2026-01-23 21:09:54 +08:00
+								                             "\n\nSelect an assignment template",
 								                             "\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
 								                             "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations"
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								                         ])
 								@pytest.mark.parametrize("cur_case", [CASE_QWEN_ACLGRAPH, CASE_DS_ACLGRAPH])
 								def test_piecewise_res_consistency(cur_case: LLMTestCase):
 								    # Runs once per parametrized case (Qwen and DeepSeek ACL-graph cases).
 								    # No cudagraph_mode is set here, so the runner's default mode applies;
 								    # presumably that is the piecewise mode the test name refers to —
 								    # TODO confirm against the runner defaults.
 								    runner_kwargs = {
 								        "model_name": cur_case.model,
 								        "max_model_len": 1024,
 								        # NOTE(review): capture sizes are passed at the top level here, while
 								        # the other tests in this file nest them under "compilation_config".
 								        "cudagraph_capture_sizes": [1, 2, 4, 8],
 								        "quantization": cur_case.quantization,
 								    }
 								    # gen_and_valid receives the prompts, sampling params and golden answers
 								    # of the case; presumably it generates and compares against the goldens.
 								    gen_and_valid(runner_kwargs=runner_kwargs,
 								                  prompts=cur_case.prompts,
 								                  sampling_params=cur_case.sampling_params,
 								                  golden_answers=cur_case.golden_answers)
-												[ModelRunner][Fix] Pads query_start_loc to satisfy FIA/TND constraint (#6475)

### What this PR does / why we need it?
This PR reverts "[ModelRunner] Revert [Fix] Pads query_start_loc to
satisfy FIA/TND constraint #6459 (commit
5b0a6bcfe9eca595bbcd064363596553b6bbd1fe)" and fixes a check in
`model_runner_v1`.

**A key change is that we remove the strict assertion in the latest
commit, as it turns out MLA + PIECEWISE will slice during computing,
leaving our assertion uncalled for and will only cause false alarm.**

This handles both uniform and mixed batches (by inserting a dummy
request for mixed batches), consolidates ad-hoc padding into a single
helper, and copies the updated buffer to the device, which prevents kernel
mismatches or failures and ensures correct shapes for FIA/TND execution
in full graph modes.

We currently place this helper in `execute_model`. My original design
was to include it in `_prepare_inputs`, but that doesn’t work because it
must run after padding. While I’d prefer to minimize the impact and
reuse as much of the base class as possible in the future, it doesn’t
seem achievable at the moment.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
Test cases added.

- vLLM version: v0.14.1
- vLLM main:
https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
											
										
										
											2026-02-04 21:11:08 +08:00
+								@pytest.mark.parametrize(
 								    "cur_case", [CASE_QWEN_FULL, CASE_DS_FULL])
 								def test_full_res_consistency(cur_case: LLMTestCase, monkeypatch):
 								    # Runs once per parametrized case (CASE_QWEN_FULL, CASE_DS_FULL).
 								    # Drop HCCL_OP_EXPANSION_MODE from the environment for this test only;
 								    # raising=False makes this a no-op when the variable is not set.
 								    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
 								    runner_kwargs = {
 								        "model_name": cur_case.model,
 								        "max_model_len": 1024,
 								        "compilation_config": {
 								            "cudagraph_capture_sizes": [4, 8, 32, 64],
 								            # NOTE(review): the test name says "full", but the mode set here
 								            # is "FULL_DECODE_ONLY", the same value used by
 								            # test_full_decode_only_res_consistency — confirm whether "FULL"
 								            # was intended before relying on this test for full-graph coverage.
 								            "cudagraph_mode": "FULL_DECODE_ONLY"
 								        },
 								        "quantization": cur_case.quantization,
 								    }
 								    # gen_and_valid receives the prompts, sampling params and golden answers
 								    # of the case; presumably it generates and compares against the goldens.
 								    gen_and_valid(runner_kwargs=runner_kwargs,
 								                  prompts=cur_case.prompts,
 								                  sampling_params=cur_case.sampling_params,
 								                  golden_answers=cur_case.golden_answers)
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
 								@pytest.mark.parametrize(
 								    "cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY])
 								def test_full_decode_only_res_consistency(cur_case: LLMTestCase, monkeypatch):
 								    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
 								    runner_kwargs = {
 								        "model_name": cur_case.model,
 								        "max_model_len": 1024,
 								        "compilation_config": {
 								            "cudagraph_capture_sizes": [4, 8, 32, 64],
 								            "cudagraph_mode": "FULL_DECODE_ONLY"
 								        },
 								        "quantization": cur_case.quantization,
-												[npugraph_ex]enable npugraph_ex by default (#6664)

### What this PR does / why we need it?

This pull request enables the `npugraph_ex` backend by default to
improve performance on Ascend NPUs, as proposed in the
[RFC](https://github.com/vllm-project/vllm-ascend/issues/6214).


### Does this PR introduce _any_ user-facing change?

Yes. `npugraph_ex` is now enabled by default. Users can disable it by
setting `enable: false` in the `npugraph_ex_config` section of the
`additional_config`.

### How was this patch tested?

CI passed. The changes are covered by existing and new E2E tests
(`test_aclgraph_accuracy.py`) and unit tests (`test_ascend_config.py`)
that have been updated to reflect the new default behavior. The tests
verify correctness and consistency with `npugraph_ex` enabled and
disabled, as well as with the new static kernel option.

Signed-off-by: huyuanquan1 <huyuanquan1@huawei.com>
Co-authored-by: huyuanquan1 <huyuanquan1@huawei.com>
											
										
										
											2026-02-12 08:44:06 +08:00
+								        "additional_config": {
 								            "npugraph_ex_config": {
 								                "enable": False
 								            }
 								        },
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								    }
 								    gen_and_valid(runner_kwargs=runner_kwargs,
 								                  prompts=cur_case.prompts,
 								                  sampling_params=cur_case.sampling_params,
 								                  golden_answers=cur_case.golden_answers)
-												[CI] Re-open skipped cases due to PTA upgrading and update the golden results (#6144)

### What this PR does / why we need it?
Re-open `tests/e2e/singlecard/test_aclgraph_accuracy.py` and update its
golden results to match PTA 2.9.0

- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/d68209402ddab3f54a09bc1f4de9a9495a283b60

Signed-off-by: wjunLu <wjunlu217@gmail.com>
											
										
										
											2026-01-23 10:46:31 +08:00
+								@pytest.mark.parametrize("cur_case", [CASE_QWEN_EX, CASE_DS_EX])
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch):
 								    """Run a case with npugraph_ex explicitly enabled and validate its output.

 								    Builds runner kwargs with FULL_DECODE_ONLY graph mode, capture sizes
 								    [4, 8, 32, 64] and ``npugraph_ex_config.enable`` set to True, then passes
 								    the case's prompts, sampling params and golden answers to
 								    ``gen_and_valid``, which performs the actual generation and validation.

 								    Args:
 								        cur_case: Parametrized case supplying the model, quantization,
 								            prompts, sampling params and golden answers.
 								        monkeypatch: pytest fixture used to drop HCCL_OP_EXPANSION_MODE.
 								    """
 								    # raising=False: do not fail when the variable is not set at all.
 								    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
 								    runner_kwargs = {
 								        "model_name": cur_case.model,
 								        "quantization": cur_case.quantization,
 								        "max_model_len": 1024,
 								        "compilation_config": {
 								            "cudagraph_capture_sizes": [4, 8, 32, 64],
 								            "cudagraph_mode": "FULL_DECODE_ONLY"
 								        },
 								        "additional_config": {
-												[Feature]refactor the npugraph_ex config, support online-infer with static kernel (#5775)

### What this PR does / why we need it?
This is a part of
https://github.com/vllm-project/vllm-ascend/issues/4715#issue-3694310762
1. refactor the npugraph_ex config，modified the default configuration of
the static kernel, new default value of static kernel is false
2. support online-infer with static kernel
3. fixed the issue where manually modifying FX graphs caused an abnormal
model return type, and removed the related redundant code.

### Does this PR introduce _any_ user-facing change?
yes，the new config of npugraph_ex is as follow:
```
additional_config={
            "npugraph_ex_config": {
                "enable": True,
                "enable_static_kernel": False
            }
        }
```
### How was this patch tested?
```
vllm serve /data/DeepSeek-V3.1-Terminus-w4a8 \
    --host 0.0.0.0 \
    --port 8004 \
    --data-parallel-size 4 \
    --tensor-parallel-size 4 \
    --quantization ascend \
    --seed 1024 \
    --served-model-name deepseek_v3 \
    --enable-expert-parallel \
    --max-num-seqs 48 \
    --max-model-len 40000 \
    --async-scheduling \
    --max-num-batched-tokens 9000 \
    --trust-remote-code \
    --no-enable-prefix-caching \
    --speculative-config '{"num_speculative_tokens": 3, "method":"deepseek_mtp","disable_padded_drafter_batch": false}' \
    --gpu-memory-utilization 0.9 \
    --compilation-config '{"cudagraph_capture_sizes":[4,32,64,112,160,176,192], "cudagraph_mode": "FULL_DECODE_ONLY"}' \
    --additional-config \
    '{"enable_shared_expert_dp": true,"multistream_overlap_shared_expert": true,"npugraph_ex_config":{"enable":true}}'
```

- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d

---------

Signed-off-by: chencangtao <chencangtao@huawei.com>
Signed-off-by: ChenCangtao <50493711+ChenCangtao@users.noreply.github.com>
Co-authored-by: chencangtao <chencangtao@huawei.com>
											
										
										
											2026-01-20 21:31:38 +08:00
+								            "npugraph_ex_config": {
 								                "enable": True
 								            }
-												[1/N][CI] Refactor accuracy test (#5400)

### What this PR does / why we need it?
1. Accuracy testing no longer compares eager and graph modes; instead,
it directly extracts the golden result under the graph mode
configuration (the implicit purpose of this case is to verify whether
modifications affect existing results)
2. Next step: finer-grained supervision of logits/sampler results
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

Signed-off-by: wangli <wangli858794774@gmail.com>
											
										
										
											2026-01-07 20:58:15 +08:00
+								        },
 								    }
 								    gen_and_valid(runner_kwargs=runner_kwargs,
 								                  prompts=cur_case.prompts,
 								                  sampling_params=cur_case.sampling_params,
 								                  golden_answers=cur_case.golden_answers)
-												[e2e Test][npugraph_ex]add static kernel e2e test case (#6320)

### What this PR does / why we need it?
Added an E2E test case for the scenario of enabling a static kernel for
npugraph_ex, monitoring its compilation and unloading process.
Also fixed the previously existing spelling errors.

- vLLM version: v0.14.1
- vLLM main:
https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd

---------

Signed-off-by: chencangtao <chencangtao@huawei.com>
Co-authored-by: chencangtao <chencangtao@huawei.com>
											
										
										
											2026-01-30 16:24:48 +08:00
 								# Accuracy is already covered by the previous test case. This case only
 								# verifies that generation still works with the static kernel enabled and
 								# that the static kernel is uninstalled afterwards as expected.
 								@pytest.mark.parametrize("cur_case", [CASE_QWEN_EX])
 								def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch):
 								    """Run npugraph_ex with static kernels enabled and check their cleanup.

 								    Args:
 								        cur_case: Parametrized case providing the model, quantization,
 								            prompts, sampling params and golden answers.
 								        monkeypatch: pytest fixture used to drop HCCL_OP_EXPANSION_MODE.

 								    Raises:
 								        KeyError: If ASCEND_HOME_PATH is not present in the environment.
 								    """
 								    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
 								    graph_settings = {
 								        "cudagraph_capture_sizes": [4, 8],
 								        "cudagraph_mode": "FULL_DECODE_ONLY",
 								    }
 								    npugraph_ex_settings = {"enable": True, "enable_static_kernel": True}
 								    runner_kwargs = dict(
 								        model_name=cur_case.model,
 								        quantization=cur_case.quantization,
 								        max_model_len=1024,
 								        compilation_config=graph_settings,
 								        additional_config={"npugraph_ex_config": npugraph_ex_settings},
 								    )
 								    gen_and_valid(
 								        runner_kwargs=runner_kwargs,
 								        prompts=cur_case.prompts,
 								        sampling_params=cur_case.sampling_params,
 								        golden_answers=cur_case.golden_answers,
 								    )
 								    # The static kernel install directory must be gone once the run is over.
 								    kernel_dir = os.path.join(os.environ["ASCEND_HOME_PATH"],
 								                              'opp/static_kernel/ai_core')
 								    assert not os.path.exists(kernel_dir)