xc-llm-ascend/vllm_ascend/compilation/npu_graph_ex_pass_manager.py

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from torch import fx as fx
from vllm.compilation.inductor_pass import get_pass_context
from vllm.compilation.vllm_inductor_pass import VllmInductorPass
from vllm.config import VllmConfig


class NpuGraphEXPassManager:
    """
    A pass manager for npu_graph ex fusion passes.
    It handles the configuration and execution of passes.
    The counterpart in vllm is PostGradPassManager. Since torch_npu
    does not support triton for now, we define our own pass manager.

    Passes are registered with ``add`` and are executed, in registration
    order, when the manager instance is called on a graph.
    """

    def __init__(self):
        # Registered passes; ``__call__`` iterates them in insertion order.
        self.passes: list[VllmInductorPass] = []

    def __call__(self, graph: fx.Graph) -> fx.Graph:
        """Run every applicable registered pass on ``graph``.

        The compile range is read from the current pass context and each
        pass is invoked only if ``is_applicable_for_range`` accepts it.
        The return value of each pass is ignored; the object that was
        passed in is the object that is returned.

        Args:
            graph: The graph to transform.

        Returns:
            The same ``graph`` object, after the passes have run and the
            generated code has been refreshed.
        """
        compile_range = get_pass_context().compile_range

        for pass_ in self.passes:
            if pass_.is_applicable_for_range(compile_range):
                pass_(graph)
        # Regenerate the Python code from the (possibly rewritten) graph.
        # The method is ``recompile``; ``recompiler`` does not exist in
        # torch.fx and raised AttributeError after the passes had run.
        # NOTE(review): ``recompile`` is a ``GraphModule`` method, so callers
        # presumably pass a GraphModule despite the ``fx.Graph`` annotation;
        # confirm against the call site.
        graph.recompile()
        return graph

    def add(self, pass_: VllmInductorPass):
        """Append ``pass_`` to the list of passes to execute.

        Args:
            pass_: The pass to register; must be a ``VllmInductorPass``.
        """
        assert isinstance(pass_, VllmInductorPass)
        self.passes.append(pass_)

    def configure(self, config: VllmConfig):
        """Load the Ascend compilation settings from ``config``.

        Stores the ``"ascend_compilation_config"`` entry of
        ``config.additional_config`` on the manager, falling back to an
        empty dict when the key is absent.

        Args:
            config: The vLLM configuration to read from.
        """
        # By default, we enable the graph fusion and quantization fusion pass.
        # NOTE(review): no pass registration is visible in this section; the
        # remainder of this method may be missing from this view.
        self.ascend_compilation_config: dict = config.additional_config.get("ascend_compilation_config", {})
[Graph][Fusion] Add QKVNormRope and QKVNormRopeWithBias (#5721) ### What this PR does / why we need it? This PR builds upon PR https://github.com/vllm-project/vllm-ascend/pull/5011 and aims to further enhance the npu_graph_ex_passes module. Based on prior work, we have added graph optimization support for the add_rms_quant fused operator in scenarios where a bias term is present—ensuring the fusion pattern is correctly registered and matched into the computation graph. For validation, we switched to the Qwen3-235B-A22B-W8A8 model for QKVNormRopeWithBias and Qwen3-32B model for QKVNormRope . Benchmark results show that, compared to the unfused baseline, enabling this fusion pass significantly improves inference throughput for W8A8 quantized models. For more details can refer to the RFC:https://github.com/vllm-project/vllm-ascend/issues/4715 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? ``` llm = LLM( model=model, tensor_parallel_size=GPUs_per_dp_rank, enforce_eager=False, enable_expert_parallel=enable_expert_parallel, trust_remote_code=trust_remote_code, gpu_memory_utilization=0.98, max_num_batched_tokens=512, # load_format="dummy", max_model_len=2048, max_num_seqs=16, quantization="ascend", additional_config={ "refresh": True, "enable_npugraph_ex": True }, compilation_config={ "cudagraph_capture_sizes": [8, 16], "cudagraph_mode": "FULL_DECODE_ONLY", }, ) if profile_dir: llm.start_profile() outputs = llm.generate(prompts, sampling_params) if profile_dir: llm.stop_profile() for i, output in enumerate(outputs): if i >= 5: break prompt = output.prompt generated_text = output.outputs[0].text print( f"DP rank {global_dp_rank}, Prompt: {prompt!r}, " f"Generated text: {generated_text!r}" ) ``` - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: cjian <2318164299@qq.com> 2026-01-22 17:22:41 +08:00			`#`
			`# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.`
			`# This file is a part of the vllm-ascend project.`
			`#`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

			`from torch import fx as fx`
			`from vllm.compilation.inductor_pass import get_pass_context`
			`from vllm.compilation.vllm_inductor_pass import VllmInductorPass`
			`from vllm.config import VllmConfig`


			`class NpuGraphEXPassManager:`
			`"""`
			`A pass manager for npu_graph ex fusion passes.`
			`It handles the configuration and execution of passes.`
			`The counterpart in vllm is PostGradPassManager. Since torch_npu`
			`does not support triton for now, we define our own pass manager.`
			`"""`

			`def __init__(self):`
			`self.passes: list[VllmInductorPass] = []`

			`def __call__(self, graph: fx.Graph) -> fx.Graph:`
			`compile_range = get_pass_context().compile_range`

			`for pass_ in self.passes:`
			`if pass_.is_applicable_for_range(compile_range):`
			`pass_(graph)`
			`graph.recompiler()`
			`return graph`

			`def add(self, pass_: VllmInductorPass):`
			`assert isinstance(pass_, VllmInductorPass)`
			`self.passes.append(pass_)`

			`def configure(self, config: VllmConfig):`
			`# By default, we enable the graph fusion and quantization fusion pass.`
			`self.ascend_compilation_config: dict = config.additional_config.get("ascend_compilation_config", {})`