xc-llm-ascend/vllm_ascend/compilation/passes/utils/npugraph_ex_utils_check.py

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from torch._inductor.pattern_matcher import Match
from vllm.logger import logger


def extra_stream_scope_check(match: Match) -> bool:
    """
    Check whether all matched ``call_function`` nodes are on the same stream.

    The stream of a node is read from ``node.meta["stream_label"]``; a node
    without that entry (or with ``None``) is treated as being on the default
    stream. Nodes whose ``op`` is not ``"call_function"`` are ignored.

    Args:
        match: The pattern match whose ``nodes`` are inspected.

    Returns:
        ``True`` if every inspected node carries the same stream label (or
        there are no such nodes), ``False`` as soon as two different streams
        (including the default one) are seen.
    """
    # ``None`` stands for the default stream, so one set is enough to detect
    # both "several labelled streams" and "default mixed with a labelled one".
    seen_streams = set()

    for node in match.nodes:
        if node.op != "call_function":
            continue

        seen_streams.add(node.meta.get("stream_label"))
        if len(seen_streams) > 1:
            # Lazy %-formatting: the message is only built when debug logging
            # is enabled. The default stream is listed explicitly so the
            # report always shows every stream involved.
            logger.debug(
                "Cross-stream operation detected in pattern match. "
                "Multiple streams found: %s. "
                "Fusion is not supported for cross-stream operations.",
                sorted("default" if label is None else str(label) for label in seen_streams),
            )
            return False

    return True


# Keys ("<pattern class name><eps>") of the fusion patterns that have already
# been handled by check_and_register_fusion_pass in this process.
_register_patterns = set()


def check_and_register_fusion_pass(pattern_class: type, **kwargs) -> None:
    """
    Instantiate and register a fusion pattern unless it was registered before.

    A pattern is identified by its class name combined with the ``eps`` keyword
    argument (``1e-6`` when absent). If that key is already recorded the call
    returns immediately without instantiating the pattern.

    Args:
        pattern_class: Class that is instantiated with ``**kwargs`` and whose
            instance provides a ``register()`` method.
        **kwargs: Forwarded unchanged to ``pattern_class``.

    Raises:
        RuntimeError: Re-raised from ``register()`` unless its message contains
            ``"Duplicate pattern"``; in that case the pattern is considered
            registered and only a warning is logged.
    """
    eps = kwargs.get("eps", 1e-6)
    pattern_key = str(pattern_class.__name__) + str(eps)
    if pattern_key in _register_patterns:
        return

    pattern = pattern_class(**kwargs)
    try:
        pattern.register()
    except RuntimeError as e:
        if "Duplicate pattern" not in str(e):
            # Unrelated failure: propagate it with its original traceback and
            # leave the key unrecorded.
            raise
        logger.warning("Pattern %s eps %s has been registered", pattern_class.__name__, eps)

    # Reached on success and on "Duplicate pattern": either way the pattern is
    # registered, so remember the key to skip later requests.
    _register_patterns.add(pattern_key)
[Graph][Fusion] Add QKVNormRope and QKVNormRopeWithBias (#5721) ### What this PR does / why we need it? This PR builds upon PR https://github.com/vllm-project/vllm-ascend/pull/5011 and aims to further enhance the npu_graph_ex_passes module. Based on prior work, we have added graph optimization support for the add_rms_quant fused operator in scenarios where a bias term is present—ensuring the fusion pattern is correctly registered and matched into the computation graph. For validation, we switched to the Qwen3-235B-A22B-W8A8 model for QKVNormRopeWithBias and Qwen3-32B model for QKVNormRope . Benchmark results show that, compared to the unfused baseline, enabling this fusion pass significantly improves inference throughput for W8A8 quantized models. For more details can refer to the RFC:https://github.com/vllm-project/vllm-ascend/issues/4715 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? ``` llm = LLM( model=model, tensor_parallel_size=GPUs_per_dp_rank, enforce_eager=False, enable_expert_parallel=enable_expert_parallel, trust_remote_code=trust_remote_code, gpu_memory_utilization=0.98, max_num_batched_tokens=512, # load_format="dummy", max_model_len=2048, max_num_seqs=16, quantization="ascend", additional_config={ "refresh": True, "enable_npugraph_ex": True }, compilation_config={ "cudagraph_capture_sizes": [8, 16], "cudagraph_mode": "FULL_DECODE_ONLY", }, ) if profile_dir: llm.start_profile() outputs = llm.generate(prompts, sampling_params) if profile_dir: llm.stop_profile() for i, output in enumerate(outputs): if i >= 5: break prompt = output.prompt generated_text = output.outputs[0].text print( f"DP rank {global_dp_rank}, Prompt: {prompt!r}, " f"Generated text: {generated_text!r}" ) ``` - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: cjian <2318164299@qq.com> 2026-01-22 17:22:41 +08:00			`#`
			`# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.`
			`# This file is a part of the vllm-ascend project.`
			`#`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

			`from torch._inductor.pattern_matcher import Match`
			`from vllm.logger import logger`


			`def extra_stream_scope_check(match: Match) -> bool:`
			`"""`
			`Checks if all nodes in the same stream.`
			`"""`
			`non_default_streams = set()`
			`has_default = False`

			`for node in match.nodes:`
			`if node.op == "call_function":`
			`current_stream = node.meta.get("stream_label")`
			`if current_stream is None:`
			`has_default = True`
			`else:`
			`non_default_streams.add(current_stream)`
			`if len(non_default_streams) > 1:`
			`logger.debug(`
			`f"Cross-stream operation detected in pattern match for AddRMSNormQuant. "`
			`f"Multiple streams found: {non_default_streams}. "`
			`f"Fusion is not supported for cross-stream operations."`
			`)`
			`return False`

			`if has_default and len(non_default_streams) > 0:`
			`logger.debug(`
			`f"Cross-stream operation detected in pattern match for AddRMSNormQuant. "`
			`f"Multiple streams found: {non_default_streams}. "`
			`f"Fusion is not supported for cross-stream operations."`
			`)`
			`return False`

			`return True`
[bugfix][npugraph_ex]duplicate pattern issue (#6513) ### What this PR does / why we need it? When the draft model also uses vllmbackend for graph compilation, the fusion pass registration occurs again, resulting in errors due to duplicate patterns. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.15.0 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0 --------- Signed-off-by: chencangtao <chencangtao@huawei.com> Co-authored-by: chencangtao <chencangtao@huawei.com> 2026-02-04 08:49:13 +08:00

			`_register_patterns = set()`


			`def check_and_register_fusion_pass(pattern_class: type, **kwargs):`
			`global _register_patterns`
			`eps = kwargs.get("eps", 1e-6)`
			`pattern_key = str(pattern_class.__name__) + str(eps)`
			`if pattern_key in _register_patterns:`
			`return`

			`pattern = pattern_class(**kwargs)`
			`try:`
			`pattern.register()`
			`_register_patterns.add(pattern_key)`
			`except RuntimeError as e:`
			`if "Duplicate pattern" in str(e):`
			`logger.warning(f"Pattern {pattern_class.__name__} eps {eps} has been registered")`
			`_register_patterns.add(pattern_key)`
			`else:`
			`raise e`