xc-llm-ascend/tests/ut/compilation/test_npugraph_ex_utils_check.py

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from vllm_ascend.compilation.passes.utils.npugraph_ex_utils_check import \
    extra_stream_scope_check


def test_extra_stream_scope_check_logic():
    """
    Test the extra_stream_scope_check logic used by both fusion patterns.

    The check is exercised as a pure function through lightweight mock
    Match/Node objects, so no graph construction is required.
    """
    class MockNode:
        # Minimal stand-in for a graph node carrying a stream_label in meta.
        def __init__(self, stream_label=None):
            self.op = "call_function"
            self.meta = {"stream_label": stream_label}

    class MockMatch:
        # Minimal stand-in for a pattern-matcher match holding its nodes.
        def __init__(self, nodes):
            self.nodes = nodes

    # Test 1: all default → OK
    assert extra_stream_scope_check(
        MockMatch([MockNode(None), MockNode(None)])) is True

    # Test 2: same non-default → OK
    assert extra_stream_scope_check(
        MockMatch([MockNode("s1"), MockNode("s1")])) is True

    # Test 3: mixed non-default → FAIL
    assert extra_stream_scope_check(
        MockMatch([MockNode("s1"), MockNode("s2")])) is False

    # Test 4: default + non-default → FAIL
    assert extra_stream_scope_check(
        MockMatch([MockNode(None), MockNode("s1")])) is False

    # Test 5: empty → OK
    assert extra_stream_scope_check(MockMatch([])) is True
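

# For readers of this test, the following is a hedged reference sketch of the
# behaviour the assertions above pin down: a match passes the scope check only
# when every call_function node in it carries the same stream_label (all
# default/None passes, as does an empty match). The name
# `_reference_stream_scope_check` is hypothetical and illustrative only; the
# real implementation lives in npugraph_ex_utils_check and may differ.
def _reference_stream_scope_check(match) -> bool:
    # Collect the stream labels of all call_function nodes in the match.
    labels = {
        node.meta.get("stream_label")
        for node in match.nodes
        if node.op == "call_function"
    }
    # At most one distinct label means the whole match sits in a single
    # stream scope, so the fusion pattern is safe to apply.
    return len(labels) <= 1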