[Fusion] normalize fusion naming and enable e2e test (#4693)

### What this PR does / why we need it?

This PR standardizes the fusion naming by renaming `enable_quantization_fusion` to `fuse_norm_quant`, and enables end-to-end (e2e) testing for the fusion pass.

### Does this PR introduce _any_ user-facing change?

N/A

### How was this patch tested?

CI passed with the newly added and existing tests.

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: wxsIcey <1790571317@qq.com>
2025-12-11 17:53:43 +08:00
parent 07c7131104
commit 18221c0e1d
8 changed files with 136 additions and 113 deletions
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -190,19 +190,18 @@ class AscendCompilationConfig:
    deployed on Ascend platforms.
    """

-    def __init__(self, enable_quantization_fusion: bool = True, **kwargs):
+    def __init__(self, fuse_norm_quant: bool = True, **kwargs):
        """
        Initialize the configuration.
        
        Args:
-            enable_quantization_fusion (bool): Whether to enable quantization fusion optimization.
-                When set to True, the system will optimize quantization-related operations,
-                reducing the number of quantization/dequantization nodes.
+            fuse_norm_quant (bool): Whether to enable norm and quant fusion optimization.
+                When set to True, the system will optimize norm and quant operations.
                Default: True
                
            **kwargs: Additional optional parameters for forward compatibility and configuration extension.
        """
-        self.enable_quantization_fusion = enable_quantization_fusion
+        self.fuse_norm_quant = fuse_norm_quant
        # Add more compilation related configs here as needed


--- a/vllm_ascend/compilation/graph_fusion_pass_manager.py
+++ b/vllm_ascend/compilation/graph_fusion_pass_manager.py
@@ -46,8 +46,8 @@ class GraphFusionPassManager:
        # By default, we enable the graph fusion and quantization fusion pass.
        self.ascend_compilation_config: dict = config.additional_config.get(
            "ascend_compilation_config", {})
-        if self.ascend_compilation_config.get("enable_quantization_fusion",
-                                              True):
-            from .passes.quant_fusion_pass import AddRMSNormQuantFusionPass
+        if self.ascend_compilation_config.get("fuse_norm_quant", True):
+            from .passes.norm_quant_fusion_pass import \
+                AddRMSNormQuantFusionPass
            self.passes.append(AddRMSNormQuantFusionPass(config))
        # Add more passes here as needed
--- a/vllm_ascend/compilation/passes/norm_quant_fusion_pass.py
+++ b/vllm_ascend/compilation/passes/norm_quant_fusion_pass.py
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -88,8 +88,7 @@ class NPUPlatform(Platform):
        Get the custom compile backend. Previously, we used EagerAdaptor by default. 
        To use graph fusion operations, we defined our own backend compiler.
        """
-        from vllm_ascend.compilation.compiler_interface import AscendCompiler
-        return AscendCompiler.__module__ + "." + AscendCompiler.__name__
+        return "vllm_ascend.compilation.compiler_interface.AscendCompiler"

    @classmethod
    def pre_register_and_update(cls,
@@ -225,8 +224,8 @@ class NPUPlatform(Platform):
        if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
            compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE

-        from vllm_ascend.compilation.compiler_interface import AscendCompiler
-        compilation_config.oot_compiler = AscendCompiler.__module__ + "." + AscendCompiler.__name__
+        # get custom compile backend for graph fusion
+        compilation_config.oot_compiler = cls.get_compile_backend()

        if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
            compilation_config.mode = CompilationMode.NONE