[Fusion] [Graph] Add qknorm rope fusion operator (#4711)

### What this PR does / why we need it?
This PR adds a `qkv_rmsnorm_rope` operator and introduces a graph fusion
pass for `qknorm_rope` operations. The implementation includes a new
configuration flag, a pattern-matching pass built on
`torch._inductor.pattern_matcher`, and a custom Triton kernel for the
fused operation.
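As a rough intuition for what the fused operator computes, here is a minimal pure-Python sketch: RMSNorm applied to the q/k heads, followed by rotary position embedding (RoPE). This is an illustration of the math only, not the Triton kernel in this PR; all function names here are hypothetical.

```python
import math

def rmsnorm(x, weight, eps=1e-6):
    # RMSNorm: x / sqrt(mean(x^2) + eps), scaled by a learned weight.
    rms = math.sqrt(sum(v * v for v in x) / len(x) + eps)
    return [v / rms * w for v, w in zip(x, weight)]

def rope(x, pos, base=10000.0):
    # Rotate each pair (x[2i], x[2i+1]) by angle pos * base^(-2i/d).
    d = len(x)
    out = []
    for i in range(0, d, 2):
        theta = pos * base ** (-i / d)
        c, s = math.cos(theta), math.sin(theta)
        out += [x[i] * c - x[i + 1] * s, x[i] * s + x[i + 1] * c]
    return out

def qknorm_rope_fused(q, k, wq, wk, pos):
    # Conceptually one kernel: norm + rotary for both q and k, avoiding
    # the intermediate tensors a sequential implementation would write out.
    return rope(rmsnorm(q, wq), pos), rope(rmsnorm(k, wk), pos)
```

The payoff of the fusion is not a different result but fewer kernel launches and memory round-trips on the normalized intermediates.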

Co-authored-by: Angazenn <supperccell@163.com>

### Does this PR introduce _any_ user-facing change?
Yes, this PR adds a new `additional_config` option, `fuse_qknorm_rope` (default: `False`).
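For illustration, the new flag could be switched on through vLLM's `additional_config` with a fragment like the one below; the nesting key `ascend_compilation_config` is an assumption based on the class name in this PR, not a value confirmed by the diff.

```json
{
  "ascend_compilation_config": {
    "fuse_qknorm_rope": true
  }
}
```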

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: wxsIcey <1790571317@qq.com>
Author: Icey
Date: 2025-12-17 08:53:44 +08:00 (committed by GitHub)
Parent: b1a853b0f6
Commit: cadfa5ddc1
14 changed files with 754 additions and 71 deletions


@@ -17,6 +17,7 @@ from typing import Optional
 from uuid import uuid4
 from vllm.logger import logger
+from vllm.triton_utils import HAS_TRITON
 def check_kv_extra_config(vllm_config):
@@ -231,7 +232,10 @@ class AscendCompilationConfig:
     deployed on Ascend platforms.
     """
-    def __init__(self, fuse_norm_quant: bool = True, **kwargs):
+    def __init__(self,
+                 fuse_norm_quant: bool = True,
+                 fuse_qknorm_rope: bool = False,
+                 **kwargs):
         """
         Initialize the configuration.
@@ -239,11 +243,12 @@ class AscendCompilationConfig:
             fuse_norm_quant (bool): Whether to enable norm and quant fusion optimization.
                 When set to True, the system will optimize norm and quant operations.
                 Default: True
+            fuse_qknorm_rope (bool): Whether to enable qknorm and rope fusion optimization.
+                Default: False
             **kwargs: Additional optional parameters for forward compatibility and configuration extension.
         """
         self.fuse_norm_quant = fuse_norm_quant
         # Add more compilation related configs here as needed
+        self.fuse_qknorm_rope = HAS_TRITON and fuse_qknorm_rope
class XliteGraphConfig:
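The effective flag is gated on Triton availability, since the fused kernel is written in Triton. A minimal sketch of that gating, assuming the intended semantics are "enabled only when Triton is present and the user opted in" (`HAS_TRITON` is stubbed here, and the class name is an illustrative stand-in):

```python
# Stub for vllm.triton_utils.HAS_TRITON; in vLLM this reflects whether
# Triton is importable on the current platform.
HAS_TRITON = False

class AscendCompilationConfigSketch:
    """Illustrative stand-in for the AscendCompilationConfig in this PR."""

    def __init__(self, fuse_norm_quant: bool = True,
                 fuse_qknorm_rope: bool = False, **kwargs):
        self.fuse_norm_quant = fuse_norm_quant
        # The fused qknorm+rope kernel needs Triton, so the effective flag
        # AND-s the user's opt-in with Triton availability.
        self.fuse_qknorm_rope = HAS_TRITON and fuse_qknorm_rope
```

With this gating, requesting the fusion on a platform without Triton silently falls back to the unfused path instead of failing at kernel-launch time.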