[Fusion] [Graph] Add qknorm rope fusion operator (#4711)
### What this PR does / why we need it?
This PR adds a `qkv_rmsnorm_rope` operator and introduces a graph fusion
pass for `qknorm_rope` operations. The implementation includes a new
configuration flag, a pattern-matching pass built on
`torch._inductor.pattern_matcher` (sketched below), and a custom Triton
kernel for the fused operation.
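For orientation, the sketch below shows the general shape such a registration takes with `torch._inductor.pattern_matcher`. It is illustrative only: the pattern is reduced to the RMSNorm half of the fusion (the RoPE half is omitted), and `torch.ops.npu.fused_qknorm_rope`, the shapes, and the dtypes are placeholders, not the names this PR actually registers.

```python
# Minimal sketch of registering a fusion pattern with
# torch._inductor.pattern_matcher. The op name `fused_qknorm_rope`
# and all shapes/dtypes below are placeholders, not this PR's code.
import torch
from torch._inductor.pattern_matcher import (PatternMatcherPass, fwd_only,
                                             register_replacement)

qknorm_rope_pass = PatternMatcherPass()

def rmsnorm_pattern(x, weight):
    # Un-fused subgraph the pass searches for: RMSNorm over the last
    # dim (the RoPE half of the real pattern is omitted for brevity).
    variance = x.float().pow(2).mean(-1, keepdim=True)
    return (x.float() * torch.rsqrt(variance + 1e-6)).to(x.dtype) * weight

def rmsnorm_replacement(x, weight):
    # Replacement calls the fused custom op (placeholder name; the real
    # pass targets the op backed by the PR's Triton kernel).
    return torch.ops.npu.fused_qknorm_rope(x, weight)

# Example inputs only pin down shapes/dtypes for tracing the pattern.
example_inputs = [
    torch.empty(8, 128, dtype=torch.float16),
    torch.empty(128, dtype=torch.float16),
]
register_replacement(rmsnorm_pattern, rmsnorm_replacement,
                     example_inputs, fwd_only, qknorm_rope_pass)
```

During compilation the pass is applied to the FX graph, e.g. `qknorm_rope_pass.apply(gm.graph)`, which rewrites every matched subgraph to call the fused op.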
Co-authored-by: Angazenn <supperccell@163.com>
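The custom Triton kernel itself is not reproduced here. As rough orientation for the kernel's general structure, below is a minimal standalone Triton RMSNorm kernel: one program per row, reduction in fp32, cast back on store. It covers only the normalization half of the fused qknorm+rope op and is a sketch under stated assumptions (contiguous 2-D input whose row fits in one block), not the kernel shipped in this PR.

```python
import torch
import triton
import triton.language as tl

@triton.jit
def rmsnorm_kernel(x_ptr, w_ptr, out_ptr, n_cols, eps, BLOCK: tl.constexpr):
    # One program handles one row; accumulate in fp32 for stability.
    row = tl.program_id(0)
    cols = tl.arange(0, BLOCK)
    mask = cols < n_cols
    offs = row * n_cols + cols
    x = tl.load(x_ptr + offs, mask=mask, other=0.0).to(tl.float32)
    rms = tl.sqrt(tl.sum(x * x, axis=0) / n_cols + eps)
    w = tl.load(w_ptr + cols, mask=mask, other=0.0).to(tl.float32)
    y = x / rms * w
    tl.store(out_ptr + offs, y.to(out_ptr.dtype.element_ty), mask=mask)

def rmsnorm(x: torch.Tensor, weight: torch.Tensor,
            eps: float = 1e-6) -> torch.Tensor:
    # Assumes a contiguous 2-D input; the whole row fits in one block.
    assert x.ndim == 2 and x.is_contiguous()
    n_rows, n_cols = x.shape
    out = torch.empty_like(x)
    BLOCK = triton.next_power_of_2(n_cols)
    rmsnorm_kernel[(n_rows,)](x, weight, out, n_cols, eps, BLOCK=BLOCK)
    return out
```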
### Does this PR introduce _any_ user-facing change?
Yes, a new option is added under `additional_config` to toggle the fusion pass.
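A hedged example of how such a flag would be switched on through `additional_config` (the key name `enable_qknorm_rope_fusion` is a guess for illustration; check the PR for the real name):

```python
from vllm import LLM

# `additional_config` is vLLM's pass-through dict for platform plugins
# such as vllm-ascend; the key below is illustrative, not the PR's
# actual flag name.
llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",
    additional_config={"enable_qknorm_rope_fusion": True},
)
```

The same dict can be passed on the command line via `--additional-config` with a JSON string.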
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: wxsIcey <1790571317@qq.com>
```diff
@@ -25,6 +25,7 @@ from vllm_ascend.ascend_forward_context import set_ascend_forward_context
 from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
+from vllm_ascend.ops.rotary_embedding import update_cos_sin
 from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType

 PADDING_SLOT_ID = -1
@@ -143,6 +144,9 @@ class EagleProposer(Proposer):
                   aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
                   batch_descriptor=None,
                   dummy_compute_logits=lambda hidden_states: None):
+        # update global cos, sin
+        update_cos_sin(self.positions[:num_tokens])
+
         with set_ascend_forward_context(None,
                                         self.vllm_config,
                                         num_tokens=num_tokens):
@@ -338,6 +342,8 @@ class EagleProposer(Proposer):
         builder = self.runner.attn_groups[0][0].get_metadata_builder()
         attn_metadata = builder.build(0, common_attn_metadata,
                                       self.runner.get_model())
+        # update global cos, sin
+        update_cos_sin(self.positions[:num_input_tokens])

         with set_ascend_forward_context(attn_metadata,
                                         self.vllm_config,
@@ -443,6 +449,10 @@ class EagleProposer(Proposer):

             attn_metadata.attn_mask = attn_mask
             # Run the model.
+
+            # update global cos, sin
+            update_cos_sin(self.positions[:input_batch_size])
+
             with set_ascend_forward_context(attn_metadata,
                                             self.vllm_config,
                                             num_tokens=input_batch_size):
```