qwen3_moe/qwen25 support torchair graph (#2403)

### What this PR does / why we need it?
Added support for the TorchAir graph mode in qwen3_moe and qwen2.5.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
```python
from vllm import LLM

# `model`, `GPUs_per_dp_rank`, and `trust_remote_code` are defined by the
# surrounding test script.
llm = LLM(
    model=model,
    tensor_parallel_size=GPUs_per_dp_rank,
    enforce_eager=False,
    enable_expert_parallel=True,
    max_model_len=4096,
    max_num_seqs=16,
    trust_remote_code=trust_remote_code,
    gpu_memory_utilization=0.4,
    additional_config={
             "torchair_graph_config": {
                 "enabled": True,
                 "use_cached_graph": False,
                 "graph_batch_sizes_init": False,
                 "graph_batch_sizes": [16]
             },
             "ascend_scheduler_config": {
                 "enabled": True,
                 "chunked_prefill_enabled":True,
             },
             "refresh": True,
    },
)
```
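
As a quick end-to-end check, the configured engine can be driven through an ordinary generate call. A minimal sketch, assuming the `llm` object from the snippet above; the prompt and sampling settings here are illustrative, not from the PR:

```python
from vllm import SamplingParams

# Illustrative smoke test: a short decode pass through the TorchAir graph path.
prompts = ["The capital of France is"]
sampling_params = SamplingParams(temperature=0.8, max_tokens=64)

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(output.outputs[0].text)
```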

- vLLM version: v0.10.0
- vLLM main: b87cb97a53

Signed-off-by: taoyuxiang <oui.nicholas.tao@gmail.com>
Author: Nicholas Tao
Date: 2025-08-20 11:23:50 +08:00
Committed by: GitHub
Parent: 31ae249742
Commit: 7bec1a9b9c
9 changed files with 1123 additions and 9 deletions


```diff
@@ -19,6 +19,8 @@ import math
 from typing import Optional, Tuple
 
 import torch
+import torch.nn.functional as F
+import torch_npu
 
 from vllm.model_executor.layers.rotary_embedding import (
     DeepseekScalingRotaryEmbedding, RotaryEmbedding)
@@ -37,9 +39,11 @@ def rope_forward_oot(
     query: torch.Tensor,
     key: torch.Tensor,
     offsets: Optional[torch.Tensor] = None,
-    is_neox_style_override: Optional[bool] = None
+    is_neox_style_override: Optional[bool] = None,
+    is_qwen_torchair: Optional[bool] = False,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
-    if get_ascend_config().torchair_graph_config.enabled:
+    if get_ascend_config(
+    ).torchair_graph_config.enabled and not is_qwen_torchair:
         return self.forward_native(
             positions,
             query,
@@ -47,7 +51,6 @@ def rope_forward_oot(
             offsets,
         )
 
-    import torch_npu
     query_shape, key_shape = query.shape, key.shape
     if self.cos_sin_cache.device != query.device:
         self.cos_sin_cache = self.cos_sin_cache.to(query.device)
```
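
The new `is_qwen_torchair` flag defaults to `False`, so existing call sites keep the previous behavior and only the Qwen TorchAir path opts in. A hypothetical call site, not part of this diff, assuming `self.rotary_emb` is a patched `RotaryEmbedding` inside a Qwen attention layer:

```python
# Hypothetical opt-in from a Qwen attention layer (illustrative only):
q, k = self.rotary_emb(positions, q, k, is_qwen_torchair=True)
```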
```diff
@@ -246,6 +249,92 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
     self.register_buffer("sin_cached", sin_cached, persistent=False)
 
 
+def __set_cos_sin_cache(self, seq_len, device, dtype):
+    inv_freq = 1.0 / (self.base**(torch.arange(
+        0, self.rotary_dim, 2, device=device, dtype=torch.float32) *
+                                  (1 / self.rotary_dim)))
+    self.register_buffer("inv_freq", inv_freq)
+
+    t = torch.arange(self.max_position_embeddings,
+                     device=self.inv_freq.device,
+                     dtype=torch.float32)
+    freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+
+    emb = torch.cat((freqs, freqs), dim=-1)
+    self.register_buffer("cos", emb.cos().to(dtype=dtype), persistent=False)
+    self.register_buffer("sin", emb.sin().to(dtype=dtype), persistent=False)
+    self.embed = F.embedding
+
+
+_original_re_init = RotaryEmbedding.__init__
+
+
+def qwen_rope_init_func(
+    self,
+    head_size: int,
+    rotary_dim: int,
+    max_position_embeddings: int,
+    base: float,
+    is_neox_style: bool,
+    dtype: torch.dtype,
+) -> None:
+    _original_re_init(self, head_size, rotary_dim, max_position_embeddings,
+                      base, is_neox_style, dtype)
+    if get_ascend_config().torchair_graph_config.enabled:
+        __set_cos_sin_cache(self,
+                            seq_len=max_position_embeddings,
+                            device="npu",
+                            dtype=dtype)
+
+
+def rope_forward(
+    self,
+    positions: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    offsets: Optional[torch.Tensor] = None,
+    is_neox_style_override: Optional[bool] = None,
+    max_seq_len: Optional[int] = None,
+    is_prefill: Optional[bool] = True,
+    is_qwen_torchair: Optional[bool] = False,
+):
+    if get_ascend_config().torchair_graph_config.enabled \
+            and is_qwen_torchair and not is_prefill:
+        if max_seq_len is not None and torch.gt(max_seq_len,
+                                                self.max_position_embeddings):
+            __set_cos_sin_cache(self,
+                                seq_len=max_seq_len,
+                                device=query.device,
+                                dtype=torch.float32)
+
+        # bsnd/bnsd
+        if positions is not None:
+            cos = self.embed(positions, self.cos)
+            sin = self.embed(positions, self.sin)
+            self.cos_embed = cos
+            self.sin_embed = sin
+        else:
+            cos = self.cos_embed
+            sin = self.sin_embed
+
+        query = query.view(*query.shape[:-1], -1, self.head_size).contiguous()
+        key = key.view(*key.shape[:-1], -1, self.head_size).contiguous()
+
+        cos = cos.unsqueeze(-2).unsqueeze(-2)
+        sin = sin.unsqueeze(-2).unsqueeze(-2)
+
+        query = query.unsqueeze(1)
+        key = key.unsqueeze(1)
+
+        q_embed, k_embed = torch_npu.npu_apply_rotary_pos_emb(
+            query, key, cos, sin)
+
+        return q_embed.flatten(-2), k_embed.flatten(-2)
+    else:
+        return rope_forward_oot(self, positions, query, key, offsets,
+                                is_neox_style_override,
+                                is_qwen_torchair)  # type: ignore
+
+
 def deepseek_rope_init_func(
     self,
     head_size: int,
```
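
The `cos`/`sin` buffers built by `__set_cos_sin_cache` form the standard rotary table, and `rope_forward` later gathers per-token rows from them with `F.embedding`. A minimal CPU sketch of the same construction, with illustrative parameters in place of the model config, useful for checking shapes:

```python
import torch
import torch.nn.functional as F

# Illustrative values; the real ones come from the model config.
rotary_dim, base, max_pos = 128, 10000.0, 4096

# Same math as __set_cos_sin_cache above, on CPU for clarity.
inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2, dtype=torch.float32) /
                         rotary_dim))
t = torch.arange(max_pos, dtype=torch.float32)
freqs = torch.einsum("i,j->ij", t, inv_freq)  # (max_pos, rotary_dim // 2)
emb = torch.cat((freqs, freqs), dim=-1)       # (max_pos, rotary_dim)
cos, sin = emb.cos(), emb.sin()

# At decode time, rope_forward looks up per-token rows via F.embedding:
positions = torch.tensor([0, 5, 42])
cos_tok = F.embedding(positions, cos)         # (3, rotary_dim)
assert cos_tok.shape == (3, rotary_dim)
```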
```diff
@@ -283,7 +372,8 @@ def deepseek_rope_init_func(
         device="npu")
 
 
-RotaryEmbedding.forward_oot = rope_forward_oot
+RotaryEmbedding.__init__ = qwen_rope_init_func
+RotaryEmbedding.forward_oot = rope_forward
 
 # Note: we adopt the native huggingface deepseek rope initialization code from
 # https://huggingface.co/deepseek-ai/DeepSeek-V3-0324/blob/main/modeling_deepseek.py for
```
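
The two assignments rebind methods on vLLM's `RotaryEmbedding` class itself, so every instance now enters `rope_forward`, and non-TorchAir callers simply fall through to the original `rope_forward_oot` branch. A sanity-check sketch, assuming it runs in the same module as the patch so the patched functions are in scope:

```python
# Sketch: confirm the monkey-patch is active once this module is imported.
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding

assert RotaryEmbedding.__init__ is qwen_rope_init_func
assert RotaryEmbedding.forward_oot is rope_forward
```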