[main] Optimize rope in Qwen Models (#2571)

### What this PR does / why we need it?
Optimize RoPE in Qwen models by computing sin and cos once at the first layer and caching them.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with newly added and existing tests.

- vLLM version: v0.10.1.1
- vLLM main: 562663a044

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: ZYang6263 <zy626375@gmail.com>
Signed-off-by: rjg-lyh <1318825571@qq.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
Co-authored-by: ZYang6263 <51255902183@stu.ecnu.edu.cn>
Co-authored-by: ZYang6263 <zy626375@gmail.com>
2025-09-09 14:28:14 +08:00
parent 5bcb4c1528
commit 7a205dbaa8
4 changed files with 136 additions and 47 deletions
--- a/vllm_ascend/models/qwen3_moe.py
+++ b/vllm_ascend/models/qwen3_moe.py
@@ -20,6 +20,7 @@
 from typing import Optional, Union

 import torch
+import torch_npu
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.compilation.decorators import support_torch_compile
@@ -280,6 +281,11 @@ class CustomQwen3MoeModel(Qwen3MoeModel):
        self.make_empty_intermediate_tensors = (
            make_empty_intermediate_tensors_factory(
                ["hidden_states", "residual"], config.hidden_size))
+        # Call ATB matmul to warm up; otherwise, the first operation (ReshapeAndCache) may cause performance degradation at runtime.
+        x = torch.rand((2, 4), dtype=torch.float16).npu()
+        weight = torch.rand((2, 4), dtype=torch.float16).npu()
+        c = torch.rand((4, 4), dtype=torch.float32).npu()
+        torch_npu._npu_matmul_add_fp32(x, weight, c)

    def forward(
        self,