Optimize qwen2_vl and qwen2_5_vl (#701)

### What this PR does / why we need it?
Optimize qwen2_vl and qwen2_5_vl.

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
Tested this PR on a 1080p image with tp=1, bs=1 on Qwen2-VL and
Qwen2.5-VL: the duration of each fa (flash-attention) op dropped from
11ms to 9ms, roughly a 22% performance boost.
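For reproducibility, below is a minimal micro-benchmark sketch of how such a per-op timing could be taken; the shapes, iteration counts, and the `torch.npu.synchronize()` call are assumptions, not the actual harness used for the numbers above:

```python
import time

import torch
import torch_npu  # noqa: F401  # registers the NPU backend and torch.npu namespace
from vllm.model_executor.layers.activation import QuickGELU

# Hypothetical activation shape; the real workload is the ViT tower on a 1080p image.
x = torch.randn(1024, 5120, dtype=torch.float16, device="npu")
layer = QuickGELU()

# Warm up so kernel compilation/caching does not pollute the timing.
for _ in range(10):
    layer(x)
torch.npu.synchronize()  # assumes torch_npu exposes torch.npu.synchronize()

start = time.perf_counter()
for _ in range(100):
    layer(x)
torch.npu.synchronize()
print(f"avg per call: {(time.perf_counter() - start) / 100 * 1e3:.3f} ms")
```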

---------

Signed-off-by: zouyida2052 <zouyida@huawei.com>
Signed-off-by: zouyida2052 <zouyida2002@gmail.com>
Co-authored-by: zouyida2052 <zouyida@huawei.com>
zouyida2052
2025-04-30 14:22:38 +08:00
committed by GitHub
parent 90aabaeb2e
commit ba9714ccee
4 changed files with 559 additions and 27 deletions

@@ -16,7 +16,7 @@
 #
 import torch
-from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul
 def silu_and_mul_forward_oot(self, x: torch.Tensor) -> torch.Tensor:
@@ -26,4 +26,12 @@ def silu_and_mul_forward_oot(self, x: torch.Tensor) -> torch.Tensor:
     return out
-SiluAndMul.forward_oot = silu_and_mul_forward_oot
+
+
+def quick_gelu_forward_oot(self, x: torch.Tensor) -> torch.Tensor:
+    # Dispatch QuickGELU to the fused Ascend fast-GELU kernel.
+    import torch_npu
+    out = torch_npu.npu_fast_gelu(x)
+    return out
+
+
+QuickGELU.forward_oot = quick_gelu_forward_oot
+SiluAndMul.forward_oot = silu_and_mul_forward_oot
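
For context, vLLM's `CustomOp` layers dispatch to `forward_oot` on out-of-tree platforms, so assigning `QuickGELU.forward_oot` swaps in the fused kernel without touching the model code. A minimal sanity-check sketch against the QuickGELU reference formula `x * sigmoid(1.702 * x)` follows; since `npu_fast_gelu` is a different GELU approximation, the tolerances and shapes here are assumptions, not tested thresholds:

```python
import torch
import torch_npu  # provides the fused npu_fast_gelu kernel
from vllm.model_executor.layers.activation import QuickGELU

x = torch.randn(128, 1280, dtype=torch.float16, device="npu")

# Reference: QuickGELU as defined in vLLM, x * sigmoid(1.702 * x).
ref = x * torch.sigmoid(1.702 * x)

# Patched path: forward_oot now calls the fused Ascend kernel.
out = QuickGELU()(x)

# Compare with a loose tolerance, since the fused op approximates GELU
# differently than the sigmoid-based reference.
torch.testing.assert_close(out, ref, rtol=1e-2, atol=1e-2)
```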