Drop vLLM 0.13.0 support (#6069)

### What this PR does / why we need it?
Drop support for vLLM v0.13.0 and upgrade the supported version to v0.14.0, removing all `vllm_version_is('0.13.0')` compatibility branches.

- vLLM version: v0.13.0
- vLLM main:
d68209402d
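
For context, vllm-ascend gates release-specific code paths behind a `vllm_version_is` helper in `vllm_ascend.utils`; this PR deletes every `vllm_version_is('0.13.0')` branch. Below is a minimal sketch of that gating pattern — the reimplementation of the helper is an assumption for illustration, not the actual vllm-ascend source:

```python
# Minimal sketch of the version-gating pattern removed by this PR.
# The real helper lives in vllm_ascend.utils; this body is an
# assumption for illustration only.
import vllm


def vllm_version_is(target: str) -> bool:
    """Return True if the installed vLLM release matches `target` exactly."""
    return vllm.__version__ == target


# Typical call site shape before this PR:
if vllm_version_is('0.13.0'):
    ...  # 0.13.0-era compatibility path (now deleted)
else:
    ...  # current path, kept as the only path
```

Once 0.13.0 is no longer supported, the `else` branch becomes the unconditional code path, which is exactly the simplification the hunks below perform.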

---------

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Author: zhangxinyuehfad
Committed: 2026-01-23 09:45:08 +08:00 (via GitHub)
Parent: 27a513b672
Commit: 819a4459ce
39 changed files with 86 additions and 272 deletions


@@ -31,8 +31,7 @@ if HAS_TRITON:
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.utils import (AscendDeviceType, enable_custom_op,
-                               get_ascend_device_type, has_rope, is_vl_model,
-                               vllm_version_is)
+                               get_ascend_device_type, has_rope, is_vl_model)
 # Currently, rope ops used on npu requires detached cos && sin as inputs.
 # However, RotaryEmbedding in vllm use cos_sin_cache as a whole variable.
@@ -637,18 +636,8 @@ class AscendApplyRotaryEmb(ApplyRotaryEmb):
         cos: torch.Tensor,
         sin: torch.Tensor,
     ) -> torch.Tensor:
-        if vllm_version_is('0.13.0'):
-            origin_shape = x.shape
-            origin_dtype = x.dtype
-            if len(origin_shape) == 3:
-                x = x.unsqueeze(0)
-            if self.enable_fp32_compute:
-                x = x.float()
-                cos = cos.float()
-                sin = sin.float()
-        else:
-            x, cos, sin, origin_shape, origin_dtype = self._pre_process(
-                x, cos, sin)
+        x, cos, sin, origin_shape, origin_dtype = self._pre_process(
+            x, cos, sin)
         head_dim = x.shape[-1]
         # cos, sin: [seq_len, head_dim // 2]
@@ -660,12 +649,6 @@ class AscendApplyRotaryEmb(ApplyRotaryEmb):
         output = torch_npu.npu_rotary_mul(x, cos, sin)
-        if vllm_version_is('0.13.0'):
-            if len(origin_shape) == 3:
-                output = output.squeeze(0)
-            if self.enable_fp32_compute:
-                output = output.to(origin_dtype)
-        else:
-            output = self._post_process(output, origin_shape, origin_dtype)
+        output = self._post_process(output, origin_shape, origin_dtype)
         return output
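
The deleted 0.13.0 branch inlined the shape and dtype handling that the unified path now delegates to `self._pre_process` / `self._post_process`. The sketch below reconstructs what that pair plausibly does, inferred directly from the removed branch; the class name and exact signatures here are hypothetical and may differ from the real `AscendApplyRotaryEmb`:

```python
import torch


class RotaryEmbHelperSketch:
    """Hypothetical stand-in for AscendApplyRotaryEmb's helper methods,
    reconstructed from the deleted 0.13.0 branch; not the real class."""

    def __init__(self, enable_fp32_compute: bool = False):
        self.enable_fp32_compute = enable_fp32_compute

    def _pre_process(self, x: torch.Tensor, cos: torch.Tensor,
                     sin: torch.Tensor):
        origin_shape = x.shape
        origin_dtype = x.dtype
        if len(origin_shape) == 3:
            # npu_rotary_mul operates on 4-D tensors, so lift
            # [seq, heads, head_dim] to [1, seq, heads, head_dim].
            x = x.unsqueeze(0)
        if self.enable_fp32_compute:
            # Optionally run the rotation in float32 for accuracy.
            x, cos, sin = x.float(), cos.float(), sin.float()
        return x, cos, sin, origin_shape, origin_dtype

    def _post_process(self, output: torch.Tensor, origin_shape,
                      origin_dtype: torch.dtype) -> torch.Tensor:
        # Undo the pre-processing: restore the original rank and dtype.
        if len(origin_shape) == 3:
            output = output.squeeze(0)
        if self.enable_fp32_compute:
            output = output.to(origin_dtype)
        return output
```

With both version paths consolidated into this single pair, `forward` no longer needs a runtime version check, which is what makes the straight deletion in the hunks above safe.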