Drop vLLM 0.13.0 support (#6069)

### What this PR does / why we need it?
Drop support for vLLM v0.13.0 and upgrade the supported version to v0.14.0, removing all `vllm_version_is('0.13.0')` compatibility branches.

- vLLM version: v0.13.0
- vLLM main:
d68209402d
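
For context, vllm-ascend gates release-specific code paths behind a `vllm_version_is` helper in `vllm_ascend.utils`; this PR deletes every `vllm_version_is('0.13.0')` branch. Below is a minimal sketch of that gating pattern — the reimplementation of the helper is an assumption for illustration, not the actual vllm-ascend source:

```python
# Minimal sketch of the version-gating pattern removed by this PR.
# The real helper lives in vllm_ascend.utils; this body is an
# assumption for illustration only.
import vllm


def vllm_version_is(target: str) -> bool:
    """Return True if the installed vLLM release matches `target` exactly."""
    return vllm.__version__ == target


# Typical call site shape before this PR:
if vllm_version_is('0.13.0'):
    ...  # 0.13.0-era compatibility path (now deleted)
else:
    ...  # current path, kept as the only path
```

Once 0.13.0 is no longer supported, the `else` branch becomes the unconditional code path, which is exactly the simplification the hunks below perform.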

---------

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Author: zhangxinyuehfad
Committed: 2026-01-23 09:45:08 +08:00 (via GitHub)
Parent: 27a513b672
Commit: 819a4459ce
39 changed files with 86 additions and 272 deletions


@@ -31,8 +31,7 @@ if HAS_TRITON:
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.utils import (AscendDeviceType, enable_custom_op,
-                               get_ascend_device_type, has_rope, is_vl_model,
-                               vllm_version_is)
+                               get_ascend_device_type, has_rope, is_vl_model)
 # Currently, rope ops used on npu requires detached cos && sin as inputs.
 # However, RotaryEmbedding in vllm use cos_sin_cache as a whole variable.
@@ -637,18 +636,8 @@ class AscendApplyRotaryEmb(ApplyRotaryEmb):
         cos: torch.Tensor,
         sin: torch.Tensor,
     ) -> torch.Tensor:
-        if vllm_version_is('0.13.0'):
-            origin_shape = x.shape
-            origin_dtype = x.dtype
-            if len(origin_shape) == 3:
-                x = x.unsqueeze(0)
-            if self.enable_fp32_compute:
-                x = x.float()
-                cos = cos.float()
-                sin = sin.float()
-        else:
-            x, cos, sin, origin_shape, origin_dtype = self._pre_process(
-                x, cos, sin)
+        x, cos, sin, origin_shape, origin_dtype = self._pre_process(
+            x, cos, sin)
         head_dim = x.shape[-1]
         # cos, sin: [seq_len, head_dim // 2]
@@ -660,12 +649,6 @@ class AscendApplyRotaryEmb(ApplyRotaryEmb):
         output = torch_npu.npu_rotary_mul(x, cos, sin)
-        if vllm_version_is('0.13.0'):
-            if len(origin_shape) == 3:
-                output = output.squeeze(0)
-            if self.enable_fp32_compute:
-                output = output.to(origin_dtype)
-        else:
-            output = self._post_process(output, origin_shape, origin_dtype)
+        output = self._post_process(output, origin_shape, origin_dtype)
         return output
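
The deleted 0.13.0 branch inlined the shape and dtype handling that the unified path now delegates to `self._pre_process` / `self._post_process`. The sketch below reconstructs what that pair plausibly does, inferred directly from the removed branch; the class name and exact signatures here are hypothetical and may differ from the real `AscendApplyRotaryEmb`:

```python
import torch


class RotaryEmbHelperSketch:
    """Hypothetical stand-in for AscendApplyRotaryEmb's helper methods,
    reconstructed from the deleted 0.13.0 branch; not the real class."""

    def __init__(self, enable_fp32_compute: bool = False):
        self.enable_fp32_compute = enable_fp32_compute

    def _pre_process(self, x: torch.Tensor, cos: torch.Tensor,
                     sin: torch.Tensor):
        origin_shape = x.shape
        origin_dtype = x.dtype
        if len(origin_shape) == 3:
            # npu_rotary_mul operates on 4-D tensors, so lift
            # [seq, heads, head_dim] to [1, seq, heads, head_dim].
            x = x.unsqueeze(0)
        if self.enable_fp32_compute:
            # Optionally run the rotation in float32 for accuracy.
            x, cos, sin = x.float(), cos.float(), sin.float()
        return x, cos, sin, origin_shape, origin_dtype

    def _post_process(self, output: torch.Tensor, origin_shape,
                      origin_dtype: torch.dtype) -> torch.Tensor:
        # Undo the pre-processing: restore the original rank and dtype.
        if len(origin_shape) == 3:
            output = output.squeeze(0)
        if self.enable_fp32_compute:
            output = output.to(origin_dtype)
        return output
```

With both version paths consolidated into this single pair, `forward` no longer needs a runtime version check, which is what makes the straight deletion in the hunks above safe.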