Drop vLLM 0.13.0 support (#6069)

### What this PR does / why we need it?
Drop vLLM 0.13.0 support and upgrade to vLLM 0.14.0.

- vLLM version: v0.13.0
- vLLM main: d68209402d
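
Most of the diff below deletes compatibility gates of the same shape. As a minimal sketch of the pattern being dropped (the `AttentionMetadata` import is one instance taken from the hunks below), the 0.13.0 branch goes away and only the 0.14.0 path is kept:

```python
# Sketch of the version gate this PR removes. With 0.13.0 support dropped,
# only the else-branch (vLLM >= 0.14.0) import survives, unconditionally.
from vllm_ascend.utils import vllm_version_is

if vllm_version_is('0.13.0'):
    # old import location in vLLM 0.13.0
    from vllm.attention.backends.abstract import AttentionMetadata  # type: ignore
else:
    # new import location in vLLM 0.14.0
    from vllm.v1.attention.backend import AttentionMetadata  # type: ignore
```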

---------

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Author: zhangxinyuehfad
Date: 2026-01-23 09:45:08 +08:00
Committed by: GitHub
Parent: 27a513b672
Commit: 819a4459ce

39 changed files with 86 additions and 272 deletions


@@ -50,7 +50,7 @@ from vllm_ascend.quantization.w8a8_dynamic import \
 from vllm_ascend.utils import (AscendDeviceType, enable_sp,
                                get_ascend_device_type, maybe_trans_nz,
                                npu_stream_switch, shared_expert_dp_enabled,
-                               shared_experts_calculation_stream, vllm_version_is)
+                               shared_experts_calculation_stream)

 @dataclass
 class FusedMoEResult:
@@ -451,12 +451,7 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
         # Qwen3-Next specific gating mechanism
         if hasattr(self._shared_experts, "expert_gate") and \
                 self._shared_experts.expert_gate is not None:
-            if vllm_version_is('0.13.0'):
-                # TODO(jianzs): remove this branch after vLLM new version is
-                # released
-                gate_out = self._shared_experts.expert_gate(hidden_states)  # type: ignore
-            else:
-                gate_out, _ = self._shared_experts.expert_gate(hidden_states)  # type: ignore
+            gate_out, _ = self._shared_experts.expert_gate(hidden_states)  # type: ignore
             shared_out = F.sigmoid(gate_out) * shared_out
         return shared_out
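
The behavioural bit of this hunk: in vLLM 0.14.0 the shared-expert gate call returns a tuple rather than a bare tensor, so only the first element feeds the sigmoid gate. A hedged, self-contained sketch of the call site (assuming `expert_gate` behaves like vLLM's linear layers, which return `(output, output_bias)`):

```python
import torch
import torch.nn.functional as F


def apply_shared_expert_gate(expert_gate: torch.nn.Module,
                             hidden_states: torch.Tensor,
                             shared_out: torch.Tensor) -> torch.Tensor:
    # vLLM 0.13.0 returned a plain tensor here; 0.14.0 returns a tuple,
    # so the second element (assumed to be the bias) is discarded.
    gate_out, _ = expert_gate(hidden_states)
    return F.sigmoid(gate_out) * shared_out
```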


@@ -31,16 +31,9 @@ from vllm.model_executor.layers.mla import (MLAModules,
                                             MultiHeadLatentAttentionWrapper)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata  # type: ignore

 from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.utils import vllm_version_is
-
-# isort: off
-if vllm_version_is('0.13.0'):
-    from vllm.attention.backends.abstract import AttentionMetadata  # type: ignore
-else:
-    from vllm.v1.attention.backend import AttentionMetadata  # type: ignore
-# isort: on

 class IndexerWrapper(nn.Module):


@@ -20,16 +20,10 @@ import torch
 import torch.nn.functional as F
 import torch_npu
 from vllm.config import MultiModalConfig
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention  # type: ignore

 import vllm_ascend.envs as envs_ascend
-from vllm_ascend.utils import vllm_version_is
-
-# isort: off
-if vllm_version_is('0.13.0'):
-    from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention  # type: ignore
-else:
-    from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention  # type: ignore
-# isort: on

 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight


@@ -31,8 +31,7 @@ if HAS_TRITON:
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.utils import (AscendDeviceType, enable_custom_op,
-                               get_ascend_device_type, has_rope, is_vl_model,
-                               vllm_version_is)
+                               get_ascend_device_type, has_rope, is_vl_model)

 # Currently, rope ops used on npu requires detached cos && sin as inputs.
 # However, RotaryEmbedding in vllm use cos_sin_cache as a whole variable.
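
Context for the comment above: vLLM's `RotaryEmbedding` stores a single `cos_sin_cache` buffer with cos and sin concatenated along the last dimension, while the NPU rope kernels want them as two separate tensors. A rough sketch of the split (illustrative only, not the actual vllm_ascend helper):

```python
import torch


def split_cos_sin(cos_sin_cache: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    # cos_sin_cache: [max_position, rotary_dim], cos occupies the first half
    # of the last dim and sin the second half (vLLM concatenates them).
    cos, sin = cos_sin_cache.chunk(2, dim=-1)
    # The NPU rope ops take cos and sin as separate, detached inputs.
    return cos.contiguous(), sin.contiguous()
```
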
@@ -637,18 +636,8 @@ class AscendApplyRotaryEmb(ApplyRotaryEmb):
         cos: torch.Tensor,
         sin: torch.Tensor,
     ) -> torch.Tensor:
-        if vllm_version_is('0.13.0'):
-            origin_shape = x.shape
-            origin_dtype = x.dtype
-            if len(origin_shape) == 3:
-                x = x.unsqueeze(0)
-            if self.enable_fp32_compute:
-                x = x.float()
-                cos = cos.float()
-                sin = sin.float()
-        else:
-            x, cos, sin, origin_shape, origin_dtype = self._pre_process(
-                x, cos, sin)
+        x, cos, sin, origin_shape, origin_dtype = self._pre_process(
+            x, cos, sin)

         head_dim = x.shape[-1]
         # cos, sin: [seq_len, head_dim // 2]
@@ -660,12 +649,6 @@ class AscendApplyRotaryEmb(ApplyRotaryEmb):
             output = torch_npu.npu_rotary_mul(x, cos, sin)

-        if vllm_version_is('0.13.0'):
-            if len(origin_shape) == 3:
-                output = output.squeeze(0)
-            if self.enable_fp32_compute:
-                output = output.to(origin_dtype)
-        else:
-            output = self._post_process(output, origin_shape, origin_dtype)
+        output = self._post_process(output, origin_shape, origin_dtype)
         return output
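
The deleted 0.13.0 branches above inline the shape and dtype handling that `_pre_process` / `_post_process` now own. Reconstructed purely from that removed code, the helpers presumably behave roughly as follows (a sketch of assumed behaviour, not the actual vllm_ascend implementation):

```python
import torch


# Sketch of AscendApplyRotaryEmb helper methods, inferred from the removed
# 0.13.0 branch; the real implementations may differ.
def _pre_process(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
    origin_shape = x.shape
    origin_dtype = x.dtype
    if len(origin_shape) == 3:
        # Add a leading dim, presumably because npu_rotary_mul wants 4-D input.
        x = x.unsqueeze(0)
    if self.enable_fp32_compute:
        # Optionally run the rotary multiplication in fp32 for accuracy.
        x, cos, sin = x.float(), cos.float(), sin.float()
    return x, cos, sin, origin_shape, origin_dtype


def _post_process(self, output: torch.Tensor, origin_shape: torch.Size,
                  origin_dtype: torch.dtype) -> torch.Tensor:
    # Undo the reshaping/casting applied in _pre_process.
    if len(origin_shape) == 3:
        output = output.squeeze(0)
    if self.enable_fp32_compute:
        output = output.to(origin_dtype)
    return output
```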


@@ -14,14 +14,7 @@ import torch.nn.functional as F
 import triton
 import triton.language as tl

-from vllm_ascend.utils import vllm_version_is
-
-# isort: off
-if vllm_version_is('0.13.0'):
-    from vllm.attention.backends.utils import PAD_SLOT_ID  # type: ignore
-else:
-    from vllm.v1.attention.backends.utils import PAD_SLOT_ID  # type: ignore
-# isort: on
+from vllm.v1.attention.backends.utils import PAD_SLOT_ID  # type: ignore

 def causal_conv1d_ref(