Drop vLLM 0.13.0 support (#6069)

### What this PR does / why we need it?
Drop vLLM 0.13.0 support and upgrade to vLLM 0.14.0.

- vLLM version: v0.13.0
- vLLM main: d68209402d
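
Most of the diff below deletes compatibility gates of the same shape. As a minimal sketch of the pattern being dropped (the `AttentionMetadata` import is one instance taken from the hunks below), the 0.13.0 branch goes away and only the 0.14.0 path is kept:

```python
# Sketch of the version gate this PR removes. With 0.13.0 support dropped,
# only the else-branch (vLLM >= 0.14.0) import survives, unconditionally.
from vllm_ascend.utils import vllm_version_is

if vllm_version_is('0.13.0'):
    # old import location in vLLM 0.13.0
    from vllm.attention.backends.abstract import AttentionMetadata  # type: ignore
else:
    # new import location in vLLM 0.14.0
    from vllm.v1.attention.backend import AttentionMetadata  # type: ignore
```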

---------

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Author: zhangxinyuehfad
Date: 2026-01-23 09:45:08 +08:00
Committed by: GitHub
Parent: 27a513b672
Commit: 819a4459ce

39 changed files with 86 additions and 272 deletions


@@ -50,7 +50,7 @@ from vllm_ascend.quantization.w8a8_dynamic import \
 from vllm_ascend.utils import (AscendDeviceType, enable_sp,
                                get_ascend_device_type, maybe_trans_nz,
                                npu_stream_switch, shared_expert_dp_enabled,
-                               shared_experts_calculation_stream, vllm_version_is)
+                               shared_experts_calculation_stream)

 @dataclass
 class FusedMoEResult:
@@ -451,12 +451,7 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
         # Qwen3-Next specific gating mechanism
         if hasattr(self._shared_experts, "expert_gate") and \
                 self._shared_experts.expert_gate is not None:
-            if vllm_version_is('0.13.0'):
-                # TODO(jianzs): remove this branch after vLLM new version is
-                # released
-                gate_out = self._shared_experts.expert_gate(hidden_states)  # type: ignore
-            else:
-                gate_out, _ = self._shared_experts.expert_gate(hidden_states)  # type: ignore
+            gate_out, _ = self._shared_experts.expert_gate(hidden_states)  # type: ignore
             shared_out = F.sigmoid(gate_out) * shared_out
         return shared_out
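
The behavioural bit of this hunk: in vLLM 0.14.0 the shared-expert gate call returns a tuple rather than a bare tensor, so only the first element feeds the sigmoid gate. A hedged, self-contained sketch of the call site (assuming `expert_gate` behaves like vLLM's linear layers, which return `(output, output_bias)`):

```python
import torch
import torch.nn.functional as F


def apply_shared_expert_gate(expert_gate: torch.nn.Module,
                             hidden_states: torch.Tensor,
                             shared_out: torch.Tensor) -> torch.Tensor:
    # vLLM 0.13.0 returned a plain tensor here; 0.14.0 returns a tuple,
    # so the second element (assumed to be the bias) is discarded.
    gate_out, _ = expert_gate(hidden_states)
    return F.sigmoid(gate_out) * shared_out
```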


@@ -31,16 +31,9 @@ from vllm.model_executor.layers.mla import (MLAModules,
                                             MultiHeadLatentAttentionWrapper)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata  # type: ignore

 from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.utils import vllm_version_is
-
-# isort: off
-if vllm_version_is('0.13.0'):
-    from vllm.attention.backends.abstract import AttentionMetadata  # type: ignore
-else:
-    from vllm.v1.attention.backend import AttentionMetadata  # type: ignore
-# isort: on

 class IndexerWrapper(nn.Module):


@@ -20,16 +20,10 @@ import torch
 import torch.nn.functional as F
 import torch_npu
 from vllm.config import MultiModalConfig
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention  # type: ignore

 import vllm_ascend.envs as envs_ascend
-from vllm_ascend.utils import vllm_version_is
-
-# isort: off
-if vllm_version_is('0.13.0'):
-    from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention  # type: ignore
-else:
-    from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention  # type: ignore
-# isort: on

 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight


@@ -31,8 +31,7 @@ if HAS_TRITON:
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.utils import (AscendDeviceType, enable_custom_op,
-                               get_ascend_device_type, has_rope, is_vl_model,
-                               vllm_version_is)
+                               get_ascend_device_type, has_rope, is_vl_model)

 # Currently, rope ops used on npu requires detached cos && sin as inputs.
 # However, RotaryEmbedding in vllm use cos_sin_cache as a whole variable.
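
Context for the comment above: vLLM's `RotaryEmbedding` stores a single `cos_sin_cache` buffer with cos and sin concatenated along the last dimension, while the NPU rope kernels want them as two separate tensors. A rough sketch of the split (illustrative only, not the actual vllm_ascend helper):

```python
import torch


def split_cos_sin(cos_sin_cache: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    # cos_sin_cache: [max_position, rotary_dim], cos occupies the first half
    # of the last dim and sin the second half (vLLM concatenates them).
    cos, sin = cos_sin_cache.chunk(2, dim=-1)
    # The NPU rope ops take cos and sin as separate, detached inputs.
    return cos.contiguous(), sin.contiguous()
```
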
@@ -637,18 +636,8 @@ class AscendApplyRotaryEmb(ApplyRotaryEmb):
         cos: torch.Tensor,
         sin: torch.Tensor,
     ) -> torch.Tensor:
-        if vllm_version_is('0.13.0'):
-            origin_shape = x.shape
-            origin_dtype = x.dtype
-            if len(origin_shape) == 3:
-                x = x.unsqueeze(0)
-            if self.enable_fp32_compute:
-                x = x.float()
-                cos = cos.float()
-                sin = sin.float()
-        else:
-            x, cos, sin, origin_shape, origin_dtype = self._pre_process(
-                x, cos, sin)
+        x, cos, sin, origin_shape, origin_dtype = self._pre_process(
+            x, cos, sin)

         head_dim = x.shape[-1]
         # cos, sin: [seq_len, head_dim // 2]
@@ -660,12 +649,6 @@ class AscendApplyRotaryEmb(ApplyRotaryEmb):
             output = torch_npu.npu_rotary_mul(x, cos, sin)

-        if vllm_version_is('0.13.0'):
-            if len(origin_shape) == 3:
-                output = output.squeeze(0)
-            if self.enable_fp32_compute:
-                output = output.to(origin_dtype)
-        else:
-            output = self._post_process(output, origin_shape, origin_dtype)
+        output = self._post_process(output, origin_shape, origin_dtype)
         return output
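
The deleted 0.13.0 branches above inline the shape and dtype handling that `_pre_process` / `_post_process` now own. Reconstructed purely from that removed code, the helpers presumably behave roughly as follows (a sketch of assumed behaviour, not the actual vllm_ascend implementation):

```python
import torch


# Sketch of AscendApplyRotaryEmb helper methods, inferred from the removed
# 0.13.0 branch; the real implementations may differ.
def _pre_process(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
    origin_shape = x.shape
    origin_dtype = x.dtype
    if len(origin_shape) == 3:
        # Add a leading dim, presumably because npu_rotary_mul wants 4-D input.
        x = x.unsqueeze(0)
    if self.enable_fp32_compute:
        # Optionally run the rotary multiplication in fp32 for accuracy.
        x, cos, sin = x.float(), cos.float(), sin.float()
    return x, cos, sin, origin_shape, origin_dtype


def _post_process(self, output: torch.Tensor, origin_shape: torch.Size,
                  origin_dtype: torch.dtype) -> torch.Tensor:
    # Undo the reshaping/casting applied in _pre_process.
    if len(origin_shape) == 3:
        output = output.squeeze(0)
    if self.enable_fp32_compute:
        output = output.to(origin_dtype)
    return output
```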


@@ -14,14 +14,7 @@ import torch.nn.functional as F
 import triton
 import triton.language as tl

-from vllm_ascend.utils import vllm_version_is
-
-# isort: off
-if vllm_version_is('0.13.0'):
-    from vllm.attention.backends.utils import PAD_SLOT_ID  # type: ignore
-else:
-    from vllm.v1.attention.backends.utils import PAD_SLOT_ID  # type: ignore
-# isort: on
+from vllm.v1.attention.backends.utils import PAD_SLOT_ID  # type: ignore

 def causal_conv1d_ref(