Drop vLLM 0.13.0 support (#6069)
### What this PR does / why we need it?
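Drop vLLM 0.13.0 support and upgrade to 0.14.0: remove the `vllm_version_is('0.13.0')` compatibility branches and keep only the code paths and import locations used with the newer vLLM version.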
- vLLM version: v0.13.0
- vLLM main: d68209402d
---------
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
@@ -50,7 +50,7 @@ from vllm_ascend.quantization.w8a8_dynamic import \
|
||||
from vllm_ascend.utils import (AscendDeviceType, enable_sp,
|
||||
get_ascend_device_type, maybe_trans_nz,
|
||||
npu_stream_switch, shared_expert_dp_enabled,
|
||||
shared_experts_calculation_stream, vllm_version_is)
|
||||
shared_experts_calculation_stream)
|
||||
|
||||
@dataclass
|
||||
class FusedMoEResult:
|
||||
@@ -451,12 +451,7 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
|
||||
# Qwen3-Next specific gating mechanism
|
||||
if hasattr(self._shared_experts, "expert_gate") and \
|
||||
self._shared_experts.expert_gate is not None:
|
||||
if vllm_version_is('0.13.0'):
|
||||
# TODO(jianzs): remove this branch after vLLM new version is
|
||||
# released
|
||||
gate_out = self._shared_experts.expert_gate(hidden_states) # type: ignore
|
||||
else:
|
||||
gate_out, _ = self._shared_experts.expert_gate(hidden_states) # type: ignore
|
||||
gate_out, _ = self._shared_experts.expert_gate(hidden_states) # type: ignore
|
||||
shared_out = F.sigmoid(gate_out) * shared_out
|
||||
return shared_out
|
||||
|
||||
|
||||
@@ -31,16 +31,9 @@ from vllm.model_executor.layers.mla import (MLAModules,
|
||||
MultiHeadLatentAttentionWrapper)
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.utils.torch_utils import direct_register_custom_op
|
||||
from vllm.v1.attention.backend import AttentionMetadata # type: ignore
|
||||
|
||||
from vllm_ascend.ascend_config import get_ascend_config
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
# isort: off
|
||||
if vllm_version_is('0.13.0'):
|
||||
from vllm.attention.backends.abstract import AttentionMetadata # type: ignore
|
||||
else:
|
||||
from vllm.v1.attention.backend import AttentionMetadata # type: ignore
|
||||
# isort: on
|
||||
|
||||
|
||||
class IndexerWrapper(nn.Module):
|
||||
|
||||
@@ -20,16 +20,10 @@ import torch
|
||||
import torch.nn.functional as F
|
||||
import torch_npu
|
||||
from vllm.config import MultiModalConfig
|
||||
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention # type: ignore
|
||||
|
||||
import vllm_ascend.envs as envs_ascend
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
# isort: off
|
||||
if vllm_version_is('0.13.0'):
|
||||
from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention # type: ignore
|
||||
else:
|
||||
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention # type: ignore
|
||||
# isort: on
|
||||
|
||||
MIN_PAD_SIZE = 64 # min_size to pad weight
|
||||
MAX_PAD_SIZE = 128 # max_size to pad weight
|
||||
|
||||
@@ -31,8 +31,7 @@ if HAS_TRITON:
|
||||
|
||||
from vllm_ascend.platform import NPUPlatform
|
||||
from vllm_ascend.utils import (AscendDeviceType, enable_custom_op,
|
||||
get_ascend_device_type, has_rope, is_vl_model,
|
||||
vllm_version_is)
|
||||
get_ascend_device_type, has_rope, is_vl_model)
|
||||
|
||||
# Currently, rope ops used on npu requires detached cos && sin as inputs.
|
||||
# However, RotaryEmbedding in vllm use cos_sin_cache as a whole variable.
|
||||
@@ -637,18 +636,8 @@ class AscendApplyRotaryEmb(ApplyRotaryEmb):
|
||||
cos: torch.Tensor,
|
||||
sin: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
if vllm_version_is('0.13.0'):
|
||||
origin_shape = x.shape
|
||||
origin_dtype = x.dtype
|
||||
if len(origin_shape) == 3:
|
||||
x = x.unsqueeze(0)
|
||||
if self.enable_fp32_compute:
|
||||
x = x.float()
|
||||
cos = cos.float()
|
||||
sin = sin.float()
|
||||
else:
|
||||
x, cos, sin, origin_shape, origin_dtype = self._pre_process(
|
||||
x, cos, sin)
|
||||
x, cos, sin, origin_shape, origin_dtype = self._pre_process(
|
||||
x, cos, sin)
|
||||
|
||||
head_dim = x.shape[-1]
|
||||
# cos, sin: [seq_len, head_dim // 2]
|
||||
@@ -660,12 +649,6 @@ class AscendApplyRotaryEmb(ApplyRotaryEmb):
|
||||
|
||||
output = torch_npu.npu_rotary_mul(x, cos, sin)
|
||||
|
||||
if vllm_version_is('0.13.0'):
|
||||
if len(origin_shape) == 3:
|
||||
output = output.squeeze(0)
|
||||
if self.enable_fp32_compute:
|
||||
output = output.to(origin_dtype)
|
||||
else:
|
||||
output = self._post_process(output, origin_shape, origin_dtype)
|
||||
output = self._post_process(output, origin_shape, origin_dtype)
|
||||
|
||||
return output
|
||||
|
||||
@@ -14,14 +14,7 @@ import torch.nn.functional as F
|
||||
import triton
|
||||
import triton.language as tl
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
# isort: off
|
||||
if vllm_version_is('0.13.0'):
|
||||
from vllm.attention.backends.utils import PAD_SLOT_ID # type: ignore
|
||||
else:
|
||||
from vllm.v1.attention.backends.utils import PAD_SLOT_ID # type: ignore
|
||||
# isort: on
|
||||
from vllm.v1.attention.backends.utils import PAD_SLOT_ID # type: ignore
|
||||
|
||||
|
||||
def causal_conv1d_ref(
|
||||
|
||||
Reference in New Issue
Block a user