Drop 0.12.0 support (#5146)

We plan to release v0.13.0 soon, so there is no longer any need to keep 0.12.0 compatibility.
Let's drop it.
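
For context, every compatibility branch removed here was gated on the vllm_version_is helper from vllm_ascend.utils. A minimal sketch of what such a check amounts to (illustrative only; the real helper may differ in detail):

# Minimal sketch only -- not the actual vllm_ascend.utils implementation.
# It answers "is the installed vllm exactly this version?", which is what the
# removed `if vllm_version_is("0.12.0"):` branches relied on.
from importlib.metadata import version


def vllm_version_is(target: str) -> bool:
    return version("vllm") == target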

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
wangxiyuan
2025-12-20 09:38:53 +08:00
committed by GitHub
parent 243ab7d720
commit 758d81dcb1
21 changed files with 63 additions and 149 deletions

View File

@@ -274,15 +274,6 @@ class AscendFusedMoE(FusedMoE):
     def update_expert_map(self, new_expert_map):
         self._expert_map = new_expert_map
 
-    @property
-    def expert_map(self) -> torch.Tensor | None:
-        return self._expert_map
-
-    @expert_map.setter
-    def expert_map(self, new_expert_map):
-        # TODO(Potabk): Remove this once we drop vllm v0.12.0(This makes backward compatibility with vllm v0.12.0)
-        self._expert_map = new_expert_map
-
     def get_log2phy_map(self):
         return self.log2phy

View File

@@ -17,15 +17,10 @@
 import os
 
 import vllm_ascend.patch.platform.patch_distributed  # noqa
+import vllm_ascend.patch.platform.patch_ec_connector  # noqa
 import vllm_ascend.patch.platform.patch_mamba_config  # noqa
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
-from vllm_ascend.utils import vllm_version_is
 
 if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv(
         "EXPERT_MAP_RECORD", "false") == "true":
     import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa
-
-if vllm_version_is("0.12.0"):
-    import vllm_ascend.patch.platform.patch_ec_connector012  # noqa
-else:
-    import vllm_ascend.patch.platform.patch_ec_connector  # noqa

View File

@@ -1,33 +0,0 @@
-import vllm.distributed.ec_transfer.ec_connector.shared_storage_connector  # type: ignore[import-not-found] # noqa
-from safetensors.torch import load_file
-from vllm.distributed.ec_transfer.ec_connector.base import \
-    ECConnectorMetadata  # type: ignore[import-not-found] # noqa
-from vllm.distributed.ec_transfer.ec_connector.shared_storage_connector import (  # type: ignore[import-not-found] # noqa
-    ECSharedStorageConnector, ECSharedStorageConnectorMetadata)
-from vllm.logger import logger
-
-
-class AscendECSharedStorageConnector(ECSharedStorageConnector):
-
-    def start_load_caches(self, encoder_cache, **kwargs) -> None:
-        metadata: ECConnectorMetadata = self._get_connector_metadata()
-        assert isinstance(metadata, ECSharedStorageConnectorMetadata)
-        assert encoder_cache is not None
-        if metadata is None:
-            logger.warning((
-                "In connector.start_load_caches, ",
-                "but the connector metadata is None",
-            ))
-            return
-        # Load the EC for each mm data
-        for mm_data in metadata.mm_datas:
-            if mm_data.mm_hash in encoder_cache:
-                continue
-            filename = self._generate_filename_debug(mm_data.mm_hash)
-            ec_cache = load_file(filename)["ec_cache"].npu()
-            encoder_cache[mm_data.mm_hash] = ec_cache
-            logger.debug("Success load encoder cache for hash %s",
-                         mm_data.mm_hash)
-
-
-vllm.distributed.ec_transfer.ec_connector.shared_storage_connector.ECSharedStorageConnector = AscendECSharedStorageConnector

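Note on the deleted shim above: it worked by module-attribute patching, subclassing the upstream connector and rebinding the name inside the upstream vllm module so that any later lookup resolved to the NPU-aware class. A generic, self-contained sketch of that pattern (hypothetical names, not the vllm API):

# Generic sketch of the monkey-patch pattern used by the deleted module.
# "upstream" stands in for vllm's shared_storage_connector module.
import types

upstream = types.ModuleType("upstream")


class Connector:
    def load(self) -> str:
        return "cpu path"


upstream.Connector = Connector


class AscendConnector(Connector):
    def load(self) -> str:
        return "npu path"


# Rebinding the module attribute means code that looks up upstream.Connector
# after this point gets the patched subclass.
upstream.Connector = AscendConnector

assert upstream.Connector().load() == "npu path"
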
View File

@@ -351,22 +351,16 @@ class NPUPlatform(Platform):
             CUSTOM_OP_REGISTERED = True
 
     @classmethod
-    def get_attn_backend_cls(cls, selected_backend, *args, **kwargs):
-        if "attn_selector_config" in kwargs:
-            use_mla = kwargs["attn_selector_config"].use_mla
-            use_sparse = kwargs["attn_selector_config"].use_sparse
-        else:
-            use_mla = kwargs.get("use_mla",
-                                 args[4] if len(args) >= 5 else None)
-            use_sparse = kwargs.get("use_sparse",
-                                    args[6] if len(args) >= 7 else None)
+    def get_attn_backend_cls(cls, selected_backend, attn_selector_config):
         backend_map = {
             (True, False): "vllm_ascend.attention.mla_v1.AscendMLABackend",
             (False, False):
             "vllm_ascend.attention.attention_v1.AscendAttentionBackend",
             (True, True): "vllm_ascend.attention.sfa_v1.AscendSFABackend",
         }
-        return backend_map[(use_mla, use_sparse)]
+        return backend_map[(attn_selector_config.use_mla,
+                            attn_selector_config.use_sparse)]
 
     @classmethod
     def get_punica_wrapper(cls) -> str:

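For illustration, the simplified dispatch above can be exercised as follows (a minimal sketch; AttnCfg is a hypothetical stand-in for whatever object vllm passes as attn_selector_config, modeling only the two flags the map keys on):

# Minimal sketch: a stand-in config object exercising the (use_mla, use_sparse)
# dispatch table shown above. The real attn_selector_config comes from vllm.
from dataclasses import dataclass


@dataclass
class AttnCfg:  # hypothetical stand-in, not vllm's type
    use_mla: bool
    use_sparse: bool


backend_map = {
    (True, False): "vllm_ascend.attention.mla_v1.AscendMLABackend",
    (False, False): "vllm_ascend.attention.attention_v1.AscendAttentionBackend",
    (True, True): "vllm_ascend.attention.sfa_v1.AscendSFABackend",
}


def pick_backend(cfg: AttnCfg) -> str:
    return backend_map[(cfg.use_mla, cfg.use_sparse)]


assert pick_backend(AttnCfg(use_mla=True, use_sparse=False)).endswith("AscendMLABackend")
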
View File

@@ -116,8 +116,7 @@ from vllm_ascend.spec_decode.interface import SpecDcodeType
 from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
 from vllm_ascend.utils import (AscendDeviceType, ProfileExecuteDuration,
                                enable_sp, get_ascend_device_type, is_moe_model,
-                               lmhead_tp_enable, maybe_trans_nz,
-                               vllm_version_is)
+                               lmhead_tp_enable, maybe_trans_nz)
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
 
 from vllm_ascend.ascend_forward_context import (  # isort: skip
@@ -243,24 +242,15 @@ class NPUModelRunner(GPUModelRunner):
         # Set up Attention
         self.use_sparse = hasattr(self.vllm_config.model_config.hf_config,
                                   "index_topk")
-        if vllm_version_is('0.12.0'):
-            self.attn_backend = get_attn_backend(
-                0,
-                self.dtype,
-                None,
-                self.block_size,
-                use_mla=self.model_config.use_mla,
-                use_sparse=self.use_sparse)
-        else:
-            self.attn_backend = get_attn_backend(
-                0,
-                self.dtype,
-                None,
-                self.block_size,
-                use_mla=self.model_config.use_mla,
-                use_sparse=self.use_sparse,
-                use_mm_prefix=self.model_config is not None
-                and self.model_config.is_mm_prefix_lm)
+        self.attn_backend = get_attn_backend(
+            0,
+            self.dtype,
+            None,
+            self.block_size,
+            use_mla=self.model_config.use_mla,
+            use_sparse=self.use_sparse,
+            use_mm_prefix=self.model_config is not None
+            and self.model_config.is_mm_prefix_lm)
 
         self.attn_mask_builder = AttentionMaskBuilder(self.device)
         self._set_up_drafter()
@@ -1877,36 +1867,19 @@ class NPUModelRunner(GPUModelRunner):
                 self.speculative_config.method == "mtp":
             attn_state = AscendAttentionState.SpecDecoding
 
-        if vllm_version_is("0.12.0"):
-            common_metadata = CommonAttentionMetadata(
-                query_start_loc=self.query_start_loc.gpu[:num_reqs + 1],
-                query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs + 1],
-                seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
-                seq_lens=self.seq_lens.cpu[:num_reqs],
-                num_reqs=num_reqs,
-                num_actual_tokens=num_tokens,
-                block_table_tensor=block_table_tensor[:num_reqs],
-                slot_mapping=slot_mapping.gpu,
-                num_computed_tokens_cpu=num_computed_tokens_cpu,
-                max_query_len=max_query_len,
-                max_seq_len=seq_lens)
-        else:
-            common_metadata = CommonAttentionMetadata(
-                query_start_loc=self.query_start_loc.gpu[:num_reqs + 1],
-                query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs + 1],
-                _seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
-                seq_lens=self.seq_lens.cpu[:num_reqs],
-                num_reqs=num_reqs,
-                num_actual_tokens=num_tokens,
-                block_table_tensor=block_table_tensor[:num_reqs],
-                slot_mapping=slot_mapping.gpu,
-                _num_computed_tokens_cpu=num_computed_tokens_cpu,
-                max_query_len=max_query_len,
-                max_seq_len=seq_lens)
+        common_metadata = CommonAttentionMetadata(
+            query_start_loc=self.query_start_loc.gpu[:num_reqs + 1],
+            query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs + 1],
+            _seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
+            seq_lens=self.seq_lens.cpu[:num_reqs],
+            num_reqs=num_reqs,
+            num_actual_tokens=num_tokens,
+            block_table_tensor=block_table_tensor[:num_reqs],
+            slot_mapping=slot_mapping.gpu,
+            _num_computed_tokens_cpu=num_computed_tokens_cpu,
+            max_query_len=max_query_len,
+            max_seq_len=seq_lens)
 
         for attn_group in self.attn_groups[kv_cache_group_id]:
             builder = attn_group.get_metadata_builder()

View File

@@ -22,6 +22,7 @@ import torch
 from vllm.lora.request import LoRARequest
 from vllm.pooling_params import PoolingParams
 from vllm.v1.outputs import LogprobsTensors
+from vllm.v1.pool.metadata import PoolingStates
 from vllm.v1.sample.logits_processor import (BatchUpdateBuilder,
                                              LogitsProcessors)
 from vllm.v1.worker.gpu_input_batch import InputBatch
@@ -29,16 +30,6 @@ from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm_ascend.worker.block_table import MultiGroupBlockTable
 
 
-class PoolingStates:
-    # NOTE: This should be removed after we drop support of vLLM v0.12.0
-
-    def __init__(self):
-        # for chunked prefill with ALL pooling
-        self.hidden_states_cache: list[torch.Tensor] = []
-
-    def clean(self):
-        self.hidden_states_cache.clear()
-
-
 class NPUInputBatch(InputBatch):
 
     def __init__(