Drop 0.10.2 (#3284)

Drop v0.10.2 support; we now support vLLM v0.11.0rc3.
- vLLM version: v0.11.0rc3
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-10-09 10:28:38 +08:00
committed by GitHub
parent 2dde1268c7
commit f12f76d7ba
17 changed files with 202 additions and 653 deletions

View File

@@ -78,10 +78,12 @@ from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
# yapf: disable
from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec,
KVCacheConfig, KVCacheGroupSpec,
KVCacheSpec, MambaSpec)
KVCacheSpec, MambaSpec,
UniformTypeKVCacheSpecs)
# yapf: enable
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
DraftTokenIds, LogprobsTensors, ModelRunnerOutput)
DraftTokenIds, LogprobsTensors, ModelRunnerOutput,
PoolerOutput)
from vllm.v1.pool.metadata import PoolingMetadata
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
@@ -121,7 +123,7 @@ from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
AscendSocVersion, ProfileExecuteDuration,
get_ascend_soc_version, is_310p,
lmhead_tp_enable, vllm_version_is)
lmhead_tp_enable)
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
if TYPE_CHECKING:
@@ -143,13 +145,6 @@ if is_310p():
else:
ACL_FORMAT = ACL_FORMAT_FRACTAL_ND
if not vllm_version_is("0.10.2"):
from vllm.v1.kv_cache_interface import UniformTypeKVCacheSpecs
from vllm.v1.outputs import PoolerOutput
else:
from vllm.sequence import PoolerOutput
UniformTypeKVCacheSpecs = None
@dataclass
class GraphCaptureContext:
@@ -308,23 +303,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
dtype=self.dtype,
device=self.device)
# Set up Attention
if vllm_version_is("0.10.2"):
self.attn_backend = get_attn_backend(
0,
self.dtype,
None,
self.block_size,
self.model_config.is_attention_free,
use_mla=self.model_config.use_mla,
use_sfa=self.ascend_config.use_sfa)
else:
self.attn_backend = get_attn_backend(
0,
self.dtype,
None,
self.block_size,
use_mla=self.model_config.use_mla,
use_sfa=self.ascend_config.use_sfa)
self.attn_backend = get_attn_backend(
0,
self.dtype,
None,
self.block_size,
use_mla=self.model_config.use_mla,
use_sfa=self.ascend_config.use_sfa)
if torch.version.cann.startswith("8.3"):
self.attn_mask_builder = AttentionMaskBuilder(
self.scheduler_config.max_num_batched_tokens, self.dtype,
@@ -602,12 +587,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
to_update.apply(pooling_params)
backward_kwargs = {}
if vllm_version_is("0.10.2"):
backward_kwargs["mm_kwargs"] = new_req_data.mm_kwargs
backward_kwargs["mm_hashes"] = new_req_data.mm_hashes
backward_kwargs["mm_positions"] = new_req_data.mm_positions
else:
backward_kwargs["mm_features"] = new_req_data.mm_features
backward_kwargs["mm_features"] = new_req_data.mm_features
self.requests[req_id] = CachedRequestState(
req_id=req_id,
@@ -624,10 +604,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# Only relevant for models using M-RoPE (e.g, Qwen2-VL)
if self.uses_mrope:
if vllm_version_is("0.10.2"):
self._init_mrope_positions_0102(self.requests[req_id])
else:
self._init_mrope_positions(self.requests[req_id])
self._init_mrope_positions(self.requests[req_id])
req_ids_to_add.append(req_id)
@@ -759,39 +736,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
use_audio_in_video=use_audio_in_video,
)
def _init_mrope_positions_0102(self, req_state: CachedRequestState):
image_grid_thw = []
video_grid_thw = []
second_per_grid_ts = []
audio_feature_lengths = []
use_audio_in_video = False
assert req_state.mm_kwargs is not None
for mm_item in req_state.mm_kwargs:
mm_input = mm_item.get_data()
if mm_input.get("image_grid_thw") is not None:
image_grid_thw.append(mm_input["image_grid_thw"].tolist())
if mm_input.get("video_grid_thw") is not None:
video_grid_thw.append(mm_input["video_grid_thw"].tolist())
if mm_input.get("second_per_grid_ts") is not None:
second_per_grid_ts.append(mm_input["second_per_grid_ts"])
if mm_input.get("audio_feature_lengths") is not None:
audio_feature_lengths.append(mm_input["audio_feature_lengths"])
if mm_input.get("use_audio_in_video") is True:
use_audio_in_video = True
hf_config = self.model_config.hf_config
req_state.mrope_positions, req_state.mrope_position_delta = \
MRotaryEmbedding.get_input_positions_tensor(
req_state.prompt_token_ids,
hf_config=hf_config,
image_grid_thw=image_grid_thw,
video_grid_thw=video_grid_thw,
second_per_grid_ts=second_per_grid_ts,
audio_feature_lengths=audio_feature_lengths,
use_audio_in_video=use_audio_in_video,
)
def _sync_metadata_across_dp(
self, num_tokens: int, with_prefill: bool, enable_dbo: bool
) -> tuple[int, Optional[torch.Tensor], bool, bool]:
@@ -966,12 +910,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
return
# Batch the multi-modal inputs.
if vllm_version_is("0.10.2"):
mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler_0102(
scheduler_output)
else:
mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
scheduler_output)
mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
scheduler_output)
encoder_outputs = []
for _, num_items, mm_kwargs_group in group_mm_kwargs_by_modality(
@@ -1003,31 +943,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
is_embed=pos_info.is_embed,
)
# TODO: remove this once we drop support for vLLM 0.10.2
def _batch_mm_kwargs_from_scheduler_0102(
self,
scheduler_output: "SchedulerOutput",
) -> tuple[list[MultiModalKwargsItem], list[tuple[str, PlaceholderRange]]]:
scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
if not scheduled_encoder_inputs:
return [], []
# Batch the multi-modal inputs.
mm_kwargs = list[MultiModalKwargsItem]()
# list of tuple (mm_hash, position_info)
mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
req_state = self.requests[req_id]
assert req_state.mm_hashes is not None
assert req_state.mm_kwargs is not None
assert req_state.mm_positions is not None
for mm_input_id in encoder_input_ids:
mm_hash = req_state.mm_hashes[mm_input_id]
mm_kwargs.append(req_state.mm_kwargs[mm_input_id])
mm_hashes_pos.append(
(mm_hash, req_state.mm_positions[mm_input_id]))
return mm_kwargs, mm_hashes_pos
def _batch_mm_kwargs_from_scheduler(
self,
scheduler_output: "SchedulerOutput",
@@ -1067,20 +982,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
) -> list[torch.Tensor]:
def _iter_mm_features(req_state: CachedRequestState):
if vllm_version_is("0.10.2"):
# legacy path (to be removed later)
assert req_state.mm_hashes is not None
assert req_state.mm_positions is not None
for mm_hash, pos_info in zip(req_state.mm_hashes,
req_state.mm_positions):
yield mm_hash, pos_info, getattr(pos_info, "is_embed",
None)
else:
assert req_state.mm_features is not None
for mm_feature in req_state.mm_features:
pos_info = mm_feature.mm_position
yield mm_feature.identifier, pos_info, getattr(
pos_info, "is_embed", None)
assert req_state.mm_features is not None
for mm_feature in req_state.mm_features:
pos_info = mm_feature.mm_position
yield mm_feature.identifier, pos_info, getattr(
pos_info, "is_embed", None)
mm_embeds: list[torch.Tensor] = []
@@ -1527,10 +1433,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
for attn_group in self.attn_groups[kv_cache_group_id]:
common_prefix_len = 0
extra_attn_metadata_args = {}
if vllm_version_is("0.10.2"):
builder = attn_group.metadata_builder
else:
builder = attn_group.get_metadata_builder()
builder = attn_group.get_metadata_builder()
if isinstance(builder, GDNAttentionMetadataBuilder):
if use_spec_decode:
extra_attn_metadata_args = dict(
@@ -1809,29 +1712,21 @@ class NPUModelRunner(LoRAModelRunnerMixin):
device=hidden_states.device)
seq_lens_cpu = self.seq_lens_cpu[:self.input_batch.num_reqs]
if vllm_version_is("0.10.2"):
# Pooling models D2H & synchronize occurs in pooler.py:build_output
raw_pooler_output = self.model.pooler(
hidden_states=hidden_states, pooling_metadata=pooling_metadata)
else:
model = cast(VllmModelForPooling, self.model)
raw_pooler_output = model.pooler(
hidden_states=hidden_states,
pooling_metadata=pooling_metadata,
)
raw_pooler_output = json_map_leaves(
lambda x: x.to("cpu", non_blocking=True),
raw_pooler_output,
)
torch.npu.synchronize()
model = cast(VllmModelForPooling, self.model)
raw_pooler_output = model.pooler(
hidden_states=hidden_states,
pooling_metadata=pooling_metadata,
)
raw_pooler_output = json_map_leaves(
lambda x: x.to("cpu", non_blocking=True),
raw_pooler_output,
)
torch.npu.synchronize()
pooler_output: list[Optional[torch.Tensor]] = []
for raw_output, seq_len, prompt_len in zip(
raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens):
if vllm_version_is("0.10.2"):
output = raw_output.data if seq_len == prompt_len else None
else:
output = raw_output if seq_len == prompt_len else None
output = raw_output if seq_len == prompt_len else None
pooler_output.append(output)
return ModelRunnerOutput(
@@ -2006,8 +1901,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
num_scheduled_tokens_np, finished_sending,
finished_recving, kv_connector_output)
sample_hidden_states = hidden_states[logits_indices]
logits = self._compute_logits_wrapper(sample_hidden_states,
None)
logits = self.model.compute_logits(sample_hidden_states)
if broadcast_pp_output:
model_output_broadcast_data = {
"logits": logits.contiguous(),
@@ -2302,10 +2196,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
)
for attn_group in self.attn_groups[kv_cache_group_id]:
if vllm_version_is("0.10.2"):
builder = attn_group.metadata_builder
else:
builder = attn_group.get_metadata_builder()
builder = attn_group.get_metadata_builder()
attn_metadata_i = builder.build_for_graph_capture(
common_attn_metadata)
for layer_name in kv_cache_group_spec.layer_names:
@@ -2463,8 +2354,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
dtype=torch.int32)
def dummy_compute_logits(hidden_states):
return self._compute_logits_wrapper(
hidden_states[dummy_indices], None)
return self.model.compute_logits(
hidden_states[dummy_indices])
with set_ascend_forward_context(
attn_metadata,
@@ -2542,18 +2433,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
logit_indices = np.cumsum(num_scheduled_tokens) - 1
# TODO: need to run a dummy sampler for generate task
hidden_states = hidden_states[logit_indices]
output = self._compute_logits_wrapper(hidden_states, None)
output = self.model.compute_logits(hidden_states)
NPUPlatform.synchronize()
del hidden_states, output
self.encoder_cache.clear()
gc.collect()
def _compute_logits_wrapper(self, hidden_states, sampling_metadata):
if vllm_version_is("0.10.2"):
return self.model.compute_logits(hidden_states, sampling_metadata)
return self.model.compute_logits(hidden_states)
def _dummy_pooler_run_task(
self,
hidden_states: torch.Tensor,
@@ -2615,10 +2501,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
for task in self.get_supported_pooling_tasks():
# Run a full batch with each task to ensure none of them OOMs
output = self._dummy_pooler_run_task(hidden_states, task)
if vllm_version_is("0.10.2"):
output_size[task] = output.get_data_nbytes()
else:
output_size[task] = sum(o.nbytes for o in output)
output_size[task] = sum(o.nbytes for o in output)
del output # Allow GC
max_task = max(output_size.items(), key=lambda x: x[1])[0]
@@ -2657,16 +2540,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
self.model.get_eagle3_aux_hidden_state_layers())
if self.lora_config:
if vllm_version_is("0.10.2"):
self.model = self.load_lora_model(self.model,
self.model_config,
self.scheduler_config,
self.lora_config,
self.device)
else:
self.model = self.load_lora_model(self.model,
self.vllm_config,
self.device)
self.model = self.load_lora_model(self.model, self.vllm_config,
self.device)
logger.info("Loading model weights took %.4f GB",
m.consumed_memory / float(2**30))
@@ -2694,17 +2569,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
self.initialize_attn_backend(kv_cache_config)
self.use_hybrid_blocks = (len(self.attn_groups) > 1)
# NOTE: Currently, we determine whether we need `num_accepted_tokens` through `MambaSpec`.
if vllm_version_is("0.10.2"):
self.need_accepted_tokens = any([
isinstance(
self.kv_cache_config.kv_cache_groups[0].kv_cache_spec,
MambaSpec) for attn_group in self.attn_groups
])
else:
self.need_accepted_tokens = any([
isinstance(attn_group[0].kv_cache_spec, MambaSpec)
for attn_group in self.attn_groups
])
self.need_accepted_tokens = any([
isinstance(attn_group[0].kv_cache_spec, MambaSpec)
for attn_group in self.attn_groups
])
self.may_reinitialize_input_batch(kv_cache_config)
@@ -2737,11 +2605,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size
kv_caches: Dict[str, torch.Tensor] = {}
for group in self._kv_cache_spec_attn_group_iterator_dispatcher():
if vllm_version_is("0.10.2"):
kv_cache_spec, group = group
else:
kv_cache_spec = group.kv_cache_spec
for group in self._kv_cache_spec_attn_group_iterator():
kv_cache_spec = group.kv_cache_spec
attn_backend = group.backend
for layer_name in group.layer_names:
if layer_name in self.runner_only_attn_layers:
@@ -2846,11 +2711,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size
kv_caches: Dict[str, torch.Tensor] = {}
for group in self._kv_cache_spec_attn_group_iterator_dispatcher():
if vllm_version_is("0.10.2"):
kv_cache_spec, group = group
else:
kv_cache_spec = group.kv_cache_spec
for group in self._kv_cache_spec_attn_group_iterator():
kv_cache_spec = group.kv_cache_spec
attn_backend = group.backend
for layer_name in group.layer_names:
if layer_name in self.runner_only_attn_layers:
@@ -2996,11 +2858,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
)), "Some layers are not correctly initialized"
kv_caches: Dict[str, torch.Tensor] = {}
for group in self._kv_cache_spec_attn_group_iterator_dispatcher():
if vllm_version_is("0.10.2"):
kv_cache_spec, group = group
else:
kv_cache_spec = group.kv_cache_spec
for group in self._kv_cache_spec_attn_group_iterator():
kv_cache_spec = group.kv_cache_spec
attn_backend = group.backend
for layer_name in group.layer_names:
if layer_name in self.runner_only_attn_layers:
@@ -3211,50 +3070,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
for k, v in attn_backend_layers.items()
}
def get_attn_backends_for_layers(
layer_names: list[str]
) -> dict[type[AttentionBackend], list[str]]:
"""Get attention_backend for all attention layers
TODO: Only used in v0.10.2, drop me when 0.10.2 is dropped
"""
layers = get_layers_from_vllm_config(self.vllm_config,
AttentionLayerBase,
layer_names)
attn_backends = {}
attn_backend_layers = defaultdict(list)
# Dedupe based on full class name; this is a bit safer than
# using the class itself as the key because when we create dynamic
# attention backend subclasses (e.g. ChunkedLocalAttention) unless
# they are cached correctly, there will be different objects per
# layer.
for layer_name in layer_names:
attn_backend = layers[layer_name].get_attn_backend()
key = attn_backend.full_cls_name()
attn_backends[key] = attn_backend
attn_backend_layers[key].append(layer_name)
return {
attn_backends[k]: v
for k, v in attn_backend_layers.items()
}
def create_attn_groups_v0102(
attn_backends_map: dict[AttentionBackend, list[str]],
kv_cache_spec: KVCacheSpec,
) -> list[AttentionGroup]:
attn_groups: list[AttentionGroup] = []
for attn_backend, layer_names in attn_backends_map.items():
attn_metadata_builder_i = attn_backend.get_builder_cls()(
kv_cache_spec,
layer_names,
self.vllm_config,
self.device,
)
attn_group = AttentionGroup(attn_backend,
attn_metadata_builder_i,
layer_names)
attn_groups.append(attn_group)
return attn_groups
def create_attn_groups(
attn_backends_map: dict[AttentionBackend, list[str]],
) -> list[AttentionGroup]:
@@ -3274,18 +3089,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
attn_groups.append(attn_group)
return attn_groups
if vllm_version_is("0.10.2"):
for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
kv_cache_spec = kv_cache_group_spec.kv_cache_spec
attn_backends = get_attn_backends_for_layers(
kv_cache_group_spec.layer_names)
self.attn_groups.append(
create_attn_groups_v0102(attn_backends, kv_cache_spec))
else:
for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
attn_backends = get_attn_backends_for_group( # type: ignore
kv_cache_group_spec)
self.attn_groups.append(create_attn_groups(attn_backends))
for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
attn_backends = get_attn_backends_for_group( # type: ignore
kv_cache_group_spec)
self.attn_groups.append(create_attn_groups(attn_backends))
# Calculate reorder batch threshold (if needed)
self.calculate_reorder_batch_threshold()
@@ -3299,31 +3106,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
for attn_groups in self.attn_groups:
yield from attn_groups
def _kv_cache_spec_attn_group_iterator_v0102(
self) -> Iterator[tuple[KVCacheSpec, AttentionGroup]]:
if not self.kv_cache_config.kv_cache_groups:
return
for kv_cache_spec_id, attn_groups in enumerate(self.attn_groups):
for attn_group in attn_groups:
yield self.kv_cache_config.kv_cache_groups[
kv_cache_spec_id].kv_cache_spec, attn_group
def _kv_cache_spec_attn_group_iterator_dispatcher(self):
if vllm_version_is("0.10.2"):
return self._kv_cache_spec_attn_group_iterator_v0102()
else:
return self._kv_cache_spec_attn_group_iterator()
def calculate_reorder_batch_threshold(self) -> None:
"""
Check that if any backends reorder batches; that the reordering
is compatible (e.g., decode threshold is the same)
"""
for group in self._attn_group_iterator():
if vllm_version_is("0.10.2"):
attn_metadata_builder_i = group.metadata_builder
else:
attn_metadata_builder_i = group.get_metadata_builder()
attn_metadata_builder_i = group.get_metadata_builder()
if hasattr(attn_metadata_builder_i, "reorder_batch_threshold"):
# check that if any backends reorder batches; that the reordering
# is compatible (e.g., decode threshold is the same)
@@ -3427,10 +3216,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
min_ag_builder_name = None
for attn_group in self._attn_group_iterator():
if vllm_version_is("0.10.2"):
builder = attn_group.metadata_builder
else:
builder = attn_group.get_metadata_builder()
builder = attn_group.get_metadata_builder()
if builder.aclgraph_support.value < min_ag_support.value:
min_ag_support = builder.aclgraph_support
min_ag_builder_name = builder.__class__.__name__
@@ -3674,7 +3460,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
req_idx = self.input_batch.req_id_to_index[req_id]
offset = self.query_start_loc_np[req_idx].item()
prompt_hidden_states = hidden_states[offset:offset + num_logits]
logits = self._compute_logits_wrapper(prompt_hidden_states, None)
logits = self.model.compute_logits(prompt_hidden_states)
# Get the "target" tokens for each index. For prompt at index i,
# the token at prompt index i+1 is the "sampled" token we want

View File

@@ -39,7 +39,6 @@ from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
from vllm.v1.utils import copy_slice
from vllm_ascend.utils import vllm_version_is
from vllm_ascend.worker.block_table import MultiGroupBlockTable
@@ -79,12 +78,6 @@ class CachedRequestState:
@deprecated("`mm_inputs` is superseded by `mm_kwargs` and will be "
"removed in v0.13. Please use `mm_kwargs` instead.")
def mm_inputs(self) -> list[MultiModalKwargsItems]:
if vllm_version_is("0.10.2"):
assert self.mm_kwargs is not None
return [
MultiModalKwargsItems.from_seq([item])
for item in self.mm_kwargs
]
assert self.mm_features is not None
return [
MultiModalKwargsItems.from_seq([f.data]) for f in self.mm_features