[main2main] upgrade vllm to 0308 (#7213)

### What this PR does / why we need it?
Update main2main to vllm 0308.
Breaking vLLM changes handled by this update (see the sketch after the list for the recurring pattern):

* https://github.com/vllm-project/vllm/pull/30681
* https://github.com/vllm-project/vllm/pull/35552 removes `self.cudagraph_batch_sizes`
* https://github.com/vllm-project/vllm/pull/35158 `clear_metadata` -> `defer_finalize`
* https://github.com/vllm-project/vllm/pull/36006 removes `CacheConfig.cpu_offload_gb`
* https://github.com/vllm-project/vllm/pull/35472
* https://github.com/vllm-project/vllm/pull/34552 `attn_metadata_builder`
* https://github.com/vllm-project/vllm/pull/30515 `profile_seq_lens`
* https://github.com/vllm-project/vllm/pull/28053
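
The recurring change across the touched files is dropping the `vllm_version_is("0.17.0")` compatibility branches now that only vLLM main is targeted. A minimal before/after sketch of that pattern is below; `proposer` stands in for the `SpecDecodeBaseProposer` instance and the two function names are illustrative, while the helpers (`vllm_version_is`, `attn_metadata_builder`, `get_metadata_builder`) are the ones touched by the diff:

```python
# Sketch only: shows the shape of the change, not the actual vllm-ascend code.
from vllm_ascend.utils import vllm_version_is  # this import is dropped by the PR


def get_draft_metadata_builder_before(proposer, attn_group):
    # Old shape: branch on the installed vLLM version, with a fallback chain
    # for the non-0.17.0 path.
    if vllm_version_is("0.17.0"):
        return attn_group.get_metadata_builder()
    if proposer.attn_metadata_builder is None:
        return proposer._get_attention_metadata_builder()
    return proposer.attn_metadata_builder


def get_draft_metadata_builder_after(proposer, attn_group):
    # New shape: vLLM main always supplies the draft attention group,
    # so the builder always comes from it.
    assert attn_group is not None
    return attn_group.get_metadata_builder()
```

The same collapse is applied to the block-size lookup (always `draft_attn_groups[0].kv_cache_spec.block_size`) and to the per-draft-step loops in `SpecDecodeBaseProposer` shown in the diff.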

- vLLM version: v0.16.0
- vLLM main: 4034c3d32e

---------

Signed-off-by: MrZ20 <2609716663@qq.com>
Signed-off-by: menogrey <1299267905@qq.com>
Co-authored-by: MrZ20 <2609716663@qq.com>
Author: zhangyiming
Date: 2026-03-18 09:24:43 +08:00
Committed by: GitHub
Parent: 79ef41a53d
Commit: 1c954ff264
16 changed files with 223 additions and 168 deletions


@@ -46,7 +46,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.compilation.acl_graph import ACLGraphWrapper, update_full_graph_params
 from vllm_ascend.ops.triton.spec_decode.utils import prepare_inputs_padded_kernel
 from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
-from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled, vllm_version_is
+from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled
 # Currently we will fix block size to a small one since `num_reqs` can't be too large
 _PREPARE_INPUTS_BLOCK_SIZE = 4
@@ -615,24 +615,7 @@ class SpecDecodeBaseProposer(EagleProposer):
         if not self.parallel_drafting:
             for draft_step in range(1, self.num_speculative_tokens):
                 per_layer_attn_metadata = dict()
-                if vllm_version_is("0.17.0"):
-                    for attn_group in self.draft_attn_groups:
-                        common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
-                            draft_step,
-                            attn_metadata,
-                            common_attn_metadata,
-                            batch_size,
-                            num_input_tokens,
-                            used_update_positions,
-                            aclgraph_runtime_mode,
-                            ori_seq_len,
-                            slot_indices,
-                            mtp_slot_mapping,
-                            attn_group=attn_group,
-                        )
-                        for layer_name in self.attn_layer_names:
-                            per_layer_attn_metadata[layer_name] = attn_metadata
-                else:
+                for attn_group in self.draft_attn_groups:
                     common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
                         draft_step,
                         attn_metadata,
@@ -644,6 +627,7 @@ class SpecDecodeBaseProposer(EagleProposer):
                         ori_seq_len,
                         slot_indices,
                         mtp_slot_mapping,
+                        attn_group=attn_group,
                     )
                     for layer_name in self.attn_layer_names:
                         per_layer_attn_metadata[layer_name] = attn_metadata
@@ -653,21 +637,7 @@ class SpecDecodeBaseProposer(EagleProposer):
         if not self.parallel_drafting:
             for draft_step in range(1, self.num_speculative_tokens):
                 per_layer_attn_metadata = dict()
-                if vllm_version_is("0.17.0"):
-                    for attn_group in self.draft_attn_groups:
-                        common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
-                            draft_step,
-                            attn_metadata,
-                            common_attn_metadata,
-                            batch_size,
-                            num_input_tokens,
-                            used_update_positions,
-                            aclgraph_runtime_mode,
-                            attn_group=attn_group,
-                        )
-                        for layer_name in self.attn_layer_names:
-                            per_layer_attn_metadata[layer_name] = attn_metadata
-                else:
+                for attn_group in self.draft_attn_groups:
                     common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
                         draft_step,
                         attn_metadata,
@@ -676,6 +646,7 @@ class SpecDecodeBaseProposer(EagleProposer):
                         num_input_tokens,
                         used_update_positions,
                         aclgraph_runtime_mode,
+                        attn_group=attn_group,
                     )
                     for layer_name in self.attn_layer_names:
                         per_layer_attn_metadata[layer_name] = attn_metadata
@@ -1082,16 +1053,11 @@ class SpecDecodeBaseProposer(EagleProposer):
         # 2.
         # Recompute the slot mapping based on the new positions and
         # rejection mask.
-        if vllm_version_is("0.17.0"):
-            # Use the first draft attention group's kv_cache_spec for block_size
-            # (all draft layers share the same kv-cache group)
-            assert len(self.draft_attn_groups) > 0
-            block_size = self.draft_attn_groups[0].kv_cache_spec.block_size
-        else:
-            if self.attn_metadata_builder is None:
-                block_size = self._get_attention_metadata_builder().kv_cache_spec.block_size
-            else:
-                block_size = self.attn_metadata_builder.kv_cache_spec.block_size
+        # Use the first draft attention group's kv_cache_spec for block_size
+        # (all draft layers share the same kv-cache group)
+        assert len(self.draft_attn_groups) > 0
+        block_size = self.draft_attn_groups[0].kv_cache_spec.block_size
         new_slot_mapping = compute_new_slot_mapping(
             cad=cad,
             new_positions=self.positions[:total_num_output_tokens],
@@ -1130,8 +1096,7 @@ class SpecDecodeBaseProposer(EagleProposer):
         attn_group=None,
     ):
         assert draft_step > 0
-        if vllm_version_is("0.17.0"):
-            assert attn_group is not None, "vllm-ascend v0.17.0rc1 requires attn_group"
+        assert attn_group is not None, "vllm-ascend v0.17.0rc1 requires attn_group"
         common_attn_metadata = self.shallow_copy_metadata(old_common_metadata)
         if draft_step == 1:
@@ -1243,13 +1208,7 @@ class SpecDecodeBaseProposer(EagleProposer):
         # Set the address of the attn_metadata.slot_mapping to the self.slot_mapping_group[idx]
         common_attn_metadata.slot_mapping = self.slot_mapping_group[draft_step]
-        if vllm_version_is("0.17.0"):
-            attn_metadata_builder = attn_group.get_metadata_builder()
-        else:
-            if self.attn_metadata_builder is None:
-                attn_metadata_builder = self._get_attention_metadata_builder()
-            else:
-                attn_metadata_builder = self.attn_metadata_builder
+        attn_metadata_builder = attn_group.get_metadata_builder()
         attn_metadata = attn_metadata_builder.build_for_drafting(
             common_attn_metadata=common_attn_metadata,