[main2main] upgrade vllm to 0308 (#7213)

### What this PR does / why we need it?
Update main2main to vLLM 0308.

Breaking upstream changes handled in this PR:

* https://github.com/vllm-project/vllm/pull/30681
* https://github.com/vllm-project/vllm/pull/35552 removes `self.cudagraph_batch_sizes`
* https://github.com/vllm-project/vllm/pull/35158 `clear_metadata` -> `defer_finalize` (sketched below)
* https://github.com/vllm-project/vllm/pull/36006 removes `CacheConfig.cpu_offload_gb`
* https://github.com/vllm-project/vllm/pull/35472
* https://github.com/vllm-project/vllm/pull/34552 `attn_metadata_builder`
* https://github.com/vllm-project/vllm/pull/30515 `profile_seq_lens` (sketched below)
* https://github.com/vllm-project/vllm/pull/28053
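
Most of the adaptation follows one pattern: branch on the installed vLLM version with `vllm_version_is("0.17.0")` and pick the old or the new upstream API. A minimal, illustrative sketch of the two kwarg/attribute cases above; the standalone helper functions here are for orientation only and are not part of this change, while `vllm_version_is` and the attribute paths mirror the diff below:

```python
# Illustrative sketch only; mirrors the version gating applied in the diff below.
from vllm_ascend.utils import vllm_version_is  # existing helper (import path assumed)


def kv_connector_kwargs(clear_kv_metadata: bool) -> dict:
    """Build kwargs for maybe_get_kv_connector_output() across vLLM versions.

    vllm#35158 replaces clear_metadata with defer_finalize, and the boolean
    meaning is inverted in the process (see the @@ -1376 hunk below).
    """
    if vllm_version_is("0.17.0"):
        return {"clear_metadata": clear_kv_metadata}
    return {"defer_finalize": not clear_kv_metadata}


def cpu_offload_gb(runner) -> float:
    """Read the CPU-offload size across vLLM versions.

    vllm#36006 removes CacheConfig.cpu_offload_gb; newer main reads it from
    the offload config instead (see the input-batch hunk below).
    """
    if vllm_version_is("0.17.0"):
        return runner.cache_config.cpu_offload_gb
    return runner.offload_config.uva.cpu_offload_gb
```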
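
The new `profile_seq_lens` argument from vllm#30515 lets a profiling run pin the sequence length instead of deriving it from the graph-capture heuristic. A condensed, self-contained restatement of the logic in the @@ -2359 hunk below (the function name and parameter list here are illustrative, not the real method signature):

```python
# Condensed restatement of the @@ -2359 hunk below, for orientation only.
def resolve_profile_seq_lens(
    profile_seq_lens: int | None,
    is_graph_capturing: bool,
    uses_paged_attention: bool,
    max_query_len: int,
    seq_len_with_max_pa_workspace: int,
) -> int:
    """An explicit profile_seq_lens overrides the capture-time heuristic."""
    if profile_seq_lens is not None:
        return profile_seq_lens
    if is_graph_capturing and uses_paged_attention:
        return seq_len_with_max_pa_workspace
    return max_query_len
```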

- vLLM version: v0.16.0
- vLLM main: 4034c3d32e

---------

Signed-off-by: MrZ20 <2609716663@qq.com>
Signed-off-by: menogrey <1299267905@qq.com>
Co-authored-by: MrZ20 <2609716663@qq.com>
Authored by zhangyiming on 2026-03-18 09:24:43 +08:00; committed by GitHub
parent 79ef41a53d
commit 1c954ff264
16 changed files with 223 additions and 168 deletions


@@ -412,15 +412,14 @@ class NPUModelRunner(GPUModelRunner):
         self.cpu_slot_mapping = None
         self.sampling_done_event: torch.npu.Event | None = None
-        if vllm_version_is("0.17.0"):
-            # self.cudagraph_batch_sizes sorts in ascending order.
-            if (
-                self.compilation_config.cudagraph_capture_sizes
-                and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-            ):
-                self.cudagraph_batch_sizes = sorted(self.compilation_config.cudagraph_capture_sizes)
-            else:
-                self.cudagraph_batch_sizes = []
+        # self.cudagraph_batch_sizes sorts in ascending order.
+        if (
+            self.compilation_config.cudagraph_capture_sizes
+            and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+        ):
+            self.cudagraph_batch_sizes = sorted(self.compilation_config.cudagraph_capture_sizes)
+        else:
+            self.cudagraph_batch_sizes = []
         self.mamba_state_idx: dict[str, int] = {}
         self._mamba_copy_bufs: mamba_utils.MambaCopyBuffers | None = None
@@ -1376,7 +1375,12 @@ class NPUModelRunner(GPUModelRunner):
                 skip_compiled=has_encoder_input,
             ),
             self.maybe_get_kv_connector_output(
-                scheduler_output, clear_metadata=clear_kv_metadata
+                scheduler_output,
+                **(
+                    {"clear_metadata": clear_kv_metadata}
+                    if vllm_version_is("0.17.0")
+                    else {"defer_finalize": not clear_kv_metadata}
+                ),
             ) as kv_connector_output,
         ):
             hidden_states = self._model_forward(
@@ -2253,6 +2257,7 @@ class NPUModelRunner(GPUModelRunner):
         remove_lora: bool = True,
         is_graph_capturing: bool = False,
         num_active_loras: int = 0,
+        profile_seq_lens: int | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # only support eager mode and piecewise graph now
         assert cudagraph_runtime_mode is None or cudagraph_runtime_mode.valid_runtime_modes()
@@ -2359,11 +2364,14 @@ class NPUModelRunner(GPUModelRunner):
         # seq_lens. We use this seq_len only when capturing graph, and still use max_query_len
         # in inference. This will be removed once npu_fused_infer_attention_score
         # outperforms _npu_paged_attention on all cases.
-        seq_lens = (
-            SEQ_LEN_WITH_MAX_PA_WORKSPACE
-            if is_graph_capturing and using_paged_attention(num_tokens, self.vllm_config)
-            else max_query_len
-        ) # type: ignore[assignment]
+        if profile_seq_lens is not None:
+            seq_lens = profile_seq_lens
+        else:
+            seq_lens = (
+                SEQ_LEN_WITH_MAX_PA_WORKSPACE
+                if is_graph_capturing and using_paged_attention(num_tokens, self.vllm_config)
+                else max_query_len
+            ) # type: ignore[assignment]
         self.seq_lens.np[:num_reqs_padded] = seq_lens
         self.seq_lens.np[num_reqs_padded:] = 0
         self.seq_lens.copy_to_gpu()
@@ -2579,14 +2587,13 @@ class NPUModelRunner(GPUModelRunner):
         self.may_reinitialize_input_batch(kv_cache_config)
         kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
-        if vllm_version_is("0.17.0"):
-            # TODO: refactor the logic of attention
-            # Initialize drafter attention group initialization
-            if self.speculative_config and (
-                self.speculative_config.use_eagle() or self.speculative_config.uses_draft_model()
-            ):
-                assert isinstance(self.drafter, AscendEagleProposer | AscendDraftModelProposer)
-                self.drafter.initialize_attn_backend(kv_cache_config, self.kernel_block_sizes)
+        # TODO: refactor the logic of attention
+        # Initialize drafter attention group initialization
+        if self.speculative_config and (
+            self.speculative_config.use_eagle() or self.speculative_config.uses_draft_model()
+        ):
+            assert isinstance(self.drafter, AscendEagleProposer | AscendDraftModelProposer)
+            self.drafter.initialize_attn_backend(kv_cache_config, self.kernel_block_sizes)
         if has_kv_transfer_group():
             get_kv_transfer_group().register_kv_caches(kv_caches)
@@ -3031,11 +3038,18 @@ class NPUModelRunner(GPUModelRunner):
             max_num_blocks.append(max_num_blocks_per_req)
         if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
-            assert self.cache_config.cpu_offload_gb == 0, (
-                "Cannot re-initialize the input batch when CPU weight "
-                "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
-                "for more details."
-            )
+            if vllm_version_is("0.17.0"):
+                assert self.cache_config.cpu_offload_gb == 0, (
+                    "Cannot re-initialize the input batch when CPU weight "
+                    "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
+                    "for more details."
+                )
+            else:
+                assert self.offload_config.uva.cpu_offload_gb == 0, (
+                    "Cannot re-initialize the input batch when CPU weight "
+                    "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
+                    "for more details."
+                )
             self.input_batch = NPUInputBatch(
                 max_num_reqs=self.max_num_reqs,
                 max_model_len=max_model_len,