upgrade vLLM to main (#4608)

1. fix https://github.com/vllm-project/vllm/pull/28542
   The model structure changes we had to adapt to cover:
     - Qwen2.5-VL (some patches still remain)
     - Qwen2-VL
     - Qwen2
     - DeepSeek series
     - Qwen-MoE series
2. fix https://github.com/vllm-project/vllm/pull/29121
   The output token type has changed from numpy arrays to `list[list[int]]`
   (see the sketch after this list).

3. fix https://github.com/vllm-project/vllm/pull/29262
   The `xformers` backend for multimodal models has been deprecated.
4. fix https://github.com/vllm-project/vllm/pull/29342

5. fix https://github.com/vllm-project/vllm/pull/28579
6. fix https://github.com/vllm-project/vllm/pull/28718
7. fix https://github.com/vllm-project/vllm/issues/28665
8. fix https://github.com/vllm-project/vllm/pull/26847
   vLLM introduced the `optimization-level` option; some default configs have
   changed, and the `--enforce-eager` parameter has been deprecated.
9. fix https://github.com/vllm-project/vllm/pull/29223
   The sampler now returns a tuple.
10. fix https://github.com/vllm-project/vllm/pull/29471
    We remove the related patch to avoid this kind of error.
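
For item 2, a minimal sketch of what the new layout looks like to a consumer:
`sampled_token_ids` now arrives as plain Python `list[list[int]]` rather than a
list of numpy arrays, so emptiness is checked by truthiness and no `.item()`
conversion is needed. The helper name and the `fallback_ids` argument below are
illustrative only (not code from this PR); it mirrors the
`prepare_next_token_ids_cpu` change in the diff below.

```python
# Illustrative only: pick the last sampled token per request with the
# new list[list[int]] layout (previously list[np.ndarray]).
def last_sampled_tokens(sampled_token_ids: list[list[int]],
                        fallback_ids: list[int]) -> list[int]:
    next_token_ids: list[int] = []
    for i, token_ids in enumerate(sampled_token_ids):
        if token_ids:
            # Common case: the inner list is non-empty and its entries are
            # already Python ints, so no .item() is required.
            next_token_ids.append(token_ids[-1])
        else:
            # Nothing sampled for this request (e.g. partial prefill);
            # fall back to a caller-provided token id.
            next_token_ids.append(fallback_ids[i])
    return next_token_ids


# [[5, 7], [], [42]] with fallbacks [0, 1, 2] -> [7, 1, 42]
print(last_sampled_tokens([[5, 7], [], [42]], [0, 1, 2]))
```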

- vLLM version: v0.11.2

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>

@@ -7,7 +7,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from vllm.config import (CUDAGraphMode, VllmConfig,
                          get_layers_from_vllm_config, set_current_vllm_config)
-from vllm.forward_context import BatchDescriptor, get_forward_context
+from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.model_loader import get_model_loader
@@ -314,8 +314,7 @@ class MtpProposer(Proposer):
                 break
 
     def generate_token_ids(self,
-                           sampled_token_ids: Union[torch.Tensor,
-                                                     list[np.ndarray]],
+                           sampled_token_ids: torch.Tensor | list[list[int]],
                            sampling_metadata: SamplingMetadata = None,
                            scheduler_output: SchedulerOutput = None,
                            spec_decode_metadata: SpecDecodeMetadata = None,
@@ -392,7 +391,6 @@ class MtpProposer(Proposer):
             common_attn_metadata.query_start_loc = \
                 query_start_loc_pcp_full[:num_reqs + 1]
         if self.speculative_config.disable_padded_drafter_batch:
-            assert isinstance(sampled_token_ids, list)
             # NOTE: Currently, MTP-fullgraph is incompatibility with pcp
             token_indices_to_sample = None
             common_attn_metadata, token_indices =\
@@ -451,7 +449,7 @@ class MtpProposer(Proposer):
     def _prepare_inputs(
         self,
         common_attn_metadata: CommonAttentionMetadata,
-        sampled_token_ids: list[np.ndarray],
+        sampled_token_ids: list[list[int]],
         num_draft_tokens: list[int],
     ) -> tuple[CommonAttentionMetadata, torch.Tensor]:
         """
@@ -695,13 +693,11 @@ class MtpProposer(Proposer):
                          2))) and (scheduler_output.total_num_scheduled_tokens
                                    == self.runner.input_batch.num_reqs *
                                    (self.num_speculative_tokens + 1))
-            batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
-                                               uniform_decode=uniform_decode)
         else:
-            batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
-                                               uniform_decode=False)
+            uniform_decode = False
+        has_lora = len(self.runner.input_batch.lora_id_to_lora_request) > 0
 
         aclgraph_runtime_mode, batch_descriptor = \
-            self.runner.aclgraph_dispatcher.dispatch(batch_descriptor)
+            self.runner.aclgraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
         if self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs(
         ) and aclgraph_runtime_mode == CUDAGraphMode.FULL:
@@ -929,7 +925,7 @@ class MtpProposer(Proposer):
     def prepare_next_token_ids_cpu(
         self,
-        sampled_token_ids: list[np.ndarray],
+        sampled_token_ids: list[list[int]],
         requests: dict[str, CachedRequestState],
         gpu_input_batch: InputBatch,
         num_scheduled_tokens: dict[str, int],
@@ -944,7 +940,7 @@ class MtpProposer(Proposer):
         req_ids = gpu_input_batch.req_ids
         next_token_ids: list[int] = []
         for i, token_ids in enumerate(sampled_token_ids):
-            if token_ids.shape[0] > 0:
+            if token_ids:
                 # Common case.
                 next_token_id = token_ids[-1]
             else:
@@ -955,7 +951,7 @@ class MtpProposer(Proposer):
                 seq_len = req_state.num_computed_tokens + num_scheduled_tokens[
                     req_id]
                 next_token_id = req_state.get_token_id(seq_len)
-            next_token_ids.append(next_token_id.item())
+            next_token_ids.append(next_token_id)
         next_token_ids = torch.tensor(next_token_ids,
                                       dtype=torch.int32,
                                       device=self.input_ids.device)