upgrade vLLM to main (#4608)
1. fix https://github.com/vllm-project/vllm/pull/28542 The model structure modifications we are involved in are: - Qwen2.5-VL (some patches still exist) - Qwen2-VL - Qwen2 - DeepSeek series - Qwen-moe series 2. fix https://github.com/vllm-project/vllm/pull/29121 the output token type has now changed from np.ndarray to `list[list[int]]` 3. fix https://github.com/vllm-project/vllm/pull/29262 the `xformers` backend for multimodal has now been deprecated 4. fix https://github.com/vllm-project/vllm/pull/29342 5. fix https://github.com/vllm-project/vllm/pull/28579 6. fix https://github.com/vllm-project/vllm/pull/28718 7. fix https://github.com/vllm-project/vllm/issues/28665 8. fix https://github.com/vllm-project/vllm/pull/26847 vLLM introduced the `optimization-level`; some default configs have changed, and the param `--enforce-eager` has been deprecated 9. fix https://github.com/vllm-project/vllm/pull/29223 it now returns a tuple for the sampler. 10. fix https://github.com/vllm-project/vllm/pull/29471 we'll remove the related patch to avoid this kind of error. Co-authored-by: hfadzxy <starmoon_zhang@163.com> Co-authored-by: wangli <wangli858794774@gmail.com> - vLLM version: v0.11.2 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: hfadzxy <starmoon_zhang@163.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
@@ -7,7 +7,7 @@ import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from vllm.config import (CUDAGraphMode, VllmConfig,
|
||||
get_layers_from_vllm_config, set_current_vllm_config)
|
||||
from vllm.forward_context import BatchDescriptor, get_forward_context
|
||||
from vllm.forward_context import get_forward_context
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
|
||||
from vllm.model_executor.model_loader import get_model_loader
|
||||
@@ -314,8 +314,7 @@ class MtpProposer(Proposer):
|
||||
break
|
||||
|
||||
def generate_token_ids(self,
|
||||
sampled_token_ids: Union[torch.Tensor,
|
||||
list[np.ndarray]],
|
||||
sampled_token_ids: torch.Tensor | list[list[int]],
|
||||
sampling_metadata: SamplingMetadata = None,
|
||||
scheduler_output: SchedulerOutput = None,
|
||||
spec_decode_metadata: SpecDecodeMetadata = None,
|
||||
@@ -392,7 +391,6 @@ class MtpProposer(Proposer):
|
||||
common_attn_metadata.query_start_loc = \
|
||||
query_start_loc_pcp_full[:num_reqs + 1]
|
||||
if self.speculative_config.disable_padded_drafter_batch:
|
||||
assert isinstance(sampled_token_ids, list)
|
||||
# NOTE: Currently, MTP-fullgraph is incompatibility with pcp
|
||||
token_indices_to_sample = None
|
||||
common_attn_metadata, token_indices =\
|
||||
@@ -451,7 +449,7 @@ class MtpProposer(Proposer):
|
||||
def _prepare_inputs(
|
||||
self,
|
||||
common_attn_metadata: CommonAttentionMetadata,
|
||||
sampled_token_ids: list[np.ndarray],
|
||||
sampled_token_ids: list[list[int]],
|
||||
num_draft_tokens: list[int],
|
||||
) -> tuple[CommonAttentionMetadata, torch.Tensor]:
|
||||
"""
|
||||
@@ -695,13 +693,11 @@ class MtpProposer(Proposer):
|
||||
2))) and (scheduler_output.total_num_scheduled_tokens
|
||||
== self.runner.input_batch.num_reqs *
|
||||
(self.num_speculative_tokens + 1))
|
||||
batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
|
||||
uniform_decode=uniform_decode)
|
||||
else:
|
||||
batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
|
||||
uniform_decode=False)
|
||||
uniform_decode = False
|
||||
has_lora = len(self.runner.input_batch.lora_id_to_lora_request) > 0
|
||||
aclgraph_runtime_mode, batch_descriptor = \
|
||||
self.runner.aclgraph_dispatcher.dispatch(batch_descriptor)
|
||||
self.runner.aclgraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
|
||||
|
||||
if self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs(
|
||||
) and aclgraph_runtime_mode == CUDAGraphMode.FULL:
|
||||
@@ -929,7 +925,7 @@ class MtpProposer(Proposer):
|
||||
|
||||
def prepare_next_token_ids_cpu(
|
||||
self,
|
||||
sampled_token_ids: list[np.ndarray],
|
||||
sampled_token_ids: list[list[int]],
|
||||
requests: dict[str, CachedRequestState],
|
||||
gpu_input_batch: InputBatch,
|
||||
num_scheduled_tokens: dict[str, int],
|
||||
@@ -944,7 +940,7 @@ class MtpProposer(Proposer):
|
||||
req_ids = gpu_input_batch.req_ids
|
||||
next_token_ids: list[int] = []
|
||||
for i, token_ids in enumerate(sampled_token_ids):
|
||||
if token_ids.shape[0] > 0:
|
||||
if token_ids:
|
||||
# Common case.
|
||||
next_token_id = token_ids[-1]
|
||||
else:
|
||||
@@ -955,7 +951,7 @@ class MtpProposer(Proposer):
|
||||
seq_len = req_state.num_computed_tokens + num_scheduled_tokens[
|
||||
req_id]
|
||||
next_token_id = req_state.get_token_id(seq_len)
|
||||
next_token_ids.append(next_token_id.item())
|
||||
next_token_ids.append(next_token_id)
|
||||
next_token_ids = torch.tensor(next_token_ids,
|
||||
dtype=torch.int32,
|
||||
device=self.input_ids.device)
|
||||
|
||||
Reference in New Issue
Block a user