upgrade vLLM to main (#4608)

1. fix https://github.com/vllm-project/vllm/pull/28542
The model-structure changes that affect our patches involve:
     - Qwen2.5-VL (some patches still remain)
     - Qwen2-VL
     - Qwen2
     - DeepSeek series
     - Qwen-MoE series
2. fix https://github.com/vllm-project/vllm/pull/29121
   The sampled output token type has changed from a NumPy array to `list[list[int]]` (see the first code sketch after this list).

3. fix https://github.com/vllm-project/vllm/pull/29262
    The `xformers` backend for multimodal models has been deprecated.
4. fix https://github.com/vllm-project/vllm/pull/29342

5. fix https://github.com/vllm-project/vllm/pull/28579
6. fix https://github.com/vllm-project/vllm/pull/28718
7. fix https://github.com/vllm-project/vllm/issues/28665
8. fix https://github.com/vllm-project/vllm/pull/26847
vLLM introduced the `optimization-level` option; some default configs have
changed, and the `--enforce-eager` parameter has been deprecated (see the second code sketch after this list).
9. fix http://github.com/vllm-project/vllm/pull/29223
The sampler now returns a tuple (see the third code sketch after this list).
10. fix https://github.com/vllm-project/vllm/pull/29471
We'll remove the related patch to avoid this kind of error.
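
A minimal sketch of the type change from item 2 (https://github.com/vllm-project/vllm/pull/29121), mirroring the runner diff below; it is illustrative only, not the vllm-ascend implementation:

```python
# Sampled token IDs are now handled as list[list[int]] instead of a list of
# NumPy arrays; invalid rows are cleared in place rather than being replaced
# with np.array([]).
import torch

sampled_token_ids = torch.tensor([[11], [22], [33]])  # one sampled token per request
invalid_req_indices = [1]                              # e.g. a request discarded by async scheduling

# Old style: valid = [row for row in sampled_token_ids.cpu().numpy()]
# New style: plain Python lists.
valid: list[list[int]] = sampled_token_ids.tolist()
for i in invalid_req_indices:
    valid[i].clear()

assert valid == [[11], [], [33]]
```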
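
For item 8 (https://github.com/vllm-project/vllm/pull/26847), a hedged sketch of the migration on the offline API; the `compilation_config` spelling and the `{"mode": 0}` value are assumptions based on the `CompilationMode` import visible in the diff below, not verified against vLLM main:

```python
from vllm import LLM

# Old style (deprecated per the PR): force eager execution, skipping graph
# capture and compilation.
# llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", enforce_eager=True)

# New style (assumed): express the same intent via the lowest optimization
# level instead of the dedicated flag.
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", compilation_config={"mode": 0})
```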
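
And for item 9 (https://github.com/vllm-project/vllm/pull/29223), a minimal sketch of the call-site update; `parse_output` and its arguments are taken from the diff below, and the second tuple element is simply discarded here:

```python
import torch


def get_valid_sampled_token_ids(rejection_sampler, sampled_token_ids: torch.Tensor,
                                vocab_size: int) -> list[list[int]]:
    # Old style: a single return value.
    # valid_sampled_token_ids = rejection_sampler.parse_output(
    #     sampled_token_ids, vocab_size)

    # New style: the sampler returns a tuple; keep the token IDs and drop the
    # extra element.
    valid_sampled_token_ids, _ = rejection_sampler.parse_output(
        sampled_token_ids, vocab_size)
    return valid_sampled_token_ids
```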

Co-authored-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: wangli <wangli858794774@gmail.com>


- vLLM version: v0.11.2

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>

@@ -39,9 +39,9 @@ import torch._dynamo.cache_size
 import torch.distributed as dist
 import torch.nn as nn
 from tqdm import tqdm  # type: ignore
-from vllm.attention import AttentionType, get_attn_backend
-from vllm.attention.backends.abstract import AttentionBackend
+from vllm.attention.backends.abstract import AttentionBackend, AttentionType
 from vllm.attention.layer import Attention, MLAAttention
+from vllm.attention.selector import get_attn_backend
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.monitor import set_cudagraph_capturing_enabled
 from vllm.config import (CompilationMode, CUDAGraphMode, VllmConfig,
@@ -53,7 +53,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
 from vllm.distributed.parallel_state import (get_dcp_group, get_dp_group,
                                              get_pp_group, get_tp_group,
                                              is_global_first_rank)
-from vllm.forward_context import BatchDescriptor, get_forward_context
+from vllm.forward_context import get_forward_context
 from vllm.logger import logger
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.layers.mamba.abstract import MambaBase
@@ -244,11 +244,9 @@ class AsyncNPUModelRunnerOutput(AsyncModelRunnerOutput):
         # Release the device tensor once the copy has completed
         del self._sampled_token_ids

-        valid_sampled_token_ids: list[np.ndarray] = [
-            row for row in self._sampled_token_ids_cpu.numpy()
-        ]
+        valid_sampled_token_ids = self._sampled_token_ids_cpu.tolist()
         for i in self._invalid_req_indices:
-            valid_sampled_token_ids[i] = np.array([])
+            valid_sampled_token_ids[i].clear()

         output = self._model_runner_output
         output.sampled_token_ids = valid_sampled_token_ids
@@ -332,7 +330,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         # Ascend-specific configurations
         self.ascend_config = get_ascend_config()
         if self.ascend_config.ascend_scheduler_config.enabled:
-            self.chunked_prefill_enabled = self.scheduler_config.chunked_prefill_enabled
+            self.chunked_prefill_enabled = self.scheduler_config.enable_chunked_prefill
         else:
             self.chunked_prefill_enabled = True
         self.weight_prefetch_method = WeightPrefetchMethod(
@@ -2130,7 +2128,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):

     def propose_draft_token_ids(
         self,
-        valid_sampled_token_ids: Union[torch.Tensor, list[np.ndarray]],
+        valid_sampled_token_ids: torch.Tensor | list[list[int]],
         sampling_metadata: SamplingMetadata,
         scheduler_output: "SchedulerOutput",
         spec_decode_metadata: SpecDecodeMetadata,
@@ -2309,10 +2307,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         uniform_decode = (max_query_len == self.uniform_decode_query_len) and (
             scheduler_output.total_num_scheduled_tokens
             == self.input_batch.num_reqs * max_query_len)
-        batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
-                                           uniform_decode=uniform_decode)
+        has_lora = len(self.input_batch.lora_id_to_lora_request) > 0
         aclgraph_runtime_mode, batch_descriptor = \
-            self.aclgraph_dispatcher.dispatch(batch_descriptor)
+            self.aclgraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)

         # Run forward pass
         with ProfileExecuteDuration().capture_async("forward"):
@@ -2510,18 +2507,16 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             max_gen_len = sampled_token_ids.shape[-1]
             if max_gen_len == 1:
                 # No spec decode tokens. It's a tensor.
-                valid_sampled_token_ids: list[np.ndarray] = [
-                    row for row in sampled_token_ids.cpu().numpy()
-                ]
+                valid_sampled_token_ids = sampled_token_ids.tolist()
             else:
                 # Includes spec decode tokens. It's a numpy array
-                valid_sampled_token_ids = self.rejection_sampler.parse_output(
+                valid_sampled_token_ids, _ = self.rejection_sampler.parse_output(
                     sampled_token_ids,
                     self.input_batch.vocab_size,
                 )
             # Mask out the sampled tokens that should not be sampled.
             for i in discard_sampled_tokens_req_indices:
-                valid_sampled_token_ids[int(i)] = np.array([])
+                valid_sampled_token_ids[int(i)].clear()
         else:
             valid_sampled_token_ids = []
             invalid_req_indices = discard_sampled_tokens_req_indices.tolist(
@@ -2547,17 +2542,16 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             # the sampled tokens back, because there's no direct communication
             # between the first-stage worker and the last-stage worker.
             for req_idx in range(num_sampled_tokens):
-                sampled_ids: np.ndarray | None
                 if self.use_async_scheduling:
-                    sampled_ids = (np.array([-1]) if req_idx
-                                   not in invalid_req_indices_set else None)
+                    sampled_ids = [-1] * 1 if \
+                        req_idx not in invalid_req_indices_set else None
                 else:
                     sampled_ids = valid_sampled_token_ids[req_idx]
-                if sampled_ids is None or sampled_ids.shape[0] == 0:
+                if not sampled_ids:
                     continue

                 start_idx = self.input_batch.num_tokens_no_spec[req_idx]
-                end_idx = start_idx + sampled_ids.shape[0]
+                end_idx = start_idx + len(sampled_ids)
                 assert end_idx <= self.model_config.max_model_len, (
                     "Sampled token IDs exceed the max model length. "
                     f"Total number of tokens: {end_idx} > max_model_len: "
@@ -2571,7 +2565,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 self.input_batch.num_tokens[req_idx] = end_idx
                 req_id = self.input_batch.req_ids[req_idx]
                 req_state = self.requests[req_id]
-                req_state.output_token_ids.extend(sampled_ids.tolist())
+                req_state.output_token_ids.extend(sampled_ids)

         def propose_draft_token_ids(sampled_token_ids):
             assert self.spec_decode_common_attn_metadata is not None
@@ -2877,7 +2871,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         assert aclgraph_runtime_mode is None or aclgraph_runtime_mode in {
             CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL
         }
-
         # In multi-DP scenarios, there may be situations where all DP groups are executing dummy runs.
         # If sequence parallelism is enabled, it is essential to ensure that num_tokens is divisible by tp_size.
         if self.use_aclgraph and enable_sp(self.vllm_config):
@@ -2971,19 +2964,18 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                     k: v[:num_tokens]
                     for k, v in self.intermediate_tensors.items()
                 })

+            has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
             # filter out the valid batch descriptor
             _ag_mode, batch_descriptor = \
-                self.aclgraph_dispatcher.dispatch(
-                    BatchDescriptor(num_tokens=num_tokens,
-                                    uniform_decode=uniform_decode))
+                self.aclgraph_dispatcher.dispatch(num_tokens=num_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
             if aclgraph_runtime_mode is not None:
                 # we allow forcing NONE when the dispatcher disagrees to support
                 # warm ups for aclgraph capture
-                assert aclgraph_runtime_mode == CUDAGraphMode.NONE or \
-                    aclgraph_runtime_mode == _ag_mode, (
-                    f"Aclgraph runtime mode mismatch at dummy_run. "
-                    f"Expected {_ag_mode}, but got {aclgraph_runtime_mode}.")
+                if aclgraph_runtime_mode != CUDAGraphMode.NONE and aclgraph_runtime_mode != _ag_mode:
+                    raise ValueError(
+                        f"Aclgraph runtime mode mismatch at dummy_run. "
+                        f"Expected {_ag_mode}, but got {aclgraph_runtime_mode}."
+                    )
             else:
                 aclgraph_runtime_mode = _ag_mode
@@ -4466,18 +4458,3 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             self.input_ids_pcp_full_cpu[:total_num_scheduled_tokens_pcp_full],
             non_blocking=True,
         )
-
-    def _to_list(self, sampled_token_ids: torch.Tensor) -> list[np.ndarray]:
-        # This is a short term mitigation for issue mentioned in
-        # https://github.com/vllm-project/vllm/issues/22754.
-        # `tolist` would trigger a cuda wise stream sync, which
-        # would block other copy ops from other cuda streams.
-        # A cuda event sync would avoid such a situation. Since
-        # this is in the critical path of every single model
-        # forward loop, this has caused perf issue for a disagg
-        # setup.
-        pinned = self.sampled_token_ids_pinned_cpu[:sampled_token_ids.shape[0]]
-        pinned.copy_(sampled_token_ids, non_blocking=True)
-        self.transfer_event.record()
-        self.transfer_event.synchronize()
-        return [row for row in pinned.numpy()]