Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -126,6 +126,9 @@ def enable_allreduce_rms_fusion(cfg: "VllmConfig") -> bool:
         # tp-dp combination broken:
         # https://github.com/vllm-project/vllm/issues/34458
         and cfg.parallel_config.data_parallel_size == 1
+        # tp-pp combination broken:
+        # https://github.com/vllm-project/vllm/issues/35426
+        and cfg.parallel_config.pipeline_parallel_size == 1
     )
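For readers skimming the hunk: the fusion gate now requires both DP and PP to be 1, each tied to a tracked upstream breakage. A runnable sketch of the resulting predicate, with a minimal stand-in config class (the `tensor_parallel_size > 1` guard is an assumption, not shown in this hunk):

```python
# Sketch of the fusion gate after this hunk: allreduce+rms fusion stays off
# unless data and pipeline parallelism are both disabled.
from dataclasses import dataclass


@dataclass
class ParallelConfig:  # minimal stand-in for vLLM's parallel config
    tensor_parallel_size: int = 1
    data_parallel_size: int = 1
    pipeline_parallel_size: int = 1


def allreduce_rms_fusion_allowed(parallel: ParallelConfig) -> bool:
    return (
        parallel.tensor_parallel_size > 1  # assumed: fusion only matters with TP
        # tp-dp combination broken: vllm issue 34458
        and parallel.data_parallel_size == 1
        # tp-pp combination broken: vllm issue 35426
        and parallel.pipeline_parallel_size == 1
    )


print(allreduce_rms_fusion_allowed(ParallelConfig(tensor_parallel_size=4)))   # True
print(allreduce_rms_fusion_allowed(ParallelConfig(4, data_parallel_size=2)))  # False
```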
@@ -857,7 +860,7 @@ class VllmConfig:
             self.compilation_config.pass_config.fuse_gemm_comms = False
         else:
             # Compute SP threshold early; disable if None (model too
-            # small) before +rms_norm gets forced into custom_ops.
+            # small for SP to be beneficial).
             pass_config = self.compilation_config.pass_config
             if pass_config.sp_min_token_num is None:
                 from vllm.compilation.passes.fusion.sequence_parallelism import (
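The comment change is cosmetic; the behavior is that `sp_min_token_num` is resolved while it is still `None`, before downstream code depends on it. A hedged illustration of that None-means-auto resolution (the heuristic below is invented for the example; the real default comes from `vllm.compilation.passes.fusion.sequence_parallelism` and may differ):

```python
# Hypothetical illustration only: resolve an unset SP token threshold from
# model size. The real helper lives in
# vllm.compilation.passes.fusion.sequence_parallelism.
def resolve_sp_min_token_num(sp_min_token_num: int | None,
                             hidden_size: int, tp_size: int) -> int | None:
    if sp_min_token_num is not None:
        return sp_min_token_num  # user already chose a threshold
    if tp_size <= 1:
        return None  # SP needs tensor parallelism; None here means "disable SP"
    # Assumed heuristic: require enough tokens to amortize the extra
    # all-gather/reduce-scatter traffic that sequence parallelism adds.
    return max(128, 8192 // max(hidden_size // 1024, 1))


print(resolve_sp_min_token_num(None, hidden_size=4096, tp_size=4))  # 2048
```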
@@ -880,15 +883,13 @@ class VllmConfig:
                 self.compilation_config.pass_config.enable_sp = False
                 self.compilation_config.pass_config.fuse_gemm_comms = False
 
-        if self.compilation_config.pass_config.enable_sp:
-            if "-rms_norm" in self.compilation_config.custom_ops:
-                logger.warning(
-                    "RMS norm force disabled, sequence parallelism might break"
-                )
-            else:
-                self.compilation_config.custom_ops.append("+rms_norm")
-        elif self.compilation_config.fast_moe_cold_start is None:
+        from vllm.utils.torch_utils import HAS_OPAQUE_TYPE
+
+        if self.compilation_config.fast_moe_cold_start is None:
+            if HAS_OPAQUE_TYPE:
+                # On torch >= 2.11 the hoisted OpaqueObject approach supersedes
+                # fast_moe_cold_start, so force it off.
+                self.compilation_config.fast_moe_cold_start = False
             # resolve default behavior: try to be as safe as possible
             # this config is unsafe if any spec decoding draft model has a MOE.
             # We'll conservatively turn it off if we see spec decoding.
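The pattern here is the same None-as-auto resolution as above, now gated on a torch capability probe. A condensed sketch of the defaulting logic (the spec-decode rule is paraphrased from the comment in the diff; the "on by default otherwise" branch is an assumption):

```python
# Condensed sketch of the fast_moe_cold_start defaulting in this hunk.
# HAS_OPAQUE_TYPE comes from vllm.utils.torch_utils (torch >= 2.11).
def resolve_fast_moe_cold_start(current: bool | None,
                                has_opaque_type: bool,
                                uses_spec_decode: bool) -> bool:
    if has_opaque_type and current is None:
        # The hoisted OpaqueObject path supersedes fast_moe_cold_start.
        return False
    if current is None:
        # Safest default: off whenever spec decoding might add a MoE draft
        # model; assumed on otherwise, since it is a cold-start optimization.
        return not uses_spec_decode
    return current  # explicit user setting wins


print(resolve_fast_moe_cold_start(None, has_opaque_type=True, uses_spec_decode=False))   # False
print(resolve_fast_moe_cold_start(None, has_opaque_type=False, uses_spec_decode=False))  # True
```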
@@ -907,9 +908,9 @@ class VllmConfig:
         ):
             logger.warning_once(
                 "Pooling models do not support full cudagraphs. "
-                "Overriding cudagraph_mode to PIECEWISE."
+                "Overriding cudagraph_mode to NONE."
             )
-            self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
         elif (
             model_config.is_encoder_decoder
             and self.compilation_config.cudagraph_mode
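The overlay is stricter than the upstream fallback here: pooling models drop all the way to NONE rather than PIECEWISE. A small sketch of the distinction (illustrative enum; the real `CUDAGraphMode` lives in vLLM's config module and has more members, e.g. `FULL_DECODE_ONLY` seen in the next hunk's context):

```python
# Illustrative only: the capture strategies this override chooses between.
import enum


class CUDAGraphMode(enum.Enum):
    NONE = 0       # no CUDA graph capture; every step runs eagerly
    PIECEWISE = 1  # capture compiled pieces; attention stays outside the graph
    FULL = 2       # capture the whole forward pass


def mode_for_pooling_model(requested: CUDAGraphMode) -> CUDAGraphMode:
    # Upstream falls back to PIECEWISE; this overlay disables capture outright.
    if requested is CUDAGraphMode.FULL:
        return CUDAGraphMode.NONE
    return requested


print(mode_for_pooling_model(CUDAGraphMode.FULL))  # CUDAGraphMode.NONE
```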
@@ -924,6 +925,33 @@ class VllmConfig:
                     CUDAGraphMode.FULL_DECODE_ONLY
                 )
 
+        # Check if KV connector requires PIECEWISE mode for CUDA graphs
+        if (
+            self.kv_transfer_config is not None
+            and self.kv_transfer_config.is_kv_transfer_instance
+            and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+        ):
+            # Lazy import to avoid circular dependencies
+            from vllm.distributed.kv_transfer.kv_connector.factory import (
+                KVConnectorFactory,
+            )
+
+            connector_cls = KVConnectorFactory.get_connector_class(
+                self.kv_transfer_config
+            )
+            if connector_cls.requires_piecewise_for_cudagraph(
+                self.kv_transfer_config.kv_connector_extra_config
+            ):
+                logger.warning_once(
+                    "KV connector %s requires PIECEWISE CUDA graph mode "
+                    "due to layerwise async operations that cannot be "
+                    "captured in CUDA graphs. "
+                    "Overriding cudagraph_mode from %s to PIECEWISE.",
+                    connector_cls.__name__,
+                    self.compilation_config.cudagraph_mode.name,
+                )
+                self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+
         # disable cudagraph when enforce eager execution
         if self.model_config is not None and self.model_config.enforce_eager:
             logger.info("Cudagraph is disabled under eager mode")
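The new block queries a connector-side hook before allowing full graphs. A hedged sketch of a connector opting into the override (class and body are assumptions; only the hook name and its extra-config argument appear in the diff):

```python
# Hypothetical connector implementing the hook consulted above. A connector
# that moves KV layer by layer with async ops cannot have those transfers
# captured inside one full CUDA graph, so it requests PIECEWISE mode.
class LayerwiseAsyncConnector:
    @classmethod
    def requires_piecewise_for_cudagraph(cls, extra_config: dict) -> bool:
        # Assumed knob: let the deployment opt out via extra config.
        return bool(extra_config.get("layerwise_transfer", True))


print(LayerwiseAsyncConnector.requires_piecewise_for_cudagraph({}))                             # True
print(LayerwiseAsyncConnector.requires_piecewise_for_cudagraph({"layerwise_transfer": False}))  # False
```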
@@ -1113,6 +1141,20 @@ class VllmConfig:
 
         if not self.instance_id:
             self.instance_id = random_uuid()[:5]
 
+        def is_ixserver_connector(kv_transfer_config) -> bool:
+            if kv_transfer_config is not None and hasattr(
+                kv_transfer_config, "kv_connector"
+            ):
+                connector = kv_transfer_config.kv_connector
+                if isinstance(connector, str):
+                    connector_name = connector
+                else:
+                    connector_name = getattr(
+                        type(connector), "__name__", str(connector)
+                    )
+                return "IxServer" in connector_name
+            return False
+
         # Hybrid KV cache manager (HMA) runtime rules:
         # - Explicit enable (--no-disable-kv-cache-manager): error if runtime
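Worth noting why the helper branches on `isinstance(connector, str)`: `kv_connector` may hold either a connector name or an instantiated connector object, and both spellings should match by name. A quick usage check (helper copied from the hunk; the namespace objects stand in for a real kv_transfer_config, and `IxServerConnector` is hypothetical):

```python
from types import SimpleNamespace


def is_ixserver_connector(kv_transfer_config) -> bool:
    # Copied from the hunk above.
    if kv_transfer_config is not None and hasattr(kv_transfer_config, "kv_connector"):
        connector = kv_transfer_config.kv_connector
        if isinstance(connector, str):
            connector_name = connector
        else:
            connector_name = getattr(type(connector), "__name__", str(connector))
        return "IxServer" in connector_name
    return False


class IxServerConnector:  # hypothetical connector type for the demo
    pass


print(is_ixserver_connector(SimpleNamespace(kv_connector="IxServerConnector")))  # True
print(is_ixserver_connector(SimpleNamespace(kv_connector=IxServerConnector()))) # True
print(is_ixserver_connector(SimpleNamespace(kv_connector="NixlConnector")))     # False
print(is_ixserver_connector(None))                                              # False
```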
@@ -1154,21 +1196,29 @@ class VllmConfig:
         if self.scheduler_config.disable_hybrid_kv_cache_manager is None:
             # Default to disable HMA, but only if the user didn't express a preference.
             if self.kv_transfer_config is not None:
-                need_disable_hybrid_kv_cache_manager = True
-                logger.warning(
-                    "Turning off hybrid kv cache manager because "
-                    "`--kv-transfer-config` is set. This will reduce the "
-                    "performance of vLLM on LLMs with sliding window attention "
-                    "or Mamba attention. If you are a developer of kv connector"
-                    ", please consider supporting hybrid kv cache manager for "
-                    "your connector by making sure your connector is a subclass"
-                    " of `SupportsHMA` defined in kv_connector/v1/base.py and"
-                    " use --no-disable-hybrid-kv-cache-manager to start vLLM."
-                )
-            self.scheduler_config.disable_hybrid_kv_cache_manager = (
-                need_disable_hybrid_kv_cache_manager
-            )
+                if is_ixserver_connector(self.kv_transfer_config):
+                    pass
+                else:
+                    # NOTE(Kuntai): turn HMA off for connector unless specifically enabled.
+                    need_disable_hybrid_kv_cache_manager = True
+                    logger.warning(
+                        "Turning off hybrid kv cache manager because "
+                        "`--kv-transfer-config` is set. This will reduce the "
+                        "performance of vLLM on LLMs with sliding window attention "
+                        "or Mamba attention. If you are a developer of kv connector"
+                        ", please consider supporting hybrid kv cache manager for "
+                        "your connector by making sure your connector is a subclass"
+                        " of `SupportsHMA` defined in kv_connector/v1/base.py and"
+                        " use --no-disable-hybrid-kv-cache-manager to start vLLM."
+                    )
+                self.scheduler_config.disable_hybrid_kv_cache_manager = (
+                    need_disable_hybrid_kv_cache_manager
+                )
+            else:
+                self.scheduler_config.disable_hybrid_kv_cache_manager = (
+                    need_disable_hybrid_kv_cache_manager
+                )
         elif (
             self.scheduler_config.disable_hybrid_kv_cache_manager is False
             and need_disable_hybrid_kv_cache_manager
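Once the IxServer carve-out is added, the control flow above reduces to a small decision table. A condensed sketch (simplified: `need_disable_hybrid_kv_cache_manager` is assumed False absent other runtime conditions, and the real code also handles the conflict case where an explicit `--no-disable-hybrid-kv-cache-manager` meets a runtime that needs HMA off, per the `elif` at the end of the hunk):

```python
# Simplified restatement of the HMA default chosen in this hunk. user_pref is
# the tri-state CLI flag: None = no preference, True/False = explicit.
def resolve_disable_hma(user_pref: bool | None,
                        has_kv_transfer: bool,
                        is_ixserver: bool) -> bool:
    if user_pref is not None:
        return user_pref  # explicit preference wins (conflicts handled elsewhere)
    if has_kv_transfer and not is_ixserver:
        return True  # generic connectors: HMA off by default, with a warning
    return False  # no connector, or the IxServer connector: keep HMA on


print(resolve_disable_hma(None, has_kv_transfer=True, is_ixserver=False))   # True
print(resolve_disable_hma(None, has_kv_transfer=True, is_ixserver=True))    # False
print(resolve_disable_hma(False, has_kv_transfer=True, is_ixserver=False))  # False
```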
@@ -1466,22 +1516,22 @@ class VllmConfig:
             if compile_range_end is not None:
                 computed_compile_ranges_split_points.append(compile_range_end)
 
-        # # Add the compile ranges for flashinfer
-        # if compilation_config.pass_config.fuse_allreduce_rms:
-        #     tp_size = self.parallel_config.tensor_parallel_size
-        #     max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
-        #     if max_size is not None:
-        #         max_token_num = max_size // (
-        #             self.model_config.get_hidden_size()
-        #             * self.model_config.dtype.itemsize
-        #         )
-        #         if compile_range_end is not None and max_token_num < compile_range_end:
-        #             computed_compile_ranges_split_points.append(max_token_num)
-        #         else:
-        #             logger.debug(
-        #                 "Max num batched tokens below allreduce-rms fusion threshold, "
-        #                 "allreduce-rms fusion will be enabled for all num_tokens."
-        #             )
+        # Add the compile ranges for flashinfer
+        if compilation_config.pass_config.fuse_allreduce_rms:
+            tp_size = self.parallel_config.tensor_parallel_size
+            max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
+            if max_size is not None:
+                max_token_num = max_size // (
+                    self.model_config.get_hidden_size()
+                    * self.model_config.dtype.itemsize
+                )
+                if compile_range_end is not None and max_token_num < compile_range_end:
+                    computed_compile_ranges_split_points.append(max_token_num)
+                else:
+                    logger.debug(
+                        "Max num batched tokens below allreduce-rms fusion threshold, "
+                        "allreduce-rms fusion will be enabled for all num_tokens."
+                    )
 
         # Add the compile ranges for sequence parallelism
         if compilation_config.pass_config.enable_sp:
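The byte-budget-to-token conversion in the re-enabled block is easy to sanity-check by hand. Illustrative numbers below; the actual budget comes from `flashinfer_max_size(tp_size)`:

```python
# Worked example of max_token_num = max_size // (hidden_size * itemsize).
hidden_size = 4096           # model hidden dimension
itemsize = 2                 # bytes per element for bf16/fp16
max_size = 64 * 1024 * 1024  # assumed 64 MiB fusion budget for this TP size

bytes_per_token = hidden_size * itemsize      # 8192 bytes
max_token_num = max_size // bytes_per_token   # 8192 tokens
print(max_token_num)

# If the compile range ends above this, 8192 becomes an extra split point so
# the fused allreduce+rms kernel only serves batches under the cap.
```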
@@ -1618,6 +1668,7 @@ class VllmConfig:
             f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, "  # noqa
             f"data_parallel_size={self.parallel_config.data_parallel_size}, "  # noqa
             f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, "  # noqa
             f"quantization={self.model_config.quantization}, "
             f"enforce_eager={self.model_config.enforce_eager}, "
+            f"enable_return_routed_experts={self.model_config.enable_return_routed_experts}, "  # noqa
             f"kv_cache_dtype={self.cache_config.cache_dtype}, "