Upgrade to vllm 0.17.0 corex v4.1 overlay

This commit is contained in:
2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions

View File

@@ -86,10 +86,7 @@ def cuda_platform_plugin() -> str | None:
pynvml.nvmlShutdown()
except Exception as e:
logger.debug("Exception happens when checking CUDA platform: %s", str(e))
nvml_related_error = (
"nvml" in e.__class__.__name__.lower() or "nvml" in str(e).lower()
)
if not nvml_related_error:
if "nvml" not in e.__class__.__name__.lower():
# If the error is not related to NVML, re-raise it.
raise e
@@ -105,40 +102,7 @@ def cuda_platform_plugin() -> str | None:
logger.debug("Confirmed CUDA platform is available on Jetson.")
is_cuda = True
else:
try:
import torch
if vllm_version_matches_substr("cpu"):
logger.debug(
"CUDA platform torch fallback is skipped because "
"vLLM is built with CPU."
)
elif not torch.cuda._is_compiled():
logger.debug(
"CUDA platform torch fallback reports CUDA is not compiled."
)
elif not torch.cuda.is_available():
logger.debug(
"CUDA platform torch fallback reports "
"torch.cuda.is_available()=False."
)
elif torch.cuda.device_count() <= 0:
logger.debug(
"CUDA platform torch fallback reports no GPU is found."
)
else:
logger.debug(
"Confirmed CUDA platform is available via torch fallback "
"after NVML check failed."
)
is_cuda = True
except Exception as torch_e:
logger.debug(
"CUDA platform torch fallback failed because: %s", str(torch_e)
)
if not is_cuda:
logger.debug("CUDA platform is not available because: %s", str(e))
logger.debug("CUDA platform is not available because: %s", str(e))
return "vllm.platforms.cuda.CudaPlatform" if is_cuda else None

View File

@@ -483,3 +483,27 @@ class CpuPlatform(Platform):
@classmethod
def support_hybrid_kv_cache(cls) -> bool:
    """Report that the CPU platform can serve hybrid KV-cache models."""
    return True
@classmethod
def import_kernels(cls) -> None:
    """Load the compiled CPU kernel extension for this host.

    x86 hosts with AVX-512 support (and all non-x86 hosts) import
    ``vllm._C``; x86 hosts without AVX-512 fall back to ``vllm._C_AVX2``.
    Import failures are logged as warnings rather than raised.
    """
    on_x86 = Platform.get_cpu_architecture() in (CpuArchEnum.X86,)
    if on_x86 and not torch._C._cpu._is_avx512_supported():
        # NOTE: the shared library is named _C_AVX2 but the module it
        # exports is _C, so this import reportedly raises ("dynamic module
        # does not define module export function") even though the library
        # itself loads. Tolerate the failure for now, until a proper fix
        # for the lib/module name mismatch is found.
        try:
            import vllm._C_AVX2  # noqa: F401
        except ImportError as e:
            logger.warning("Failed to import from vllm._C_AVX2: %r", e)
    else:
        try:
            import vllm._C  # noqa: F401
        except ImportError as e:
            logger.warning("Failed to import from vllm._C: %r", e)

View File

@@ -6,10 +6,13 @@ pynvml. However, it should not initialize cuda context.
import os
from collections.abc import Callable
from datetime import timedelta
from functools import cache, wraps
from typing import TYPE_CHECKING, TypeVar
import torch
from torch.distributed import PrefixStore, ProcessGroup
from torch.distributed.distributed_c10d import is_nccl_available
from typing_extensions import ParamSpec
# import custom ops, trigger op registration
@@ -414,6 +417,7 @@ class CudaPlatformBase(Platform):
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.TRITON_ATTN,
AttentionBackendEnum.TORCH_SDPA,
AttentionBackendEnum.FLASHINFER,
]
@classmethod
@@ -481,6 +485,37 @@ class CudaPlatformBase(Platform):
def get_static_graph_wrapper_cls(cls) -> str:
return "vllm.compilation.cuda_graph.CUDAGraphWrapper"
@classmethod
def stateless_init_device_torch_dist_pg(
    cls,
    backend: str,
    prefix_store: PrefixStore,
    group_rank: int,
    group_size: int,
    timeout: timedelta,
) -> ProcessGroup:
    """Create a standalone NCCL ProcessGroup without touching torch's
    default (global) process-group state.

    Args:
        backend: requested backend name. NOTE(review): currently unused —
            NCCL is assumed unconditionally (see the assert below).
        prefix_store: rendezvous store shared by all ranks of the group.
        group_rank: this process's rank within the new group.
        group_size: total number of ranks in the new group.
        timeout: collective timeout applied to the NCCL backend options.

    Returns:
        A ProcessGroup whose default backend is NCCL on device "cuda".
    """
    assert is_nccl_available()
    # Shell ProcessGroup; the concrete NCCL backend is attached below.
    pg: ProcessGroup = ProcessGroup(
        prefix_store,
        group_rank,
        group_size,
    )
    from torch.distributed.distributed_c10d import ProcessGroupNCCL

    backend_options = ProcessGroupNCCL.Options()
    # _timeout is a private field of the options object; torch has no
    # public setter for per-group timeouts on this code path.
    backend_options._timeout = timeout
    backend_class = ProcessGroupNCCL(
        prefix_store, group_rank, group_size, backend_options
    )
    backend_type = ProcessGroup.BackendType.NCCL
    device = torch.device("cuda")
    pg._set_default_backend(backend_type)
    # Align the backend's collective sequence number with a fresh group
    # before registering it on the CUDA device.
    backend_class._set_sequence_number_for_group()
    pg._register_backend(device, backend_type, backend_class)
    return pg
@classmethod
def device_count(cls) -> int:
    """Number of visible CUDA devices, counted without creating a context."""
    return cuda_device_count_stateless()
@@ -556,7 +591,7 @@ class NvmlCudaPlatform(CudaPlatformBase):
physical_device_id = cls.device_id_to_physical_device_id(device_id)
handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
return DeviceCapability(major=major, minor=minor)
return DeviceCapability(major=9, minor=0)
except RuntimeError:
return None

View File

@@ -220,12 +220,13 @@ class Platform:
@classmethod
def import_kernels(cls) -> None:
    """Import any platform-specific C kernels.

    The generic platform loads no kernels; concrete platforms (CPU,
    CUDA, ...) override this hook to import their compiled extension
    modules (e.g. ``vllm._C``).
    """
@classmethod
def get_attn_backend_cls(

View File

@@ -2,10 +2,14 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from datetime import timedelta
from functools import cache, lru_cache, wraps
from typing import TYPE_CHECKING
import regex as re
import torch
from torch.distributed import PrefixStore, ProcessGroup
from torch.distributed.distributed_c10d import is_nccl_available
import vllm.envs as envs
from vllm.logger import init_logger
@@ -61,13 +65,29 @@ _ROCM_DEVICE_ID_NAME_MAP: dict[str, str] = {
"0x744c": "AMD_Radeon_RX7900XTX",
}
# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES`
if "HIP_VISIBLE_DEVICES" in os.environ:
val = os.environ["HIP_VISIBLE_DEVICES"]
if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None):
assert val == cuda_val
else:
os.environ["CUDA_VISIBLE_DEVICES"] = val
def _sync_hip_cuda_env_vars():
    """Ensure HIP_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES agree.

    An empty string counts as unset. When only one of the two variables
    is set, the other is mirrored from it; when both are set they must
    match, otherwise a ValueError is raised.
    """
    names = ("HIP_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES")
    hip_val, cuda_val = (os.environ.get(name) or None for name in names)
    if hip_val is None and cuda_val is None:
        # Nothing configured — nothing to synchronize.
        return
    if hip_val is not None and cuda_val is not None:
        if hip_val != cuda_val:
            raise ValueError(
                f"Inconsistent GPU visibility env vars: "
                f"HIP_VISIBLE_DEVICES='{hip_val}' vs "
                f"CUDA_VISIBLE_DEVICES='{cuda_val}'. "
                f"Please set only one, or ensure they match."
            )
        return
    # Exactly one side is set; mirror it onto the other.
    if hip_val is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = hip_val
    else:
        os.environ["HIP_VISIBLE_DEVICES"] = cuda_val
# Sync at import time - catches misconfigurations from process start.
_sync_hip_cuda_env_vars()
# AMDSMI utils
# Note that NVML is not affected by `{CUDA/HIP}_VISIBLE_DEVICES`,
@@ -131,6 +151,77 @@ _ON_GFX942 = "gfx942" in _GCN_ARCH
_ON_GFX950 = "gfx950" in _GCN_ARCH
def _capability_from_gcn_arch(gcn_arch: str) -> tuple[int, int] | None:
"""
Parse (major, minor) from a GCN arch string, mirroring how
HIP derives hipDeviceProp_t.major / .minor.
Format: gfx<MAJOR><MINOR><STEPPING>
- 1-digit major (gfx9xx): "gfx" + M + m + stepping
- 2-digit major (gfx1xxx): "gfx" + MM + m + stepping
Examples:
gfx90a -> (9, 0) gfx942 -> (9, 4) gfx950 -> (9, 5)
gfx1100 -> (11, 0) gfx1101 -> (11, 0) gfx1200 -> (12, 0)
Returns None only when the string is not gfx-prefixed at all
(i.e. not a ROCm arch string). Raises on any string that looks
like a GCN arch but does not match a known layout.
"""
m = re.match(r"gfx(\d+)", gcn_arch)
if not m:
# Not a gfx string at all — caller should fall back to torch.cuda
return None
digits = m.group(1)
n = len(digits)
if n < 2:
raise ValueError(
f"GCN arch '{gcn_arch}' has too few digits ({n}) after 'gfx' "
f"to derive a (major, minor) capability. "
f"Please file a vLLM issue with your GPU model."
)
if n in (2, 3):
# 1-digit major: gfx9 family
# len 2: major + minor (e.g. gfx90 from gfx90a)
# len 3: major + minor + step (e.g. gfx942)
major = int(digits[0])
minor = int(digits[1])
elif n == 4:
# 2-digit major: gfx10xx, gfx11xx, gfx12xx
# major(2) + minor(1) + stepping(1)
major = int(digits[:2])
minor = int(digits[2])
elif n >= 5:
raise ValueError(
f"GCN arch '{gcn_arch}' has {n} digits after 'gfx', which "
f"exceeds the known 4-digit layout (MMms). Cannot determine "
f"major/minor split unambiguously. "
f"Please file a vLLM issue with your GPU model."
)
if major < 9:
raise ValueError(
f"Parsed unknown ROCm architecture from GCN arch '{gcn_arch}': "
f"major={major}, minor={minor}. "
f"Major version < 9 is not expected for any supported AMD GPU. "
f"Please file a vLLM issue with your GPU model."
)
if major > 12:
raise ValueError(
f"Parsed unknown ROCm architecture from GCN arch '{gcn_arch}': "
f"major={major}, minor={minor}. "
f"Major version > 12 is beyond currently known AMD generations. "
f"Please file a vLLM issue with your GPU model so support "
f"can be added."
)
return (major, minor)
def on_gfx1x() -> bool:
    """Return the cached module-level flag indicating a gfx1x-family GPU."""
    # NOTE(review): _ON_GFX1X is computed at import time elsewhere in this
    # module (not visible here); presumably derived from _GCN_ARCH like the
    # sibling _ON_GFX942 / _ON_GFX950 flags — confirm upstream definition.
    return _ON_GFX1X
@@ -441,6 +532,15 @@ class RocmPlatform(Platform):
@classmethod
@lru_cache(maxsize=8)
def get_device_capability(cls, device_id: int = 0) -> DeviceCapability | None:
    """Resolve the device capability, preferring the compiled-in GCN arch
    string so that torch.cuda (and hence CUDA/HIP) need not be initialized;
    falls back to torch.cuda when the arch string cannot be parsed."""
    parsed = _capability_from_gcn_arch(_GCN_ARCH)
    if parsed is None:
        logger.warning_once(
            "Could not derive device capability from GCN arch '%s', "
            "falling back to torch.cuda (this will initialize CUDA).",
            _GCN_ARCH,
        )
        major, minor = torch.cuda.get_device_capability(device_id)
        return DeviceCapability(major=major, minor=minor)
    major, minor = parsed
    return DeviceCapability(major=major, minor=minor)
@@ -492,7 +592,6 @@ class RocmPlatform(Platform):
use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled()
use_aiter_fp8_linear = rocm_aiter_ops.is_linear_fp8_enabled()
use_aiter_fused_se = rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
use_aiter_triton_rope = rocm_aiter_ops.is_triton_rotary_embed_enabled()
# Aiter rms norm perform best when CUDA Graph capture is enabled.
if (
use_aiter_rms_norm
@@ -519,9 +618,9 @@ class RocmPlatform(Platform):
and "-grouped_topk" not in compilation_config.custom_ops
):
compilation_config.custom_ops.append("+grouped_topk")
# Enable rotary embedding when using AITER if its not disabled by user
# Enable rotary embedding customop when using AITER if not disabled by user
if (
use_aiter_triton_rope
rocm_aiter_ops.is_enabled()
and "+rotary_embedding" not in compilation_config.custom_ops
and "-rotary_embedding" not in compilation_config.custom_ops
):
@@ -656,6 +755,37 @@ class RocmPlatform(Platform):
def get_static_graph_wrapper_cls(cls) -> str:
return "vllm.compilation.cuda_graph.CUDAGraphWrapper"
@classmethod
def stateless_init_device_torch_dist_pg(
    cls,
    backend: str,
    prefix_store: PrefixStore,
    group_rank: int,
    group_size: int,
    timeout: timedelta,
) -> ProcessGroup:
    """Build an isolated NCCL-backed ProcessGroup from an explicit store
    and rank/size, bypassing torch's global process-group bookkeeping.

    Args:
        backend: requested backend name. NOTE(review): unused here; the
            implementation asserts NCCL availability unconditionally
            (on ROCm builds, RCCL is exposed through torch's NCCL API).
        prefix_store: rendezvous store shared by all ranks of the group.
        group_rank: this process's rank within the new group.
        group_size: total number of ranks in the new group.
        timeout: collective timeout for the NCCL backend options.

    Returns:
        A ProcessGroup with NCCL registered as its default backend on
        device "cuda" (torch's ROCm builds reuse the "cuda" namespace).
    """
    assert is_nccl_available()
    # Bare ProcessGroup shell; the real backend is wired in below.
    pg: ProcessGroup = ProcessGroup(
        prefix_store,
        group_rank,
        group_size,
    )
    from torch.distributed.distributed_c10d import ProcessGroupNCCL

    backend_options = ProcessGroupNCCL.Options()
    # Private field: torch offers no public timeout setter on this path.
    backend_options._timeout = timeout
    backend_class = ProcessGroupNCCL(
        prefix_store, group_rank, group_size, backend_options
    )
    backend_type = ProcessGroup.BackendType.NCCL
    device = torch.device("cuda")
    pg._set_default_backend(backend_type)
    # Reset the collective sequence number before registering the backend.
    backend_class._set_sequence_number_for_group()
    pg._register_backend(device, backend_type, backend_class)
    return pg
@classmethod
def device_count(cls) -> int:
    """Count visible devices via the stateless helper (no context creation)."""
    return cuda_device_count_stateless()

View File

@@ -221,6 +221,12 @@ class XPUPlatform(Platform):
vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS,
)
# In some cases, the internal memory type cache can misdetect GPU
# memory as host memory, also leading to invalid memory access.
# This cache can be disabled by setting UCX_MEMTYPE_CACHE=n.
# ref. https://openucx.readthedocs.io/en/master/faq.html
os.environ["UCX_MEMTYPE_CACHE"] = "n"
@classmethod
def support_hybrid_kv_cache(cls) -> bool:
    """Report that the XPU platform can serve hybrid KV-cache models."""
    return True