Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -172,7 +172,7 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
     # Further check if the ModularKernel implementation uses the DeepGemmExperts
     return isinstance(
-        module.quant_method.moe_mk, (DeepGemmExperts, TritonOrDeepGemmExperts)
+        module.quant_method.moe_kernel, (DeepGemmExperts, TritonOrDeepGemmExperts)
     )
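For context, a minimal self-contained sketch of the check this hunk renames. The two Experts classes below are stand-ins for the real vLLM modular-kernel types; only the attribute names (`moe_kernel`, formerly `moe_mk`) come from the diff itself.

import torch

class DeepGemmExperts:            # stand-in for the vLLM class
    pass

class TritonOrDeepGemmExperts:    # stand-in for the vLLM class
    pass

def may_use_deep_gemm(module: torch.nn.Module) -> bool:
    quant_method = getattr(module, "quant_method", None)
    # After this commit the ModularKernel handle lives on
    # `moe_kernel` (renamed from `moe_mk`); the isinstance
    # check itself is unchanged.
    return isinstance(
        getattr(quant_method, "moe_kernel", None),
        (DeepGemmExperts, TritonOrDeepGemmExperts),
    )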
@@ -88,9 +88,14 @@ def flashinfer_autotune(runner: "GPUModelRunner") -> None:
     Without autotuning, FlashInfer will rely on heuristics, which may
     be significantly slower.
     """
-    from vllm.utils.flashinfer import autotune
+    import vllm.utils.flashinfer as fi_utils
 
-    with torch.inference_mode(), autotune():
+    with torch.inference_mode(), fi_utils.autotune():
+        # Certain FlashInfer kernels (e.g. nvfp4 routed moe) are
+        # incompatible with autotuning. This state is used to skip
+        # those kernels during the autotuning process.
+        fi_utils._is_fi_autotuning = True
+
         # We skip EPLB here since we don't want to record dummy metrics
         # When autotuning with number of tokens m, flashinfer will autotune
         # operations for all number of tokens up to m.
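The added comments describe the consumer side of this flag: kernels that cannot be autotuned check it and fall back. A hedged sketch of that pattern; only `fi_utils._is_fi_autotuning` comes from this diff, and the three functions are hypothetical names, not vLLM API.

import vllm.utils.flashinfer as fi_utils  # module patched by this commit

def _fallback_moe(x):           # hypothetical non-FlashInfer path
    return x

def _flashinfer_nvfp4_moe(x):   # hypothetical FlashInfer-backed path
    return x

def run_nvfp4_routed_moe(x):
    # Skip the autotune-incompatible kernel while the autotune
    # pass set up in the hunk above is active.
    if getattr(fi_utils, "_is_fi_autotuning", False):
        return _fallback_moe(x)
    return _flashinfer_nvfp4_moe(x)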
@@ -100,3 +105,5 @@ def flashinfer_autotune(runner: "GPUModelRunner") -> None:
         skip_eplb=True,
         is_profile=True,
     )
+
+    fi_utils._is_fi_autotuning = False
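One design note on the reset added here: because the flag is cleared outside any try/finally, an exception raised during autotuning would leave `_is_fi_autotuning` set to True. A sketch of an exception-safe variant; the context manager is illustrative, not vLLM API, and only the attribute name comes from the diff.

from contextlib import contextmanager

import vllm.utils.flashinfer as fi_utils

@contextmanager
def fi_autotuning_flag():
    # Same flag as the hunks above, but scoped so it is reset
    # even if autotuning raises.
    fi_utils._is_fi_autotuning = True
    try:
        yield
    finally:
        fi_utils._is_fi_autotuning = False

The call site would then read `with torch.inference_mode(), fi_utils.autotune(), fi_autotuning_flag():` and the trailing reset would no longer be needed.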