Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions


@@ -172,7 +172,7 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
     # Further check if the ModularKernel implementation uses the DeepGemmExperts
     return isinstance(
-        module.quant_method.moe_mk, (DeepGemmExperts, TritonOrDeepGemmExperts)
+        module.quant_method.moe_kernel, (DeepGemmExperts, TritonOrDeepGemmExperts)
     )
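For readers outside the vLLM tree: the hunk above renames the modular-kernel attribute from moe_mk to moe_kernel; the check itself is a plain isinstance probe. Below is a minimal runnable sketch of that pattern, where the expert classes and FakeQuantMethod are stand-ins introduced for illustration, not vLLM's actual classes:

# Stand-ins for vLLM's expert implementations (assumptions, not imports).
class DeepGemmExperts: ...
class TritonOrDeepGemmExperts: ...
class TritonExperts: ...

class FakeQuantMethod:
    # Hypothetical holder exposing the renamed attribute.
    def __init__(self, kernel):
        self.moe_kernel = kernel

def may_use_deep_gemm(quant_method) -> bool:
    # Mirrors the probe above: the module may take the DeepGEMM grouped-GEMM
    # path only if its modular-kernel implementation is DeepGEMM-capable.
    return isinstance(
        quant_method.moe_kernel, (DeepGemmExperts, TritonOrDeepGemmExperts)
    )

assert may_use_deep_gemm(FakeQuantMethod(DeepGemmExperts()))
assert not may_use_deep_gemm(FakeQuantMethod(TritonExperts()))

Matching on the tuple rather than a single class covers both the pure DeepGEMM implementation and the wrapper that can dispatch between Triton and DeepGEMM at runtime.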


@@ -88,9 +88,14 @@ def flashinfer_autotune(runner: "GPUModelRunner") -> None:
     Without autotuning, FlashInfer will rely on heuristics, which may
     be significantly slower.
     """
-    from vllm.utils.flashinfer import autotune
+    import vllm.utils.flashinfer as fi_utils
 
-    with torch.inference_mode(), autotune():
+    # Certain FlashInfer kernels (e.g. nvfp4 routed moe) are
+    # incompatible with autotuning. This state is used to skip
+    # those kernels during the autotuning process.
+    fi_utils._is_fi_autotuning = True
+
+    with torch.inference_mode(), fi_utils.autotune():
         # We skip EPLB here since we don't want to record dummy metrics
         # When autotuning with number of tokens m, flashinfer will autotune
         # operations for all number of tokens up to m.
@@ -100,3 +105,5 @@ def flashinfer_autotune(runner: "GPUModelRunner") -> None:
             skip_eplb=True,
             is_profile=True,
         )
+
+    fi_utils._is_fi_autotuning = False
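The new _is_fi_autotuning flag is module-level state: it is set before the dummy autotuning run and cleared after the with block, and autotune-incompatible kernels (e.g. the nvfp4 routed MoE path) consult it to bail out. A self-contained sketch of the pattern, using a types.SimpleNamespace stand-in for vllm.utils.flashinfer and a hypothetical nvfp4_routed_moe guard; note the try/finally reset is a hardening assumption here, while the diff clears the flag unconditionally:

import contextlib
import types

# Stand-in for the vllm.utils.flashinfer module that carries the flag.
fi_utils = types.SimpleNamespace(_is_fi_autotuning=False)

@contextlib.contextmanager
def autotune_scope():
    # Set the marker for the duration of the autotuning run, as the
    # diff does around runner._dummy_run(...).
    fi_utils._is_fi_autotuning = True
    try:
        yield
    finally:
        fi_utils._is_fi_autotuning = False

def nvfp4_routed_moe(x):
    # Hypothetical autotune-incompatible kernel: it skips itself while
    # the autotuning marker is set.
    if fi_utils._is_fi_autotuning:
        return None
    return x

with autotune_scope():
    assert nvfp4_routed_moe(1) is None   # skipped during autotuning
assert nvfp4_routed_moe(1) == 1          # normal path afterwards

A module-level flag keeps every kernel call signature unchanged during profiling; it is not thread-safe, which is presumably acceptable in the context where flashinfer_autotune runs.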