Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -172,7 +172,7 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
     # Further check if the ModularKernel implementation uses the DeepGemmExperts
     return isinstance(
-        module.quant_method.moe_mk, (DeepGemmExperts, TritonOrDeepGemmExperts)
+        module.quant_method.moe_kernel, (DeepGemmExperts, TritonOrDeepGemmExperts)
     )
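For context, a minimal self-contained sketch of the check this hunk renames. The two Experts classes below are stand-ins for the real vLLM modular-kernel types; only the attribute names (`moe_kernel`, formerly `moe_mk`) come from the diff itself.

import torch

class DeepGemmExperts:            # stand-in for the vLLM class
    pass

class TritonOrDeepGemmExperts:    # stand-in for the vLLM class
    pass

def may_use_deep_gemm(module: torch.nn.Module) -> bool:
    quant_method = getattr(module, "quant_method", None)
    # After this commit the ModularKernel handle lives on
    # `moe_kernel` (renamed from `moe_mk`); the isinstance
    # check itself is unchanged.
    return isinstance(
        getattr(quant_method, "moe_kernel", None),
        (DeepGemmExperts, TritonOrDeepGemmExperts),
    )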
@@ -88,9 +88,14 @@ def flashinfer_autotune(runner: "GPUModelRunner") -> None:
     Without autotuning, FlashInfer will rely on heuristics, which may
     be significantly slower.
     """
-    from vllm.utils.flashinfer import autotune
+    import vllm.utils.flashinfer as fi_utils
 
-    with torch.inference_mode(), autotune():
+    with torch.inference_mode(), fi_utils.autotune():
+        # Certain FlashInfer kernels (e.g. nvfp4 routed moe) are
+        # incompatible with autotuning. This state is used to skip
+        # those kernels during the autotuning process.
+        fi_utils._is_fi_autotuning = True
+
         # We skip EPLB here since we don't want to record dummy metrics
         # When autotuning with number of tokens m, flashinfer will autotune
         # operations for all number of tokens up to m.
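The added comments describe the consumer side of this flag: kernels that cannot be autotuned check it and fall back. A hedged sketch of that pattern; only `fi_utils._is_fi_autotuning` comes from this diff, and the three functions are hypothetical names, not vLLM API.

import vllm.utils.flashinfer as fi_utils  # module patched by this commit

def _fallback_moe(x):           # hypothetical non-FlashInfer path
    return x

def _flashinfer_nvfp4_moe(x):   # hypothetical FlashInfer-backed path
    return x

def run_nvfp4_routed_moe(x):
    # Skip the autotune-incompatible kernel while the autotune
    # pass set up in the hunk above is active.
    if getattr(fi_utils, "_is_fi_autotuning", False):
        return _fallback_moe(x)
    return _flashinfer_nvfp4_moe(x)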
@@ -100,3 +105,5 @@ def flashinfer_autotune(runner: "GPUModelRunner") -> None:
         skip_eplb=True,
         is_profile=True,
     )
+
+    fi_utils._is_fi_autotuning = False
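One design note on the reset added here: because the flag is cleared outside any try/finally, an exception raised during autotuning would leave `_is_fi_autotuning` set to True. A sketch of an exception-safe variant; the context manager is illustrative, not vLLM API, and only the attribute name comes from the diff.

from contextlib import contextmanager

import vllm.utils.flashinfer as fi_utils

@contextmanager
def fi_autotuning_flag():
    # Same flag as the hunks above, but scoped so it is reset
    # even if autotuning raises.
    fi_utils._is_fi_autotuning = True
    try:
        yield
    finally:
        fi_utils._is_fi_autotuning = False

The call site would then read `with torch.inference_mode(), fi_utils.autotune(), fi_autotuning_flag():` and the trailing reset would no longer be needed.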