Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -19,9 +19,6 @@ import torch
 import vllm.envs as envs
 from vllm.logger import init_logger
-# from vllm.model_executor.layers.batch_invariant import (
-#     vllm_is_batch_invariant,
-# )
 from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
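The dropped lines are the commented-out module-level import of `vllm_is_batch_invariant`; the third hunk below reintroduces the same import as a function-local one inside `supports_trtllm_attention()`. A minimal sketch of the two placements, assuming the usual motivation for the move (the import is resolved at call time rather than at module load, which avoids import-order problems such as circular imports):

```python
# Module-level (removed here): resolved as soon as this module is imported,
# which couples the two modules' load order.
# from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant

def supports_trtllm_attention() -> bool:
    # Function-local (added in the third hunk): resolved on first call,
    # after both modules have finished initializing.
    from vllm.model_executor.layers.batch_invariant import (
        vllm_is_batch_invariant,
    )
    return not vllm_is_batch_invariant()  # simplified; the real checks follow
```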
@@ -140,6 +137,7 @@ autotune = _lazy_import_wrapper(
     "autotune",
     fallback_fn=lambda *args, **kwargs: contextlib.nullcontext(),
 )
+_is_fi_autotuning: bool = False
 
 
 @functools.cache
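Context for this hunk: `autotune` is built by `_lazy_import_wrapper`, whose body is not part of the diff. The visible `fallback_fn` means that when FlashInfer is not installed, `autotune()` degrades to a no-op `contextlib.nullcontext()`, so call sites can use it unconditionally; the new `_is_fi_autotuning` flag presumably tracks whether a FlashInfer autotuning context is active. A sketch of the wrapper pattern under those assumptions (the module path `flashinfer.autotuner` and the wrapper internals are guesses, not from the diff):

```python
import contextlib
import importlib

def _lazy_import_wrapper(module_name: str, attr_name: str, fallback_fn=None):
    """Resolve `module_name.attr_name` lazily, on first call."""
    def wrapper(*args, **kwargs):
        try:
            module = importlib.import_module(module_name)
        except ImportError:
            # Library not installed: degrade to the no-op fallback.
            return fallback_fn(*args, **kwargs)
        return getattr(module, attr_name)(*args, **kwargs)
    return wrapper

autotune = _lazy_import_wrapper(
    "flashinfer.autotuner",  # assumed module path
    "autotune",
    fallback_fn=lambda *args, **kwargs: contextlib.nullcontext(),
)

# Call sites work with or without FlashInfer installed:
with autotune():
    pass  # run kernels; autotuning is a no-op when FlashInfer is absent
```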
@@ -279,6 +277,9 @@ def supports_trtllm_attention() -> bool:
     TRTLLM attention is supported if the platform is SM100,
     NVIDIA artifactory is accessible, and batch-invariant mode is not enabled.
     """
+    from vllm.model_executor.layers.batch_invariant import (
+        vllm_is_batch_invariant,
+    )
     # Batch-invariant mode disables TRTLLM attention
     if vllm_is_batch_invariant():
         return False
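Assembled, the function now carries its own import instead of relying on the module-level one removed in the first hunk. In vLLM this kind of capability probe is typically cached with `@functools.cache` (as seen in the second hunk's context), so the check runs once per process and is memoized; the decorator and the SM100/artifactory checks below are outside this hunk and shown here as assumptions:

```python
import functools

@functools.cache  # assumed: cached like the neighboring helpers in this file
def supports_trtllm_attention() -> bool:
    """
    TRTLLM attention is supported if the platform is SM100,
    NVIDIA artifactory is accessible, and batch-invariant mode is not enabled.
    """
    # Local import keeps batch_invariant out of this module's import time.
    from vllm.model_executor.layers.batch_invariant import (
        vllm_is_batch_invariant,
    )
    # Batch-invariant mode disables TRTLLM attention
    if vllm_is_batch_invariant():
        return False
    ...  # SM100 / NVIDIA artifactory checks, not part of this hunk
    return True
```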
@@ -734,7 +735,7 @@ def should_use_flashinfer_for_blockscale_fp8_gemm(
 
     # Verify DeepGEMM N/K dims requirements
     # NOTE: Also synchronized with test_w8a8_block_fp8_deep_gemm_matmul
-    # test inside kernels/quatization/test_block_fp8.py
+    # test inside kernels/quantization/test_block_fp8.py
     N_MULTIPLE = 64
     K_MULTIPLE = 128
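The only change in this hunk is the comment typo fix (`quatization` → `quantization`). The surrounding context documents DeepGEMM's shape requirements when deciding between FlashInfer and DeepGEMM for block-scale FP8 GEMM: N must be a multiple of 64 and K a multiple of 128, kept in sync with `test_w8a8_block_fp8_deep_gemm_matmul` in `kernels/quantization/test_block_fp8.py`. A minimal sketch of how such a gate is typically applied (the helper name is illustrative, not from the diff):

```python
N_MULTIPLE = 64   # DeepGEMM requires the GEMM N dim to be a multiple of 64
K_MULTIPLE = 128  # ...and the K dim to be a multiple of 128

def _deepgemm_dims_ok(N: int, K: int) -> bool:
    # Illustrative helper: fall back to another kernel when the weight
    # shape does not meet DeepGEMM's granularity requirements.
    return N % N_MULTIPLE == 0 and K % K_MULTIPLE == 0

assert _deepgemm_dims_ok(4096, 4096)
assert not _deepgemm_dims_ok(4096, 4032)  # K = 4032 is not a multiple of 128
```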