Upgrade to vLLM 0.17.0 CoreX v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions


@@ -19,9 +19,6 @@ import torch
 import vllm.envs as envs
 from vllm.logger import init_logger
-# from vllm.model_executor.layers.batch_invariant import (
-#     vllm_is_batch_invariant,
-# )
 from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
 
@@ -140,6 +137,7 @@ autotune = _lazy_import_wrapper(
     "autotune",
     fallback_fn=lambda *args, **kwargs: contextlib.nullcontext(),
 )
 
+_is_fi_autotuning: bool = False
 
 @functools.cache
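
The context above shows this file's lazy-import pattern: autotune is resolved through _lazy_import_wrapper, and when FlashInfer is absent it degrades to a no-op context manager. The new _is_fi_autotuning flag presumably records whether such an autotune pass is currently active. A minimal sketch of that wrapper pattern, assuming only what the context lines show (the real helper's signature and module resolution may differ):

import contextlib
import importlib
from typing import Any, Callable

def _lazy_import_wrapper_sketch(
    attr: str,
    fallback_fn: Callable[..., Any],
) -> Callable[..., Any]:
    # Resolve flashinfer.<attr> on first use; fall back when flashinfer
    # is not installed, so callers never pay an import cost up front.
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        try:
            mod = importlib.import_module("flashinfer")
        except ImportError:
            return fallback_fn(*args, **kwargs)
        return getattr(mod, attr)(*args, **kwargs)
    return wrapper

# Mirrors the context lines above: without flashinfer installed,
# autotune(...) is just a null context manager, so `with autotune():`
# still works on any platform.
autotune = _lazy_import_wrapper_sketch(
    "autotune",
    fallback_fn=lambda *args, **kwargs: contextlib.nullcontext(),
)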
@@ -279,6 +277,9 @@ def supports_trtllm_attention() -> bool:
     TRTLLM attention is supported if the platform is SM100,
     NVIDIA artifactory is accessible, and batch-invariant mode is not enabled.
     """
+    from vllm.model_executor.layers.batch_invariant import (
+        vllm_is_batch_invariant,
+    )
     # Batch-invariant mode disables TRTLLM attention
     if vllm_is_batch_invariant():
         return False
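
This hunk re-adds, as a function-local import, the batch_invariant import that the first hunk removed from module scope; deferring the import to call time is a standard way to avoid a load-time circular dependency between modules. A sketch of the resulting control flow, with the SM100 and artifactory checks from the docstring abbreviated as placeholder comments:

def supports_trtllm_attention_sketch() -> bool:
    # Function-local import: resolved on first call, so the load order
    # between this file and the batch_invariant module no longer matters.
    from vllm.model_executor.layers.batch_invariant import (
        vllm_is_batch_invariant,
    )
    # Batch-invariant mode disables TRTLLM attention outright.
    if vllm_is_batch_invariant():
        return False
    # ... the SM100 platform check and the NVIDIA artifactory
    # reachability check described in the docstring would follow here.
    return True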
@@ -734,7 +735,7 @@ def should_use_flashinfer_for_blockscale_fp8_gemm(
     # Verify DeepGEMM N/K dims requirements
     # NOTE: Also synchronized with test_w8a8_block_fp8_deep_gemm_matmul
-    # test inside kernels/quatization/test_block_fp8.py
+    # test inside kernels/quantization/test_block_fp8.py
     N_MULTIPLE = 64
     K_MULTIPLE = 128
 
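
The surrounding context encodes DeepGEMM's shape requirements for the blockscale FP8 path: N must be a multiple of 64 and K a multiple of 128. A sketch of the divisibility gate these constants imply (helper name hypothetical; how the caller routes between backends when the check fails is not shown in this hunk):

def _deepgemm_dims_ok(N: int, K: int) -> bool:
    # DeepGEMM's blockscale FP8 kernels need tile-aligned problem
    # shapes; anything off these multiples fails the requirement.
    N_MULTIPLE = 64
    K_MULTIPLE = 128
    return N % N_MULTIPLE == 0 and K % K_MULTIPLE == 0

For example, _deepgemm_dims_ok(4096, 4096) is True, while _deepgemm_dims_ok(4096, 4100) is False because 4100 is not a multiple of 128.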