Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -19,9 +19,6 @@ import torch
 import vllm.envs as envs
 from vllm.logger import init_logger
-# from vllm.model_executor.layers.batch_invariant import (
-#     vllm_is_batch_invariant,
-# )
 from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
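The dropped lines are the commented-out module-level import of `vllm_is_batch_invariant`; the third hunk below reintroduces the same import as a function-local one inside `supports_trtllm_attention()`. A minimal sketch of the two placements, assuming the usual motivation for the move (the import is resolved at call time rather than at module load, which avoids import-order problems such as circular imports):

```python
# Module-level (removed here): resolved as soon as this module is imported,
# which couples the two modules' load order.
# from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant

def supports_trtllm_attention() -> bool:
    # Function-local (added in the third hunk): resolved on first call,
    # after both modules have finished initializing.
    from vllm.model_executor.layers.batch_invariant import (
        vllm_is_batch_invariant,
    )
    return not vllm_is_batch_invariant()  # simplified; the real checks follow
```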
@@ -140,6 +137,7 @@ autotune = _lazy_import_wrapper(
     "autotune",
     fallback_fn=lambda *args, **kwargs: contextlib.nullcontext(),
 )
+_is_fi_autotuning: bool = False
 
 
 @functools.cache
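Context for this hunk: `autotune` is built by `_lazy_import_wrapper`, whose body is not part of the diff. The visible `fallback_fn` means that when FlashInfer is not installed, `autotune()` degrades to a no-op `contextlib.nullcontext()`, so call sites can use it unconditionally; the new `_is_fi_autotuning` flag presumably tracks whether a FlashInfer autotuning context is active. A sketch of the wrapper pattern under those assumptions (the module path `flashinfer.autotuner` and the wrapper internals are guesses, not from the diff):

```python
import contextlib
import importlib

def _lazy_import_wrapper(module_name: str, attr_name: str, fallback_fn=None):
    """Resolve `module_name.attr_name` lazily, on first call."""
    def wrapper(*args, **kwargs):
        try:
            module = importlib.import_module(module_name)
        except ImportError:
            # Library not installed: degrade to the no-op fallback.
            return fallback_fn(*args, **kwargs)
        return getattr(module, attr_name)(*args, **kwargs)
    return wrapper

autotune = _lazy_import_wrapper(
    "flashinfer.autotuner",  # assumed module path
    "autotune",
    fallback_fn=lambda *args, **kwargs: contextlib.nullcontext(),
)

# Call sites work with or without FlashInfer installed:
with autotune():
    pass  # run kernels; autotuning is a no-op when FlashInfer is absent
```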
@@ -279,6 +277,9 @@ def supports_trtllm_attention() -> bool:
     TRTLLM attention is supported if the platform is SM100,
     NVIDIA artifactory is accessible, and batch-invariant mode is not enabled.
     """
+    from vllm.model_executor.layers.batch_invariant import (
+        vllm_is_batch_invariant,
+    )
     # Batch-invariant mode disables TRTLLM attention
     if vllm_is_batch_invariant():
         return False
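Assembled, the function now carries its own import instead of relying on the module-level one removed in the first hunk. In vLLM this kind of capability probe is typically cached with `@functools.cache` (as seen in the second hunk's context), so the check runs once per process and is memoized; the decorator and the SM100/artifactory checks below are outside this hunk and shown here as assumptions:

```python
import functools

@functools.cache  # assumed: cached like the neighboring helpers in this file
def supports_trtllm_attention() -> bool:
    """
    TRTLLM attention is supported if the platform is SM100,
    NVIDIA artifactory is accessible, and batch-invariant mode is not enabled.
    """
    # Local import keeps batch_invariant out of this module's import time.
    from vllm.model_executor.layers.batch_invariant import (
        vllm_is_batch_invariant,
    )
    # Batch-invariant mode disables TRTLLM attention
    if vllm_is_batch_invariant():
        return False
    ...  # SM100 / NVIDIA artifactory checks, not part of this hunk
    return True
```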
@@ -734,7 +735,7 @@ def should_use_flashinfer_for_blockscale_fp8_gemm(
 
     # Verify DeepGEMM N/K dims requirements
     # NOTE: Also synchronized with test_w8a8_block_fp8_deep_gemm_matmul
-    # test inside kernels/quatization/test_block_fp8.py
+    # test inside kernels/quantization/test_block_fp8.py
     N_MULTIPLE = 64
     K_MULTIPLE = 128
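The only change in this hunk is the comment typo fix (`quatization` → `quantization`). The surrounding context documents DeepGEMM's shape requirements when deciding between FlashInfer and DeepGEMM for block-scale FP8 GEMM: N must be a multiple of 64 and K a multiple of 128, kept in sync with `test_w8a8_block_fp8_deep_gemm_matmul` in `kernels/quantization/test_block_fp8.py`. A minimal sketch of how such a gate is typically applied (the helper name is illustrative, not from the diff):

```python
N_MULTIPLE = 64   # DeepGEMM requires the GEMM N dim to be a multiple of 64
K_MULTIPLE = 128  # ...and the K dim to be a multiple of 128

def _deepgemm_dims_ok(N: int, K: int) -> bool:
    # Illustrative helper: fall back to another kernel when the weight
    # shape does not meet DeepGEMM's granularity requirements.
    return N % N_MULTIPLE == 0 and K % K_MULTIPLE == 0

assert _deepgemm_dims_ok(4096, 4096)
assert not _deepgemm_dims_ok(4096, 4032)  # K = 4032 is not a multiple of 128
```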