Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions


@@ -26,6 +26,9 @@ if TYPE_CHECKING:
VLLM_ENGINE_READY_TIMEOUT_S: int = 600
VLLM_API_KEY: str | None = None
VLLM_DEBUG_LOG_API_SERVER_RESPONSE: bool = False
VLLM_ENABLE_PP_MIX_ILU_SCHEDULING: bool = False
VLLM_ENABLE_PP_ILU_OPT: bool = False
VLLM_PP_ILU_OPT_BATCH_QUEUE_SIZE: int = 0
S3_ACCESS_KEY_ID: str | None = None
S3_SECRET_ACCESS_KEY: str | None = None
S3_ENDPOINT_URL: str | None = None
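For readers unfamiliar with this file's layout: the `if TYPE_CHECKING:` declarations above are stubs for type checkers only. At runtime, vLLM resolves each name lazily through a module-level `__getattr__` (PEP 562) that evaluates the matching lambda in the `environment_variables` dict defined further down. A minimal sketch of that pattern, illustrative rather than the file's exact code:

import os
from typing import TYPE_CHECKING, Any, Callable

if TYPE_CHECKING:
    # Stub for type checkers; never assigned at runtime.
    VLLM_ENGINE_READY_TIMEOUT_S: int = 600

environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_ENGINE_READY_TIMEOUT_S": lambda: int(
        os.getenv("VLLM_ENGINE_READY_TIMEOUT_S", "600")
    ),
}

def __getattr__(name: str) -> Any:
    # Lazy lookup: the env var is re-read on every attribute access.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")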
@@ -35,7 +38,7 @@ if TYPE_CHECKING:
VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
VLLM_NO_USAGE_STATS: bool = False
VLLM_DO_NOT_TRACK: bool = False
VLLM_USAGE_SOURCE: str = ""
VLLM_USAGE_SOURCE: str = "production"
VLLM_CONFIGURE_LOGGING: bool = True
VLLM_LOGGING_LEVEL: str = "INFO"
VLLM_LOGGING_PREFIX: str = ""
@@ -48,7 +51,7 @@ if TYPE_CHECKING:
VLLM_USE_FLASHINFER_SAMPLER: bool | None = None
VLLM_PP_LAYER_PARTITION: str | None = None
VLLM_CPU_KVCACHE_SPACE: int | None = 0
VLLM_CPU_OMP_THREADS_BIND: str = ""
VLLM_CPU_OMP_THREADS_BIND: str = "auto"
VLLM_CPU_NUM_OF_RESERVED_CPU: int | None = None
VLLM_CPU_SGL_KERNEL: bool = False
VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
@@ -89,13 +92,14 @@ if TYPE_CHECKING:
VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None
VLLM_LORA_RESOLVER_HF_REPO_LIST: str | None = None
VLLM_USE_AOT_COMPILE: bool = False
VLLM_USE_BYTECODE_HOOK: bool = False
VLLM_USE_BYTECODE_HOOK: bool = True
VLLM_FORCE_AOT_LOAD: bool = False
VLLM_USE_MEGA_AOT_ARTIFACT: bool = False
VLLM_USE_TRITON_AWQ: bool = False
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
VLLM_SKIP_P2P_CHECK: bool = False
VLLM_DISABLED_KERNELS: list[str] = []
VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE: bool = True
VLLM_DISABLE_PYNCCL: bool = False
VLLM_USE_OINK_OPS: bool = False
VLLM_ROCM_USE_AITER: bool = False
@@ -106,7 +110,7 @@ if TYPE_CHECKING:
VLLM_ROCM_USE_AITER_MLA: bool = True
VLLM_ROCM_USE_AITER_MHA: bool = True
VLLM_ROCM_USE_AITER_FP4_ASM_GEMM: bool = False
VLLM_ROCM_USE_AITER_TRITON_ROPE: bool = True
VLLM_ROCM_USE_AITER_TRITON_ROPE: bool = False
VLLM_ROCM_USE_AITER_FP8BMM: bool = True
VLLM_ROCM_USE_AITER_FP4BMM: bool = True
VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION: bool = False
@@ -168,7 +172,7 @@ if TYPE_CHECKING:
VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency", "masked_gemm"] = (
"latency"
)
VLLM_FLASHINFER_ALLREDUCE_BACKEND: Literal["auto", "trtllm", "mnnvl"] = "auto"
VLLM_FLASHINFER_ALLREDUCE_BACKEND: Literal["auto", "trtllm", "mnnvl"] = "trtllm"
VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE: int = 394 * 1024 * 1024
VLLM_XGRAMMAR_CACHE_MB: int = 0
VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
@@ -231,7 +235,7 @@ if TYPE_CHECKING:
VLLM_USE_FBGEMM: bool = False
VLLM_GC_DEBUG: str = ""
VLLM_DEBUG_WORKSPACE: bool = False
VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = False
VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = True
VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256
VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary"
VLLM_USE_V2_MODEL_RUNNER: bool = False
@@ -243,17 +247,33 @@ if TYPE_CHECKING:
VLLM_LORA_DISABLE_PDL: bool = False
VLLM_ENABLE_CUDA_COMPATIBILITY: bool = False
VLLM_CUDA_COMPATIBILITY_PATH: str | None = None
VLLM_ELASTIC_EP_SCALE_UP_LAUNCH: bool = False
VLLM_ELASTIC_EP_DRAIN_REQUESTS: bool = False
VLLM_FLAT_LOGPROBS: bool = False
# Optional env vars we add.
VLLM_W8A8_MOE_USE_W4A8: bool = False
VLLM_WNA16_MOE_USE_W4A8: bool = False
VLLM_W8A8_FORMAT: str = "TN"
VLLM_W4A8_FORMAT: str = "TN"
VLLM_W4A8_VERSION: int = 2
VLLM_MIX_QUANTIZATION_TYPE: str = ""
VLLM_MLA_CUSTOMIZE: bool = True
VLLM_USE_INT8_MLA: bool = False
VLLM_W8A8_LINEAR_USE_W4A8: bool = False
VLLM_FORCE_NCCL_COMM: bool = False
VLLM_KV_DISABLE_CROSS_GROUP_SHARE: bool = False
# Support for Iluvatar IxServer
VLLM_ATTN_OPT_LEVEL: int = 0
VLLM_MOE_OPT_LEVEL: int = 0
VLLM_LINEAR_OPT_LEVEL: int = 0
VLLM_OPT_EXCLUDE_LAYERS: str = ""
VLLM_LINEAR_SPECIFIED_LAYERS: str = ""
VLLM_LINEAR_SPECIFIED_KEYS: str = ""
VLLM_LINEAR_SPECIFIED_OPT_LEVEL: int = 0
VLLM_USE_LORA_FUSION: bool = False
VLLM_USE_SILU_QUANT_FUSION: bool = False
# Static quantization for attention
VLLM_ATTN_STATIC_QUANT_SCALE_FILE_PATH: str = ""
def get_default_cache_root():
return os.getenv(
@@ -478,6 +498,8 @@ def get_env_or_set_default(
logger = logging.getLogger(__name__)
IGNORED_UNKNOWN_VARS = {"VLLM_ENFORCE_CUDA_GRAPH"}
environment_variables: dict[str, Callable[[], Any]] = {
# ================== Installation Time Env Vars ==================
# Target device of vLLM, supporting [cuda (by default),
@@ -648,6 +670,22 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False"
).lower()
== "true",
# When set to 1, scheduler.schedule() delegates to schedule_opt()
# for PP mix ILU scheduling.
"VLLM_ENABLE_PP_MIX_ILU_SCHEDULING": lambda: os.environ.get(
"VLLM_ENABLE_PP_MIX_ILU_SCHEDULING", "0"
) == "1",
# When set to 1, use step_with_batch_queue_ilu_opt (async batch queue with
# background thread). Batch queue size can be controlled via
# VLLM_PP_ILU_OPT_BATCH_QUEUE_SIZE.
"VLLM_ENABLE_PP_ILU_OPT": lambda: os.environ.get(
"VLLM_ENABLE_PP_ILU_OPT", "0"
) == "1",
# Batch queue size used when VLLM_ENABLE_PP_ILU_OPT=1.
# If <= 0, EngineCore falls back to base_batch_queue_size * 2.
"VLLM_PP_ILU_OPT_BATCH_QUEUE_SIZE": lambda: int(
os.environ.get("VLLM_PP_ILU_OPT_BATCH_QUEUE_SIZE", "0")
),
# S3 access information, used for tensorizer to load model from S3
"S3_ACCESS_KEY_ID": lambda: os.environ.get("S3_ACCESS_KEY_ID", None),
"S3_SECRET_ACCESS_KEY": lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
@@ -862,7 +900,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
),
# Time in ms for the zmq client to wait for a response from the backend
# server for simple data operations
"VLLM_RPC_TIMEOUT": lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),
"VLLM_RPC_TIMEOUT": lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "1000000")),
# Timeout in seconds for keeping HTTP connections alive in API server
"VLLM_HTTP_TIMEOUT_KEEP_ALIVE": lambda: int(
os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")
@@ -907,6 +945,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_DISABLED_KERNELS": lambda: []
if "VLLM_DISABLED_KERNELS" not in os.environ
else os.environ["VLLM_DISABLED_KERNELS"].split(","),
"VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE": lambda: bool(
int(os.getenv("VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE", "1"))
),
# Disable pynccl (using torch.distributed instead)
"VLLM_DISABLE_PYNCCL": lambda: (
os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")
@@ -957,9 +998,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
os.getenv("VLLM_ROCM_USE_AITER_FP4_ASM_GEMM", "False").lower() in ("true", "1")
),
# Whether to use aiter rope.
# Enabled by default.
# Disabled by default.
"VLLM_ROCM_USE_AITER_TRITON_ROPE": lambda: (
os.getenv("VLLM_ROCM_USE_AITER_TRITON_ROPE", "True").lower() in ("true", "1")
os.getenv("VLLM_ROCM_USE_AITER_TRITON_ROPE", "False").lower() in ("true", "1")
),
# Whether to use aiter triton fp8 bmm kernel
# By default is enabled.
@@ -1305,9 +1346,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Flashinfer fused allreduce backend.
# "auto" will default to "mnnvl", which performs mostly same/better than "trtllm".
# But "mnnvl" backend does not support fuse with quantization.
# TODO: Default is "trtllm" right now because "mnnvl" has issues with cudagraph:
# https://github.com/vllm-project/vllm/issues/35772
# Should switch back to "auto" if the issue is resolved.
"VLLM_FLASHINFER_ALLREDUCE_BACKEND": env_with_choices(
"VLLM_FLASHINFER_ALLREDUCE_BACKEND",
"auto",
"trtllm",
["auto", "trtllm", "mnnvl"],
),
# Control the workspace buffer size for the FlashInfer backend.
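The `env_with_choices` helper used here is defined elsewhere in this file; it constrains a string env var to an allowed set. A minimal sketch of what such a choice-validated reader plausibly does, not its actual implementation:

import os
from typing import Callable

def env_with_choices_sketch(
    name: str, default: str, choices: list[str]
) -> Callable[[], str]:
    # Lazy reader that fails loudly on a value outside the allowed set.
    def _read() -> str:
        value = os.getenv(name, default)
        if value not in choices:
            raise ValueError(
                f"{name}={value!r} is invalid; expected one of {choices}"
            )
        return value
    return _read

# e.g. the entry above becomes:
# env_with_choices_sketch("VLLM_FLASHINFER_ALLREDUCE_BACKEND",
#                         "trtllm", ["auto", "trtllm", "mnnvl"])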
@@ -1478,41 +1522,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
),
# Allows vllm to find tuned config under customized folder
"VLLM_TUNED_CONFIG_FOLDER": lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None),
# Upstream vLLM does not support W4A8 and W8A8, so we add them.
# For MoE, W8A8 is the default; if set to true, W4A8 is used.
"VLLM_W8A8_LINEAR_USE_W4A8":
lambda: os.environ.get("VLLM_W8A8_LINEAR_USE_W4A8", "0").lower() in
("1", "true"),
"VLLM_W8A8_MOE_USE_W4A8":
lambda: os.environ.get("VLLM_W8A8_MOE_USE_W4A8", "0").lower() in
("1", "true"),
"VLLM_FORCE_NCCL_COMM":
lambda: os.environ.get("VLLM_FORCE_NCCL_COMM", "0").lower() in
("1", "true"),
# If set to true, use int8 MLA attention for the decode stage.
"VLLM_USE_INT8_MLA":
lambda: os.environ.get("VLLM_USE_INT8_MLA", "0").lower() in
("1", "true"),
# For W4A8 MoE, the default GEMM format is TN; choices: [TN, NN].
"VLLM_W4A8_FORMAT":
lambda: os.environ.get("VLLM_W4A8_FORMAT", "TN").upper(),
# For W4A8 MoE, the default version is 2; choices: [1, 2].
"VLLM_W4A8_VERSION":
lambda: int(os.environ.get("VLLM_W4A8_VERSION", "2")),
# Temporary parameter to support compressed-tensors' mixed quantization
"VLLM_MIX_QUANTIZATION_TYPE":
lambda: os.environ.get("VLLM_MIX_QUANTIZATION_TYPE", "").upper(),
# Use a customized MLA implementation for faster speed and lower GPU memory usage.
"VLLM_MLA_CUSTOMIZE":
lambda: os.environ.get("VLLM_MLA_CUSTOMIZE", "1").lower() in
("1", "true"),
# Valid values are container,code_interpreter,web_search_preview
# e.g. VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS=container,code_interpreter
# If the server_label of your MCP tool is not in this list it will
@@ -1607,7 +1616,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_DEBUG_WORKSPACE": lambda: bool(int(os.getenv("VLLM_DEBUG_WORKSPACE", "0"))),
# Disables parallel execution of shared_experts via separate cuda stream
"VLLM_DISABLE_SHARED_EXPERTS_STREAM": lambda: bool(
int(os.getenv("VLLM_DISABLE_SHARED_EXPERTS_STREAM", "0"))
int(os.getenv("VLLM_DISABLE_SHARED_EXPERTS_STREAM", "1"))
),
# Limits when we run shared_experts in a separate stream.
# We found out that for large batch sizes, the separate stream
@@ -1662,10 +1671,93 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_CUDA_COMPATIBILITY_PATH": lambda: os.environ.get(
"VLLM_CUDA_COMPATIBILITY_PATH", None
),
# Control KV cache sharing across groups.
"VLLM_KV_DISABLE_CROSS_GROUP_SHARE":
lambda: os.environ.get("VLLM_KV_DISABLE_CROSS_GROUP_SHARE", "0").lower() in
("1", "true"),
# Whether this is a scale-up launch of the engine for elastic EP.
# Should only be set by EngineCoreClient.
"VLLM_ELASTIC_EP_SCALE_UP_LAUNCH": lambda: bool(
int(os.getenv("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH", "0"))
),
# Whether to wait for all requests to drain before sending the
# scaling command in elastic EP.
"VLLM_ELASTIC_EP_DRAIN_REQUESTS": lambda: bool(
int(os.getenv("VLLM_ELASTIC_EP_DRAIN_REQUESTS", "0"))
),
# Flag to enable FlatLogprobs, whose GC overhead is significantly smaller
# than the original list[dict[int, Logprob]] approach.
# Once enabled, PromptLogprobs and SampleLogprobs are populated as
# FlatLogprobs.
"VLLM_FLAT_LOGPROBS": lambda: bool(int(os.getenv("VLLM_FLAT_LOGPROBS", "0"))),
# Upstream vLLM does not support W4A8 and W8A8, so we add them.
# For MoE, W8A8 is the default; if set to true, W4A8 is used.
"VLLM_W8A8_MOE_USE_W4A8":
lambda: os.environ.get("VLLM_W8A8_MOE_USE_W4A8", "0").lower() in
("1", "true"),
"VLLM_WNA16_MOE_USE_W4A8":
lambda: os.environ.get("VLLM_WNA16_MOE_USE_W4A8", "0").lower() in
("1", "true"),
# If set to true, use int8 MLA attention for the decode stage.
"VLLM_USE_INT8_MLA":
lambda: os.environ.get("VLLM_USE_INT8_MLA", "0").lower() in
("1", "true"),
# Attention opt level: 0 = f16 QKV, 1 = i8 QKV, 2 = i8 QK with f16 V.
"VLLM_ATTN_OPT_LEVEL":
lambda: int(os.environ.get("VLLM_ATTN_OPT_LEVEL", "0")),
# For W8A8 MoE, the default GEMM format is TN; choices: [TN, NN].
# However, GEMV (CUDA Graph) only supports NN.
"VLLM_W8A8_FORMAT":
lambda: os.environ.get("VLLM_W8A8_FORMAT", "TN").upper(),
# For W4A8 MoE, the default GEMM format is TN; choices: [TN, NN].
"VLLM_W4A8_FORMAT":
lambda: os.environ.get("VLLM_W4A8_FORMAT", "TN").upper(),
# For W4A8 MoE, the default version is 2; choices: [1, 2].
"VLLM_W4A8_VERSION":
lambda: int(os.environ.get("VLLM_W4A8_VERSION", "2")),
# Temporary parameter to support compressed-tensors' mixed quantization
"VLLM_MIX_QUANTIZATION_TYPE":
lambda: os.environ.get("VLLM_MIX_QUANTIZATION_TYPE", "").upper(),
# Use a customized MLA implementation for faster speed and lower GPU memory usage.
"VLLM_MLA_CUSTOMIZE":
lambda: os.environ.get("VLLM_MLA_CUSTOMIZE", "1").lower() in
("1", "true"),
# Support for Iluvatar IxServer, a distributed inference framework.
"VLLM_MOE_OPT_LEVEL":
lambda: int(os.getenv("VLLM_MOE_OPT_LEVEL", "0")),
"VLLM_LINEAR_OPT_LEVEL":
lambda: int(os.getenv("VLLM_LINEAR_OPT_LEVEL", "0")),
"VLLM_OPT_EXCLUDE_LAYERS":
lambda: os.environ.get("VLLM_OPT_EXCLUDE_LAYERS", "").upper(),
"VLLM_LINEAR_SPECIFIED_LAYERS":
lambda: os.environ.get("VLLM_LINEAR_SPECIFIED_LAYERS", "").upper(),
"VLLM_LINEAR_SPECIFIED_KEYS":
lambda: os.environ.get("VLLM_LINEAR_SPECIFIED_KEYS", "").lower(),
"VLLM_LINEAR_SPECIFIED_OPT_LEVEL":
lambda: int(os.getenv("VLLM_LINEAR_SPECIFIED_OPT_LEVEL", "0")),
"VLLM_USE_LORA_FUSION":
lambda: os.environ.get("VLLM_USE_LORA_FUSION", "0").lower() in
("1", "true"),
"VLLM_USE_SILU_QUANT_FUSION":
lambda: os.environ.get("VLLM_USE_SILU_QUANT_FUSION", "0").lower() in
("1", "true"),
"VLLM_ATTN_STATIC_QUANT_SCALE_FILE_PATH":
lambda: os.environ.get("VLLM_ATTN_STATIC_QUANT_SCALE_FILE_PATH", ""),
}
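Nearly every overlay entry above repeats the same truthiness convention: a case-insensitive "1" or "true" enables the flag, and anything else (including unset, with a "0" default) disables it. A hypothetical shared helper capturing that convention:

import os
from typing import Callable

def overlay_bool(name: str, default: str = "0") -> Callable[[], bool]:
    # Only "1" and "true" (any case) count as enabled, matching the
    # inline lambdas in the overlay entries above.
    return lambda: os.environ.get(name, default).lower() in ("1", "true")

# e.g. "VLLM_USE_LORA_FUSION": overlay_bool("VLLM_USE_LORA_FUSION"),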
@@ -1738,6 +1830,8 @@ def is_set(name: str):
def validate_environ(hard_fail: bool) -> None:
for env in os.environ:
if env.startswith("VLLM_") and env not in environment_variables:
if env in IGNORED_UNKNOWN_VARS:
continue
if hard_fail:
raise ValueError(f"Unknown vLLM environment variable detected: {env}")
else:
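With the new `IGNORED_UNKNOWN_VARS` escape hatch, an unknown `VLLM_*` variable on the ignore list is skipped before the hard-fail check. A small illustration of the expected behavior (assuming `validate_environ` is reachable as `vllm.envs.validate_environ`; the second variable name is an example):

import os
from vllm import envs  # assumption: validate_environ is exposed here

os.environ["VLLM_ENFORCE_CUDA_GRAPH"] = "1"  # ignore-listed: skipped
os.environ["VLLM_NOT_A_REAL_FLAG"] = "1"     # unknown: caught below

try:
    envs.validate_environ(hard_fail=True)
except ValueError as exc:
    print(exc)  # Unknown vLLM environment variable detected: VLLM_NOT_A_REAL_FLAG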
@@ -1801,10 +1895,7 @@ def compile_factors() -> dict[str, object]:
"VLLM_ENABLE_V1_MULTIPROCESSING",
"VLLM_V1_OUTPUT_PROC_CHUNK_SIZE",
"VLLM_CPU_KVCACHE_SPACE",
"VLLM_CPU_OMP_THREADS_BIND",
"VLLM_CPU_NUM_OF_RESERVED_CPU",
"VLLM_CPU_MOE_PREPACK",
"VLLM_CPU_SGL_KERNEL",
"VLLM_TEST_FORCE_LOAD_FORMAT",
"VLLM_ENABLE_CUDA_COMPATIBILITY",
"VLLM_CUDA_COMPATIBILITY_PATH",