Upgrade to vllm 0.17.0 corex v4.1 overlay
This commit is contained in:
203
vllm/envs.py
203
vllm/envs.py
@@ -26,6 +26,9 @@ if TYPE_CHECKING:
|
||||
VLLM_ENGINE_READY_TIMEOUT_S: int = 600
|
||||
VLLM_API_KEY: str | None = None
|
||||
VLLM_DEBUG_LOG_API_SERVER_RESPONSE: bool = False
|
||||
VLLM_ENABLE_PP_MIX_ILU_SCHEDULING: bool = False
|
||||
VLLM_ENABLE_PP_ILU_OPT: bool = False
|
||||
VLLM_PP_ILU_OPT_BATCH_QUEUE_SIZE: int = 0
|
||||
S3_ACCESS_KEY_ID: str | None = None
|
||||
S3_SECRET_ACCESS_KEY: str | None = None
|
||||
S3_ENDPOINT_URL: str | None = None
|
||||
@@ -35,7 +38,7 @@ if TYPE_CHECKING:
|
||||
VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
|
||||
VLLM_NO_USAGE_STATS: bool = False
|
||||
VLLM_DO_NOT_TRACK: bool = False
|
||||
VLLM_USAGE_SOURCE: str = ""
|
||||
VLLM_USAGE_SOURCE: str = "production"
|
||||
VLLM_CONFIGURE_LOGGING: bool = True
|
||||
VLLM_LOGGING_LEVEL: str = "INFO"
|
||||
VLLM_LOGGING_PREFIX: str = ""
|
||||
@@ -48,7 +51,7 @@ if TYPE_CHECKING:
|
||||
VLLM_USE_FLASHINFER_SAMPLER: bool | None = None
|
||||
VLLM_PP_LAYER_PARTITION: str | None = None
|
||||
VLLM_CPU_KVCACHE_SPACE: int | None = 0
|
||||
VLLM_CPU_OMP_THREADS_BIND: str = ""
|
||||
VLLM_CPU_OMP_THREADS_BIND: str = "auto"
|
||||
VLLM_CPU_NUM_OF_RESERVED_CPU: int | None = None
|
||||
VLLM_CPU_SGL_KERNEL: bool = False
|
||||
VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
|
||||
@@ -89,13 +92,14 @@ if TYPE_CHECKING:
|
||||
VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None
|
||||
VLLM_LORA_RESOLVER_HF_REPO_LIST: str | None = None
|
||||
VLLM_USE_AOT_COMPILE: bool = False
|
||||
VLLM_USE_BYTECODE_HOOK: bool = False
|
||||
VLLM_USE_BYTECODE_HOOK: bool = True
|
||||
VLLM_FORCE_AOT_LOAD: bool = False
|
||||
VLLM_USE_MEGA_AOT_ARTIFACT: bool = False
|
||||
VLLM_USE_TRITON_AWQ: bool = False
|
||||
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
|
||||
VLLM_SKIP_P2P_CHECK: bool = False
|
||||
VLLM_DISABLED_KERNELS: list[str] = []
|
||||
VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE: bool = True
|
||||
VLLM_DISABLE_PYNCCL: bool = False
|
||||
VLLM_USE_OINK_OPS: bool = False
|
||||
VLLM_ROCM_USE_AITER: bool = False
|
||||
@@ -106,7 +110,7 @@ if TYPE_CHECKING:
|
||||
VLLM_ROCM_USE_AITER_MLA: bool = True
|
||||
VLLM_ROCM_USE_AITER_MHA: bool = True
|
||||
VLLM_ROCM_USE_AITER_FP4_ASM_GEMM: bool = False
|
||||
VLLM_ROCM_USE_AITER_TRITON_ROPE: bool = True
|
||||
VLLM_ROCM_USE_AITER_TRITON_ROPE: bool = False
|
||||
VLLM_ROCM_USE_AITER_FP8BMM: bool = True
|
||||
VLLM_ROCM_USE_AITER_FP4BMM: bool = True
|
||||
VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION: bool = False
|
||||
@@ -168,7 +172,7 @@ if TYPE_CHECKING:
|
||||
VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency", "masked_gemm"] = (
|
||||
"latency"
|
||||
)
|
||||
VLLM_FLASHINFER_ALLREDUCE_BACKEND: Literal["auto", "trtllm", "mnnvl"] = "auto"
|
||||
VLLM_FLASHINFER_ALLREDUCE_BACKEND: Literal["auto", "trtllm", "mnnvl"] = "trtllm"
|
||||
VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE: int = 394 * 1024 * 1024
|
||||
VLLM_XGRAMMAR_CACHE_MB: int = 0
|
||||
VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
|
||||
@@ -231,7 +235,7 @@ if TYPE_CHECKING:
|
||||
VLLM_USE_FBGEMM: bool = False
|
||||
VLLM_GC_DEBUG: str = ""
|
||||
VLLM_DEBUG_WORKSPACE: bool = False
|
||||
VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = False
|
||||
VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = True
|
||||
VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256
|
||||
VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary"
|
||||
VLLM_USE_V2_MODEL_RUNNER: bool = False
|
||||
@@ -243,17 +247,33 @@ if TYPE_CHECKING:
|
||||
VLLM_LORA_DISABLE_PDL: bool = False
|
||||
VLLM_ENABLE_CUDA_COMPATIBILITY: bool = False
|
||||
VLLM_CUDA_COMPATIBILITY_PATH: str | None = None
|
||||
VLLM_ELASTIC_EP_SCALE_UP_LAUNCH: bool = False
|
||||
VLLM_ELASTIC_EP_DRAIN_REQUESTS: bool = False
|
||||
VLLM_FLAT_LOGPROBS: bool = False
|
||||
# optional envs we add.
|
||||
VLLM_W8A8_MOE_USE_W4A8: bool = False
|
||||
VLLM_WNA16_MOE_USE_W4A8: bool = False
|
||||
VLLM_W8A8_FORMAT: str = "TN"
|
||||
VLLM_W4A8_FORMAT: str = "TN"
|
||||
VLLM_W4A8_VERSION: int = 2
|
||||
VLLM_MIX_QUANTIZATION_TYPE: str = ""
|
||||
VLLM_MLA_CUSTOMIZE: bool = True
|
||||
VLLM_USE_INT8_MLA: bool = False
|
||||
VLLM_W8A8_LINEAR_USE_W4A8: bool = False
|
||||
VLLM_FORCE_NCCL_COMM: bool =False
|
||||
VLLM_KV_DISABLE_CROSS_GROUP_SHARE: bool = False
|
||||
# support Iluvatar IxServer
|
||||
|
||||
VLLM_ATTN_OPT_LEVEL: int = 0
|
||||
VLLM_MOE_OPT_LEVEL: int = 0
|
||||
VLLM_LINEAR_OPT_LEVEL: int = 0
|
||||
VLLM_OPT_EXCLUDE_LAYERS: str = ""
|
||||
VLLM_LINEAR_SPECIFIED_LAYERS: str = ""
|
||||
VLLM_LINEAR_SPECIFIED_KEYS: str = ""
|
||||
VLLM_LINEAR_SPECIFIED_OPT_LEVEL: int = 0
|
||||
|
||||
VLLM_USE_LORA_FUSION: bool = False
|
||||
|
||||
VLLM_USE_SILU_QUANT_FUSION: bool = False
|
||||
# static quant for attention
|
||||
VLLM_ATTN_STATIC_QUANT_SCALE_FILE_PATH: str = ""
|
||||
|
||||
def get_default_cache_root():
|
||||
return os.getenv(
|
||||
@@ -478,6 +498,8 @@ def get_env_or_set_default(
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
IGNORED_UNKNOWN_VARS = {"VLLM_ENFORCE_CUDA_GRAPH"}
|
||||
|
||||
environment_variables: dict[str, Callable[[], Any]] = {
|
||||
# ================== Installation Time Env Vars ==================
|
||||
# Target device of vLLM, supporting [cuda (by default),
|
||||
@@ -648,6 +670,22 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False"
|
||||
).lower()
|
||||
== "true",
|
||||
# When set to 1, scheduler.schedule() delegates to schedule_opt()
|
||||
# for PP mix ILU scheduling.
|
||||
"VLLM_ENABLE_PP_MIX_ILU_SCHEDULING": lambda: os.environ.get(
|
||||
"VLLM_ENABLE_PP_MIX_ILU_SCHEDULING", "0"
|
||||
) == "1",
|
||||
# When set to 1, use step_with_batch_queue_ilu_opt (async batch queue with
|
||||
# background thread). Batch queue size can be controlled via
|
||||
# VLLM_PP_ILU_OPT_BATCH_QUEUE_SIZE.
|
||||
"VLLM_ENABLE_PP_ILU_OPT": lambda: os.environ.get(
|
||||
"VLLM_ENABLE_PP_ILU_OPT", "0"
|
||||
) == "1",
|
||||
# Batch queue size used when VLLM_ENABLE_PP_ILU_OPT=1.
|
||||
# If <= 0, EngineCore falls back to base_batch_queue_size * 2.
|
||||
"VLLM_PP_ILU_OPT_BATCH_QUEUE_SIZE": lambda: int(
|
||||
os.environ.get("VLLM_PP_ILU_OPT_BATCH_QUEUE_SIZE", "0")
|
||||
),
|
||||
# S3 access information, used for tensorizer to load model from S3
|
||||
"S3_ACCESS_KEY_ID": lambda: os.environ.get("S3_ACCESS_KEY_ID", None),
|
||||
"S3_SECRET_ACCESS_KEY": lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
|
||||
@@ -862,7 +900,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
),
|
||||
# Time in ms for the zmq client to wait for a response from the backend
|
||||
# server for simple data operations
|
||||
"VLLM_RPC_TIMEOUT": lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),
|
||||
"VLLM_RPC_TIMEOUT": lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "1000000")),
|
||||
# Timeout in seconds for keeping HTTP connections alive in API server
|
||||
"VLLM_HTTP_TIMEOUT_KEEP_ALIVE": lambda: int(
|
||||
os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")
|
||||
@@ -907,6 +945,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_DISABLED_KERNELS": lambda: []
|
||||
if "VLLM_DISABLED_KERNELS" not in os.environ
|
||||
else os.environ["VLLM_DISABLED_KERNELS"].split(","),
|
||||
"VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE": lambda: bool(
|
||||
int(os.getenv("VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE", "1"))
|
||||
),
|
||||
# Disable pynccl (using torch.distributed instead)
|
||||
"VLLM_DISABLE_PYNCCL": lambda: (
|
||||
os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")
|
||||
@@ -957,9 +998,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
os.getenv("VLLM_ROCM_USE_AITER_FP4_ASM_GEMM", "False").lower() in ("true", "1")
|
||||
),
|
||||
# Whether to use aiter rope.
|
||||
# By default is enabled.
|
||||
# By default is disabled.
|
||||
"VLLM_ROCM_USE_AITER_TRITON_ROPE": lambda: (
|
||||
os.getenv("VLLM_ROCM_USE_AITER_TRITON_ROPE", "True").lower() in ("true", "1")
|
||||
os.getenv("VLLM_ROCM_USE_AITER_TRITON_ROPE", "False").lower() in ("true", "1")
|
||||
),
|
||||
# Whether to use aiter triton fp8 bmm kernel
|
||||
# By default is enabled.
|
||||
@@ -1305,9 +1346,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
# Flashinfer fused allreduce backend.
|
||||
# "auto" will default to "mnnvl", which performs mostly same/better than "trtllm".
|
||||
# But "mnnvl" backend does not support fuse with quantization.
|
||||
# TODO: Default is "trtllm" right now because "mnnvl" has issues with cudagraph:
|
||||
# https://github.com/vllm-project/vllm/issues/35772
|
||||
# Should switch back to "auto" if the issue is resolved.
|
||||
"VLLM_FLASHINFER_ALLREDUCE_BACKEND": env_with_choices(
|
||||
"VLLM_FLASHINFER_ALLREDUCE_BACKEND",
|
||||
"auto",
|
||||
"trtllm",
|
||||
["auto", "trtllm", "mnnvl"],
|
||||
),
|
||||
# Control the workspace buffer size for the FlashInfer backend.
|
||||
@@ -1478,41 +1522,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
),
|
||||
# Allows vllm to find tuned config under customized folder
|
||||
"VLLM_TUNED_CONFIG_FOLDER": lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None),
|
||||
# vLLM do not support W4A8 and W8A8, we add it. For MOE, we default use W8A8, If set to true, we use W4A8.
|
||||
|
||||
"VLLM_W8A8_LINEAR_USE_W4A8":
|
||||
lambda: os.environ.get("VLLM_W8A8_LINEAR_USE_W4A8", "0").lower() in
|
||||
("1", "true"),
|
||||
|
||||
"VLLM_W8A8_MOE_USE_W4A8":
|
||||
lambda: os.environ.get("VLLM_W8A8_MOE_USE_W4A8", "0").lower() in
|
||||
("1", "true"),
|
||||
|
||||
"VLLM_FORCE_NCCL_COMM":
|
||||
lambda: os.environ.get("VLLM_FORCE_NCCL_COMM", "0").lower() in
|
||||
("1", "true"),
|
||||
# If set to true, we use int8 mla attention for decode stage.
|
||||
|
||||
"VLLM_USE_INT8_MLA":
|
||||
lambda: os.environ.get("VLLM_USE_INT8_MLA", "0").lower() in
|
||||
("1", "true"),
|
||||
|
||||
# For W4A8 MOE, we default use TN gemm format, choices: [TN, NN].
|
||||
"VLLM_W4A8_FORMAT":
|
||||
lambda: os.environ.get("VLLM_W4A8_FORMAT", "TN").upper(),
|
||||
|
||||
"VLLM_W4A8_VERSION":
|
||||
# For W4A8 MOE, we default use version 2, choices: [1, 2].
|
||||
lambda: int(os.environ.get("VLLM_W4A8_VERSION", "2")),
|
||||
|
||||
# temp param to support compressed-tensor's multi-quantization
|
||||
"VLLM_MIX_QUANTIZATION_TYPE":
|
||||
lambda: os.environ.get("VLLM_MIX_QUANTIZATION_TYPE", "").upper(),
|
||||
|
||||
# Use Customize mlp impl for faster speed and less gpu memory usage.
|
||||
"VLLM_MLA_CUSTOMIZE":
|
||||
lambda: os.environ.get("VLLM_MLA_CUSTOMIZE", "1").lower() in
|
||||
("1", "true"),
|
||||
# Valid values are container,code_interpreter,web_search_preview
|
||||
# ex VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS=container,code_interpreter
|
||||
# If the server_label of your mcp tool is not in this list it will
|
||||
@@ -1607,7 +1616,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_DEBUG_WORKSPACE": lambda: bool(int(os.getenv("VLLM_DEBUG_WORKSPACE", "0"))),
|
||||
# Disables parallel execution of shared_experts via separate cuda stream
|
||||
"VLLM_DISABLE_SHARED_EXPERTS_STREAM": lambda: bool(
|
||||
int(os.getenv("VLLM_DISABLE_SHARED_EXPERTS_STREAM", "0"))
|
||||
int(os.getenv("VLLM_DISABLE_SHARED_EXPERTS_STREAM", "1"))
|
||||
),
|
||||
# Limits when we run shared_experts in a separate stream.
|
||||
# We found out that for large batch sizes, the separate stream
|
||||
@@ -1662,10 +1671,93 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_CUDA_COMPATIBILITY_PATH": lambda: os.environ.get(
|
||||
"VLLM_CUDA_COMPATIBILITY_PATH", None
|
||||
),
|
||||
# control kv cache share cross group
|
||||
"VLLM_KV_DISABLE_CROSS_GROUP_SHARE":
|
||||
lambda: os.environ.get("VLLM_KV_DISABLE_CROSS_GROUP_SHARE", "0").lower() in
|
||||
("1", "true")
|
||||
# Whether it is a scale up launch engine for elastic EP,
|
||||
# Should only be set by EngineCoreClient.
|
||||
"VLLM_ELASTIC_EP_SCALE_UP_LAUNCH": lambda: bool(
|
||||
int(os.getenv("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH", "0"))
|
||||
),
|
||||
# Whether to wait for all requests to drain before sending the
|
||||
# scaling command in elastic EP.
|
||||
"VLLM_ELASTIC_EP_DRAIN_REQUESTS": lambda: bool(
|
||||
int(os.getenv("VLLM_ELASTIC_EP_DRAIN_REQUESTS", "0"))
|
||||
),
|
||||
# Flag to enable FlatLogprobs whose GC overhead is significantly smaller than
|
||||
# the original list[dict[int, Logprob]] approach.
|
||||
# Once enabled, PromptLogprobs and SampleLogprobs are populated as
|
||||
# FlatLogprobs.
|
||||
"VLLM_FLAT_LOGPROBS": lambda: bool(int(os.getenv("VLLM_FLAT_LOGPROBS", "0"))),
|
||||
|
||||
# vLLM does not support W4A8 and W8A8, so we add them. For MOE, W8A8 is used by default; if set to true, W4A8 is used instead.
|
||||
|
||||
"VLLM_W8A8_MOE_USE_W4A8":
|
||||
lambda: os.environ.get("VLLM_W8A8_MOE_USE_W4A8", "0").lower() in
|
||||
("1", "true"),
|
||||
|
||||
"VLLM_WNA16_MOE_USE_W4A8":
|
||||
lambda: os.environ.get("VLLM_WNA16_MOE_USE_W4A8", "0").lower() in
|
||||
("1", "true"),
|
||||
|
||||
# If set to true, we use int8 mla attention for decode stage.
|
||||
|
||||
"VLLM_USE_INT8_MLA":
|
||||
lambda: os.environ.get("VLLM_USE_INT8_MLA", "0").lower() in
|
||||
("1", "true"),
|
||||
|
||||
# For attn, 0 for f16qkv, 1 for i8qkv, 2 for i8qkf16v
|
||||
"VLLM_ATTN_OPT_LEVEL":
|
||||
lambda: int(os.environ.get("VLLM_ATTN_OPT_LEVEL", "0")),
|
||||
|
||||
# For W8A8 MOE, the default GEMM format is TN; choices: [TN, NN]. However, GEMV (CUDA Graph) only supports NN.
|
||||
"VLLM_W8A8_FORMAT":
|
||||
lambda: os.environ.get("VLLM_W8A8_FORMAT", "TN").upper(),
|
||||
|
||||
# For W4A8 MOE, the default GEMM format is TN; choices: [TN, NN].
|
||||
"VLLM_W4A8_FORMAT":
|
||||
lambda: os.environ.get("VLLM_W4A8_FORMAT", "TN").upper(),
|
||||
|
||||
"VLLM_W4A8_VERSION":
|
||||
# For W4A8 MOE, the default version is 2; choices: [1, 2].
|
||||
lambda: int(os.environ.get("VLLM_W4A8_VERSION", "2")),
|
||||
|
||||
# temp param to support compressed-tensor's multi-quantization
|
||||
"VLLM_MIX_QUANTIZATION_TYPE":
|
||||
lambda: os.environ.get("VLLM_MIX_QUANTIZATION_TYPE", "").upper(),
|
||||
|
||||
# Use the customized mlp implementation for faster speed and lower GPU memory usage. NOTE(review): the variable is named MLA - confirm whether "mlp" is a typo.
|
||||
"VLLM_MLA_CUSTOMIZE":
|
||||
lambda: os.environ.get("VLLM_MLA_CUSTOMIZE", "1").lower() in
|
||||
("1", "true"),
|
||||
|
||||
# support Iluvatar IxServer
|
||||
# Does vLLM support Iluvatar IxServer which is a distributed inference framework.
|
||||
"VLLM_MOE_OPT_LEVEL":
|
||||
lambda: int(os.getenv("VLLM_MOE_OPT_LEVEL", "0")),
|
||||
|
||||
"VLLM_LINEAR_OPT_LEVEL":
|
||||
lambda: int(os.getenv("VLLM_LINEAR_OPT_LEVEL", "0")),
|
||||
|
||||
"VLLM_OPT_EXCLUDE_LAYERS":
|
||||
lambda: os.environ.get("VLLM_OPT_EXCLUDE_LAYERS", "").upper(),
|
||||
|
||||
"VLLM_LINEAR_SPECIFIED_LAYERS":
|
||||
lambda: os.environ.get("VLLM_LINEAR_SPECIFIED_LAYERS", "").upper(),
|
||||
|
||||
"VLLM_LINEAR_SPECIFIED_KEYS":
|
||||
lambda: os.environ.get("VLLM_LINEAR_SPECIFIED_KEYS", "").lower(),
|
||||
|
||||
"VLLM_LINEAR_SPECIFIED_OPT_LEVEL":
|
||||
lambda: int(os.getenv("VLLM_LINEAR_SPECIFIED_OPT_LEVEL", "0")),
|
||||
|
||||
"VLLM_USE_LORA_FUSION":
|
||||
lambda: os.environ.get("VLLM_USE_LORA_FUSION", "0").lower() in
|
||||
("1", "true"),
|
||||
|
||||
"VLLM_USE_SILU_QUANT_FUSION":
|
||||
lambda: os.environ.get("VLLM_USE_SILU_QUANT_FUSION", "0").lower() in
|
||||
("1", "true"),
|
||||
|
||||
"VLLM_ATTN_STATIC_QUANT_SCALE_FILE_PATH":
|
||||
lambda: os.environ.get("VLLM_ATTN_STATIC_QUANT_SCALE_FILE_PATH", ""),
|
||||
}
|
||||
|
||||
|
||||
@@ -1738,6 +1830,8 @@ def is_set(name: str):
|
||||
def validate_environ(hard_fail: bool) -> None:
|
||||
for env in os.environ:
|
||||
if env.startswith("VLLM_") and env not in environment_variables:
|
||||
if env in IGNORED_UNKNOWN_VARS:
|
||||
continue
|
||||
if hard_fail:
|
||||
raise ValueError(f"Unknown vLLM environment variable detected: {env}")
|
||||
else:
|
||||
@@ -1801,10 +1895,7 @@ def compile_factors() -> dict[str, object]:
|
||||
"VLLM_ENABLE_V1_MULTIPROCESSING",
|
||||
"VLLM_V1_OUTPUT_PROC_CHUNK_SIZE",
|
||||
"VLLM_CPU_KVCACHE_SPACE",
|
||||
"VLLM_CPU_OMP_THREADS_BIND",
|
||||
"VLLM_CPU_NUM_OF_RESERVED_CPU",
|
||||
"VLLM_CPU_MOE_PREPACK",
|
||||
"VLLM_CPU_SGL_KERNEL",
|
||||
"VLLM_TEST_FORCE_LOAD_FORMAT",
|
||||
"VLLM_ENABLE_CUDA_COMPATIBILITY",
|
||||
"VLLM_CUDA_COMPATIBILITY_PATH",
|
||||
|
||||
Reference in New Issue
Block a user