[gpt-oss] Add gpt-oss mxfp4 support
This commit is contained in:
32
vllm/envs.py
32
vllm/envs.py
@@ -112,8 +112,10 @@ if TYPE_CHECKING:
|
||||
VLLM_DP_SIZE: int = 1
|
||||
VLLM_DP_MASTER_IP: str = ""
|
||||
VLLM_DP_MASTER_PORT: int = 0
|
||||
VLLM_MOE_DP_CHUNK_SIZE: int = 256
|
||||
VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
|
||||
VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
|
||||
VLLM_MXFP4_USE_MARLIN: Optional[bool] = None
|
||||
VLLM_V0_USE_OUTLINES_CACHE: bool = False
|
||||
VLLM_TPU_BUCKET_PADDING_GAP: int = 0
|
||||
VLLM_USE_DEEP_GEMM: bool = False
|
||||
@@ -128,6 +130,8 @@ if TYPE_CHECKING:
|
||||
VLLM_SLEEP_WHEN_IDLE: bool = False
|
||||
VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
|
||||
MACA_VLLM_USE_TN_2_NN: bool = True
|
||||
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
|
||||
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
|
||||
|
||||
def get_default_cache_root():
|
||||
return os.getenv(
|
||||
@@ -149,6 +153,12 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
|
||||
return int(value)
|
||||
|
||||
|
||||
def maybe_convert_bool(value: Optional[str]) -> Optional[bool]:
    """Convert an optional integer-like string into an optional bool.

    Args:
        value: ``None``, or a string that ``int()`` can parse
            (for example ``"0"`` or ``"1"``).

    Returns:
        ``None`` when ``value`` is ``None``; otherwise ``False`` if the
        string parses to zero and ``True`` for any other integer.

    Raises:
        ValueError: If ``value`` is a string that is not a valid integer
            (for example ``"true"`` or an empty string).
    """
    if value is None:
        # Preserve "unset" as a distinct third state for the caller.
        return None
    try:
        return bool(int(value))
    except ValueError as err:
        # Same exception type as before, but with a message that explains
        # what a boolean flag is expected to look like.
        raise ValueError(
            f"Cannot interpret {value!r} as a boolean flag; expected an "
            "integer string such as '0' or '1'.") from err
|
||||
|
||||
|
||||
def get_vllm_port() -> Optional[int]:
|
||||
"""Get the port from VLLM_PORT environment variable.
|
||||
|
||||
@@ -769,6 +779,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_DP_MASTER_IP":
|
||||
lambda: os.getenv("VLLM_DP_MASTER_IP", "127.0.0.1"),
|
||||
|
||||
# In the context of executing MoE models with Data-Parallel, Expert-Parallel
|
||||
# and Batched All-to-All dispatch/combine kernels, VLLM_MOE_DP_CHUNK_SIZE
|
||||
# dictates the quantum of tokens that can be dispatched from a DP
|
||||
# rank. All DP ranks process the activations in VLLM_MOE_DP_CHUNK_SIZE
|
||||
# units.
|
||||
"VLLM_MOE_DP_CHUNK_SIZE":
|
||||
lambda: int(os.getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")),
|
||||
|
||||
# Port of the master node in the data parallel setting
|
||||
"VLLM_DP_MASTER_PORT":
|
||||
lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")),
|
||||
@@ -794,6 +812,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_MARLIN_USE_ATOMIC_ADD":
|
||||
lambda: os.environ.get("VLLM_MARLIN_USE_ATOMIC_ADD", "0") == "1",
|
||||
|
||||
# Whether to use marlin kernel in mxfp4 quantization method
|
||||
"VLLM_MXFP4_USE_MARLIN":
|
||||
lambda: maybe_convert_bool(os.environ.get("VLLM_MXFP4_USE_MARLIN", None)),
|
||||
|
||||
# Whether to turn on the outlines cache for V0
|
||||
# This cache is unbounded and on disk, so it's not safe to use in
|
||||
# an environment with potentially malicious users.
|
||||
@@ -810,6 +832,16 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_USE_DEEP_GEMM":
|
||||
lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))),
|
||||
|
||||
# If set to 1, use the FlashInfer
|
||||
# MXFP8 (activation) x MXFP4 (weight) MoE backend.
|
||||
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8":
|
||||
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))),
|
||||
|
||||
# If set to 1, use the FlashInfer
|
||||
# BF16 (activation) x MXFP4 (weight) MoE backend.
|
||||
"VLLM_USE_FLASHINFER_MOE_MXFP4_BF16":
|
||||
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))),
|
||||
|
||||
# Control the cache size used by the xgrammar compiler. The default
|
||||
# of 512 MB should be enough for roughly 1000 JSON schemas.
|
||||
# It can be changed with this variable if needed for some reason.
|
||||
|
||||
Reference in New Issue
Block a user