Files
enginex-mlu590-vllm/vllm_mlu/_mlu_utils.py
2026-04-24 09:58:03 +08:00

107 lines
4.6 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
import os
import torch
import vllm.envs as envs
def _check_env(env, default=False):
    """Interpret the environment variable ``env`` as a boolean switch.

    Args:
        env: Name of the environment variable to look up.
        default: Value returned unchanged when ``env`` is not set.

    Returns:
        ``True`` when the variable is set to ``"true"`` (any letter case) or
        ``"1"``, ``False`` for any other set value, and ``default`` when the
        variable is not present in the environment.
    """
    raw = os.environ.get(env)
    if raw is None:
        return default
    return raw.lower() in ("true", "1")
def _check_env_value(env, default=0):
    """Read a non-negative integer from the environment variable ``env``.

    Args:
        env: Name of the environment variable to look up.
        default: Value returned unchanged when ``env`` is not set.

    Returns:
        The variable's value converted with ``int()``, or ``default`` when the
        variable is not present in the environment.

    Raises:
        ValueError: If the variable is set but does not consist solely of
            decimal digits (e.g. empty, signed, fractional or non-numeric).
    """
    raw = os.environ.get(env)
    if raw is None:
        return default
    # str.isdecimal() only admits decimal digit characters, which int() can
    # parse. str.isdigit() additionally lets through characters such as the
    # superscript "²", which int() rejects with an unrelated
    # "invalid literal" message instead of the error below.
    if not raw.isdecimal():
        raise ValueError(f"'{env}' should be set with integer")
    return int(raw)
def _check_env_float(env, default=0):
    """Read a floating-point number from the environment variable ``env``.

    Args:
        env: Name of the environment variable to look up.
        default: Value returned unchanged when ``env`` is not set.

    Returns:
        The variable's value converted with ``float()``, or ``default`` when
        the variable is not present in the environment.

    Raises:
        ValueError: If the variable is set but ``float()`` cannot parse it.
    """
    raw = os.environ.get(env)
    if raw is None:
        return default
    try:
        return float(raw)
    except ValueError:
        # Replace the generic parse error with one naming the variable.
        raise ValueError(f"'{env}' should be set with float")
# ---- Raw switches read from the environment --------------------------------

# VLLM_LATENCY_DEBUG: get more kernel info for benchmark latency.
VLLM_LATENCY_DEBUG = _check_env("VLLM_LATENCY_DEBUG", default=False)

# VLLM_LATENCY_DEBUG_NO_DEVICE: get more kernel info (without device) for
# benchmark latency.
VLLM_LATENCY_DEBUG_NO_DEVICE = _check_env("VLLM_LATENCY_DEBUG_NO_DEVICE", default=False)

# VLLM_DUMP_OUTPUTS: dump each layer's outputs when running vLLM inference.
VLLM_DUMP_OUTPUTS = _check_env("VLLM_DUMP_OUTPUTS", default=False)

# VLLM_DUMP_MLU_INFO: get device info when running vLLM inference.
VLLM_DUMP_MLU_INFO = _check_env("VLLM_DUMP_MLU_INFO", default=False)

# VLLM_DUMP_MLU_INFO_DEBUG: dump device debug info when running vLLM inference.
# This is the raw environment value; it is gated further down.
VLLM_DUMP_MLU_INFO_DEBUG = _check_env("VLLM_DUMP_MLU_INFO_DEBUG", default=False)

# VLLM_SCHEDULER_PROFILE: profile the vLLM scheduler.
VLLM_SCHEDULER_PROFILE = _check_env("VLLM_SCHEDULER_PROFILE", default=False)

# VLLM_GRAPH_DEBUG: debug the graph status when running the decoder.
# Defaults to True; set to False to disable warning messages.
VLLM_GRAPH_DEBUG = _check_env("VLLM_GRAPH_DEBUG", default=True)

# VLLM_RANDOM_MOE_EN: defaults to False. Enabling it also enables
# VLLM_AVG_MOE_EN below.
VLLM_RANDOM_MOE_EN = _check_env("VLLM_RANDOM_MOE_EN", default=False)

# VLLM_AVG_MOE_EN: balance the workload across MoE experts. Defaults to False.
VLLM_AVG_MOE_EN = _check_env("VLLM_AVG_MOE_EN", default=False) or VLLM_RANDOM_MOE_EN

# VLLM_LOGITS_USE_ALL_GATHER: use all-gather for logits collection.
# Defaults to False.
VLLM_LOGITS_USE_ALL_GATHER = _check_env("VLLM_LOGITS_USE_ALL_GATHER", default=False)

# ---- Flags derived from the switches above ---------------------------------

# Latency debugging is on if either latency switch is set.
VLLM_LATENCY_DEBUG_EN = VLLM_LATENCY_DEBUG or VLLM_LATENCY_DEBUG_NO_DEVICE

# Latency debugging with device info: the NO_DEVICE switch overrides it.
VLLM_LATENCY_DEBUG_WITH_DEVICE_EN = VLLM_LATENCY_DEBUG and not VLLM_LATENCY_DEBUG_NO_DEVICE

# Device info is only dumped while with-device latency debugging is active.
VLLM_DUMP_MLU_INFO_EN = VLLM_LATENCY_DEBUG_WITH_DEVICE_EN and VLLM_DUMP_MLU_INFO

# The raw value read above is overwritten here: device debug info additionally
# requires VLLM_DUMP_MLU_INFO_EN.
VLLM_DUMP_MLU_INFO_DEBUG = VLLM_DUMP_MLU_INFO_DEBUG and VLLM_DUMP_MLU_INFO_EN
# ---- v1 scheduler / graph options -------------------------------------------

# VLLM_V1_USE_UNCHUNK_SCHED: v1 uses the unchunk scheduler. Defaults to True.
VLLM_V1_USE_UNCHUNK_SCHED = _check_env("VLLM_V1_USE_UNCHUNK_SCHED", default=True)

# VLLM_V1_MIN_PREFILL_BATCH: the minimum scheduling batch in v1. Defaults to 1.
VLLM_V1_MIN_PREFILL_BATCH = _check_env_value("VLLM_V1_MIN_PREFILL_BATCH", default=1)

# VLLM_V1_USE_FULL_GRAPH: v1 uses full graph capture. Defaults to True.
VLLM_V1_USE_FULL_GRAPH = _check_env("VLLM_V1_USE_FULL_GRAPH", default=True)

# VLLM_V1_BENCHMARK: v1 benchmark. Defaults to False.
VLLM_V1_BENCHMARK = _check_env("VLLM_V1_BENCHMARK", default=False)

# ---- MTP options --------------------------------------------------------------

# VLLM_MTP_DEBUG: show the MTP accepted rate. Defaults to False.
VLLM_MTP_DEBUG = _check_env("VLLM_MTP_DEBUG", default=False)

# VLLM_MTP_NO_QUANT: MTP uses the original dtype and quant_config is None.
# Defaults to False. (An identical second definition of this flag was removed;
# the resulting module state is unchanged.)
VLLM_MTP_NO_QUANT = _check_env("VLLM_MTP_NO_QUANT", default=False)

# VLLM_MTP_FIXED_ACCEPTANCE_RATE: use a fixed acceptance rate, parsed as a
# float. Defaults to None when the variable is not set.
VLLM_MTP_FIXED_ACCEPTANCE_RATE = _check_env_float("VLLM_MTP_FIXED_ACCEPTANCE_RATE", default=None)

# ---- Miscellaneous ------------------------------------------------------------

# VLLM_V1_UNCHUNK_SCHED_LOG: print the v1 unchunk scheduler state.
# Defaults to False.
VLLM_V1_UNCHUNK_SCHED_LOG = _check_env("VLLM_V1_UNCHUNK_SCHED_LOG", default=False)

# VLLM_MOE_PREFILL_CHUNK_SIZE: chunk size in number of tokens; enabled when > 0.
# Defaults to 0.
VLLM_MOE_PREFILL_CHUNK_SIZE = _check_env_value("VLLM_MOE_PREFILL_CHUNK_SIZE", default=0)

# VLLM_CI_ACCURACY_TEST: CI accuracy test. Defaults to False.
VLLM_CI_ACCURACY_TEST = _check_env("VLLM_CI_ACCURACY_TEST", default=False)

# ---- Disaggregated serving ----------------------------------------------------

# VLLM_DISAGG_TRANS_ALL_BLOCKS: optimize the performance of disagg.
# Defaults to True.
VLLM_DISAGG_TRANS_ALL_BLOCKS = _check_env("VLLM_DISAGG_TRANS_ALL_BLOCKS", default=True)

# vLLM disagg debug switches. All default to False.
VLLM_DISAGG_CNPX_EXECUTE = _check_env("VLLM_DISAGG_CNPX_EXECUTE", default=False)
VLLM_DISAGG_CNPX_REQUEST = _check_env("VLLM_DISAGG_CNPX_REQUEST", default=False)
VLLM_DISAGG_FAKE_DECODER = _check_env("VLLM_DISAGG_FAKE_DECODER", default=False)