107 lines
4.6 KiB
Python
107 lines
4.6 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
|
|
|
|
import os
|
|
import torch
|
|
import vllm.envs as envs
|
|
|
|
|
|
def _check_env(env, default=False):
|
|
if env in os.environ:
|
|
return os.environ[env].lower() in ["true", "1"]
|
|
return default
|
|
|
|
|
|
def _check_env_value(env, default=0):
|
|
if env in os.environ:
|
|
if not os.environ[env].isdigit():
|
|
raise ValueError(f"'{env}' should be set with integer")
|
|
value = int(os.environ[env])
|
|
return value
|
|
return default
|
|
|
|
|
|
def _check_env_float(env, default=0):
|
|
if env in os.environ:
|
|
try:
|
|
value = float(os.environ[env])
|
|
except ValueError:
|
|
raise ValueError(f"'{env}' should be set with float")
|
|
return value
|
|
return default
|
|
|
|
|
|
# VLLM_LATENCY_DEBUG: Get more kernel info for benchmark latency.
VLLM_LATENCY_DEBUG = _check_env("VLLM_LATENCY_DEBUG", default=False)

# VLLM_LATENCY_DEBUG_NO_DEVICE: Get more kernel info (without device) for benchmark latency.
VLLM_LATENCY_DEBUG_NO_DEVICE = _check_env("VLLM_LATENCY_DEBUG_NO_DEVICE", default=False)

# VLLM_DUMP_OUTPUTS: Dump each layer outputs when running vLLM inference.
VLLM_DUMP_OUTPUTS = _check_env("VLLM_DUMP_OUTPUTS", default=False)

# VLLM_DUMP_MLU_INFO: Get device info when running vLLM inference.
VLLM_DUMP_MLU_INFO = _check_env("VLLM_DUMP_MLU_INFO", default=False)

# VLLM_DUMP_MLU_INFO_DEBUG: Dump device debug info when running vLLM inference.
# NOTE: re-bound below — only effective when VLLM_DUMP_MLU_INFO_EN is True.
VLLM_DUMP_MLU_INFO_DEBUG = _check_env("VLLM_DUMP_MLU_INFO_DEBUG", default=False)

# VLLM_SCHEDULER_PROFILE: Profiling vLLM scheduler.
VLLM_SCHEDULER_PROFILE = _check_env("VLLM_SCHEDULER_PROFILE", default=False)

# VLLM_GRAPH_DEBUG: Debug the graph status when running decoder, default value is True.
# Set to False to disable warning messages.
VLLM_GRAPH_DEBUG = _check_env("VLLM_GRAPH_DEBUG", default=True)

# VLLM_AVG_MOE_EN: make moe experts workload balance, default value is False.
# NOTE(review): setting VLLM_RANDOM_MOE_EN also enables AVG_MOE here —
# presumably random expert assignment implies balancing; confirm intent.
VLLM_AVG_MOE_EN = _check_env("VLLM_AVG_MOE_EN", default=False) or _check_env("VLLM_RANDOM_MOE_EN", default=False)
VLLM_RANDOM_MOE_EN = _check_env("VLLM_RANDOM_MOE_EN", default=False)

# VLLM_LOGITS_USE_ALL_GATHER: use allgather for logits collection, default value is False.
VLLM_LOGITS_USE_ALL_GATHER = _check_env("VLLM_LOGITS_USE_ALL_GATHER", default=False)

# Derived flags:
# - latency debugging is active when either latency env var is set;
# - device-side timing only when VLLM_LATENCY_DEBUG is set and the
#   NO_DEVICE variant is not.
VLLM_LATENCY_DEBUG_EN = (VLLM_LATENCY_DEBUG or VLLM_LATENCY_DEBUG_NO_DEVICE)
VLLM_LATENCY_DEBUG_WITH_DEVICE_EN = (VLLM_LATENCY_DEBUG and not VLLM_LATENCY_DEBUG_NO_DEVICE)
# Device-info dump requires device-side latency debugging to be active.
VLLM_DUMP_MLU_INFO_EN = (VLLM_LATENCY_DEBUG_WITH_DEVICE_EN and VLLM_DUMP_MLU_INFO)
# Re-bind: the debug dump is honored only when the info dump itself is enabled.
VLLM_DUMP_MLU_INFO_DEBUG = (VLLM_DUMP_MLU_INFO_DEBUG and VLLM_DUMP_MLU_INFO_EN)
|
|
|
|
# VLLM_V1_USE_UNCHUNK_SCHED: v1 use unchunk scheduler, default value is True.
VLLM_V1_USE_UNCHUNK_SCHED = _check_env("VLLM_V1_USE_UNCHUNK_SCHED", default=True)

# VLLM_V1_MIN_PREFILL_BATCH: the min scheduling batch in v1, default is 1.
VLLM_V1_MIN_PREFILL_BATCH = _check_env_value("VLLM_V1_MIN_PREFILL_BATCH", default=1)

# VLLM_V1_USE_FULL_GRAPH: v1 use full graph capture, default value is True.
VLLM_V1_USE_FULL_GRAPH = _check_env("VLLM_V1_USE_FULL_GRAPH", default=True)

# VLLM_V1_BENCHMARK: v1 benchmark, default value is False.
VLLM_V1_BENCHMARK = _check_env("VLLM_V1_BENCHMARK", default=False)

# VLLM_MTP_DEBUG: use to show mtp accepted rate, default value is False.
VLLM_MTP_DEBUG = _check_env("VLLM_MTP_DEBUG", default=False)

# VLLM_MTP_NO_QUANT: mtp use origin dtype, quant_config use None.
# (A second, identical assignment of this flag was removed as a duplicate.)
VLLM_MTP_NO_QUANT = _check_env("VLLM_MTP_NO_QUANT", default=False)

# VLLM_MTP_FIXED_ACCEPTANCE_RATE: use fixed acceptance rate, default value is None.
VLLM_MTP_FIXED_ACCEPTANCE_RATE = _check_env_float("VLLM_MTP_FIXED_ACCEPTANCE_RATE", default=None)

# VLLM_V1_UNCHUNK_SCHED_LOG: print v1 unchunk scheduler state.
VLLM_V1_UNCHUNK_SCHED_LOG = _check_env("VLLM_V1_UNCHUNK_SCHED_LOG", default=False)

# VLLM_MOE_PREFILL_CHUNK_SIZE: in number of tokens. enabled when > 0.
VLLM_MOE_PREFILL_CHUNK_SIZE = _check_env_value("VLLM_MOE_PREFILL_CHUNK_SIZE", default=0)

# VLLM_CI_ACCURACY_TEST: CI accuracy test, default value is False.
VLLM_CI_ACCURACY_TEST = _check_env("VLLM_CI_ACCURACY_TEST", default=False)

# VLLM_DISAGG_TRANS_ALL_BLOCKS: optimize the performance of disagg.
VLLM_DISAGG_TRANS_ALL_BLOCKS = _check_env("VLLM_DISAGG_TRANS_ALL_BLOCKS", default=True)

# vLLM disagg debug flags.
VLLM_DISAGG_CNPX_EXECUTE = _check_env("VLLM_DISAGG_CNPX_EXECUTE", default=False)
VLLM_DISAGG_CNPX_REQUEST = _check_env("VLLM_DISAGG_CNPX_REQUEST", default=False)
VLLM_DISAGG_FAKE_DECODER = _check_env("VLLM_DISAGG_FAKE_DECODER", default=False)