2025-04-15 16:09:36 +08:00
|
|
|
#
|
|
|
|
|
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
|
|
|
|
# This file is a part of the vllm-ascend project.
|
|
|
|
|
#
|
|
|
|
|
# This file is mainly Adapted from vllm-project/vllm/vllm/envs.py
|
|
|
|
|
# Copyright 2023 The vLLM team.
|
|
|
|
|
#
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
#
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
#
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
#
|
|
|
|
|
|
2025-04-03 14:52:34 +08:00
|
|
|
import os
|
|
|
|
|
from typing import Any, Callable, Dict
|
|
|
|
|
|
2025-04-15 16:09:36 +08:00
|
|
|
# The begin-* and end-* markers here are used by the documentation generator
|
|
|
|
|
# to extract the used env vars.
|
|
|
|
|
|
|
|
|
|
# begin-env-vars-definition
|
|
|
|
|
|
2025-04-03 14:52:34 +08:00
|
|
|
# Registry of the environment variables recognised by this module.
#
# Each key is the name of an environment variable and each value is a
# zero-argument callable that reads the variable from the process environment
# at call time and returns the parsed value. Nothing is read when this module
# is imported.
#
# Boolean switches are parsed with ``bool(int(value))``: they expect an integer
# string such as "0" or "1", and any non-integer text (e.g. "true") raises
# ValueError when the accessor is called.
env_variables: Dict[str, Callable[[], Any]] = {
    # max compile thread number for package building. Usually, it is set to
    # the number of CPU cores. If not set, the default value is None, which
    # means all number of CPU cores will be used.
    "MAX_JOBS":
    lambda: os.getenv("MAX_JOBS"),
    # The build type of the package. It can be one of the following values:
    # Release, Debug, RelWithDebugInfo.
    # NOTE(review): this accessor returns None when the variable is unset; the
    # "Release" default is presumably applied by the build scripts - confirm.
    "CMAKE_BUILD_TYPE":
    lambda: os.getenv("CMAKE_BUILD_TYPE"),
    # Whether to compile custom kernels. If not set, the default value is True.
    # If set to False, the custom kernels will not be compiled. Please note that
    # the sleep mode feature will be disabled as well if custom kernels are not
    # compiled.
    "COMPILE_CUSTOM_KERNELS":
    lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))),
    # The CXX compiler used for compiling the package. If not set, the default
    # value is None, which means the system default CXX compiler will be used.
    "CXX_COMPILER":
    lambda: os.getenv("CXX_COMPILER"),
    # The C compiler used for compiling the package. If not set, the default
    # value is None, which means the system default C compiler will be used.
    "C_COMPILER":
    lambda: os.getenv("C_COMPILER"),
    # The version of the Ascend chip. If not set, the default value is
    # ASCEND910B1. It's used for package building. Please make sure that the
    # version is correct.
    "SOC_VERSION":
    lambda: os.getenv("SOC_VERSION", "ASCEND910B1"),
    # If set, vllm-ascend will print verbose logs during compilation
    "VERBOSE":
    lambda: bool(int(os.getenv('VERBOSE', '0'))),
    # The home path for CANN toolkit.
    # NOTE(review): this accessor returns None when the variable is unset; the
    # /usr/local/Ascend/ascend-toolkit/latest default is presumably applied by
    # the consumer of this value - confirm.
    "ASCEND_HOME_PATH":
    lambda: os.getenv("ASCEND_HOME_PATH"),
    # The path for HCCN Tool, the tool will be called by disaggregated prefilling
    # case. If not set, the default value is
    # /usr/local/Ascend/driver/tools/hccn_tool.
    "HCCN_PATH":
    lambda: os.getenv("HCCN_PATH", "/usr/local/Ascend/driver/tools/hccn_tool"),
    # The path for HCCL library, it's used by pyhccl communicator backend.
    # NOTE(review): this accessor returns None when the variable is unset; the
    # libhccl.so default is presumably applied by the consumer of this value -
    # confirm.
    "HCCL_SO_PATH":
    lambda: os.getenv("HCCL_SO_PATH"),
    # The prefill device id for disaggregated prefilling case.
    "PROMPT_DEVICE_ID":
    lambda: os.getenv("PROMPT_DEVICE_ID"),
    # The decode device id for disaggregated prefilling case.
    "DECODE_DEVICE_ID":
    lambda: os.getenv("DECODE_DEVICE_ID"),
    # The port number for llmdatadist communication. If not set, the default
    # value is 26000. The value is returned as a string, not an int.
    "LLMDATADIST_COMM_PORT":
    lambda: os.getenv("LLMDATADIST_COMM_PORT", "26000"),
    # The wait time for llmdatadist sync cache. If not set, the default value is
    # 5000ms. The value is returned as a string, not an int.
    "LLMDATADIST_SYNC_CACHE_WAIT_TIME":
    lambda: os.getenv("LLMDATADIST_SYNC_CACHE_WAIT_TIME", "5000"),
    # The version of vllm is installed. This value is used for developers who
    # installed vllm from source locally. In this case, the version of vllm is
    # usually changed. For example, if the version of vllm is "0.9.0", but when
    # it's installed from source, the version of vllm is usually set to "0.9.1".
    # In this case, developers need to set this value to "0.9.0" to make sure
    # that the correct package is installed.
    "VLLM_VERSION":
    lambda: os.getenv("VLLM_VERSION"),
    # Whether to enable the trace recompiles from pytorch.
    "VLLM_ASCEND_TRACE_RECOMPILES":
    lambda: bool(int(os.getenv("VLLM_ASCEND_TRACE_RECOMPILES", '0'))),
    # Whether to enable fused_experts_allgather_ep. MoeInitRoutingV3 and
    # GroupedMatmulFinalizeRouting operators are combined to implement EP.
    "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP":
    lambda: bool(int(os.getenv("VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP", '0'))
                 ),
    # Boolean switch for the DBO feature. If not set, the default value is
    # False.
    # NOTE(review): the acronym is not expanded anywhere in this file - please
    # document what DBO stands for.
    "VLLM_ASCEND_ENABLE_DBO":
    lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_DBO", '0'))),
    # Whether to enable the model execute time observe profile. Disable it when
    # running vllm ascend in production environment.
    "VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE":
    lambda: bool(int(os.getenv("VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE", '0'))
                 ),
    # MOE_ALL2ALL_BUFFER:
    #   0: default, normal init.
    #   1: enable moe_all2all_buffer.
    "MOE_ALL2ALL_BUFFER":
    lambda: bool(int(os.getenv("MOE_ALL2ALL_BUFFER", '0'))),
    # Some models are optimized by vllm ascend. While in some case, e.g. rlhf
    # training, the optimized model may not be suitable. In this case, set this
    # value to False to disable the optimized model.
    "USE_OPTIMIZED_MODEL":
    lambda: bool(int(os.getenv('USE_OPTIMIZED_MODEL', '1'))),
    # SELECT_GATING_TOPK_SOTFMAX_EXPERTS is the equivalent of select_experts in
    # non-quantized scenarios. In theory, it should have better performance
    # than select_experts. Subsequent versions will remove the
    # SELECT_GATING_TOPK_SOTFMAX_EXPERTS tag and use it as the default mode.
    # (The spelling "SOTFMAX" is the actual variable name; do not "fix" it
    # here without also migrating every reader of the variable.)
    "SELECT_GATING_TOPK_SOTFMAX_EXPERTS":
    lambda: bool(int(os.getenv("SELECT_GATING_TOPK_SOTFMAX_EXPERTS", '0'))),
    # The tolerance of the kv cache size, if the difference between the
    # actual kv cache size and the cached kv cache size is less than this value,
    # then the cached kv cache size will be used. If not set, the default value
    # is 64.
    "VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE":
    lambda: int(
        os.getenv("VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE", "64")),
}
|
|
|
|
|
|
2025-04-15 16:09:36 +08:00
|
|
|
# end-env-vars-definition
|
|
|
|
|
|
2025-04-03 14:52:34 +08:00
|
|
|
|
|
|
|
|
def __getattr__(name: str):
    """Module-level attribute hook that resolves environment variables lazily.

    Args:
        name: The attribute being looked up on this module.

    Returns:
        The result of calling the accessor registered under ``name`` in
        ``env_variables``; the environment is read at lookup time.

    Raises:
        AttributeError: If ``name`` is not a key of ``env_variables``.
    """
    getter = env_variables.get(name)
    if getter is None:
        # Unknown names must surface as AttributeError so that normal
        # attribute-lookup semantics (hasattr, getattr defaults) keep working.
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    return getter()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def __dir__():
    """Return the names of all environment variables registered in ``env_variables``."""
    # Iterating a dict yields its keys in insertion order.
    return list(env_variables)
|