init v0.11.0rc0
This commit is contained in:
@@ -131,6 +131,26 @@ env_variables: Dict[str, Callable[[], Any]] = {
|
||||
# this feature is supported in A2, and eager mode will get better performance.
|
||||
"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE":
|
||||
lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE", '0'))),
|
||||
# Whether to enable FlashComm optimization when tensor parallel is enabled.
|
||||
# This feature will get better performance when concurrency is large.
|
||||
"VLLM_ASCEND_ENABLE_FLASHCOMM":
|
||||
lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_FLASHCOMM", '0'))),
|
||||
# Whether to enable MLP weight prefetch, only used in small concurrency.
|
||||
"VLLM_ASCEND_ENABLE_PREFETCH_MLP":
|
||||
lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_PREFETCH_MLP", '0'))),
|
||||
# buffer size for gate up prefetch
|
||||
"VLLM_ASCEND_MLP_GATE_UP_PREFETCH_SIZE":
|
||||
lambda: int(
|
||||
os.getenv("VLLM_ASCEND_MLP_GATE_UP_PREFETCH_SIZE", 18 * 1024 * 1024)),
|
||||
# buffer size for down proj prefetch
|
||||
"VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE":
|
||||
lambda: int(
|
||||
os.getenv("VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE", 18 * 1024 * 1024)),
|
||||
# Whether to enable dense model and general optimizations for better performance.
|
||||
# Since we modified the base parent class `linear`, this optimization is also applicable to other model types.
|
||||
# However, there might be hidden issues, and it is currently recommended to prioritize its use with dense models.
|
||||
"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE":
|
||||
lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE", '0'))),
|
||||
# Whether to enable mlp optimize when tensor parallel is enabled.
|
||||
# this feature in eager mode will get better performance.
|
||||
"VLLM_ASCEND_ENABLE_MLP_OPTIMIZE":
|
||||
@@ -139,11 +159,16 @@ env_variables: Dict[str, Callable[[], Any]] = {
|
||||
# caused by the initialization of the Mooncake connector.
|
||||
"PHYSICAL_DEVICES":
|
||||
lambda: os.getenv("PHYSICAL_DEVICES", None),
|
||||
# Whether to enable msMonitor tool to monitor the performance of vllm-ascend.
|
||||
"MSMONITOR_USE_DAEMON":
|
||||
lambda: bool(int(os.getenv("MSMONITOR_USE_DAEMON", '0'))),
|
||||
# Timeout (in seconds) for delayed KVCache block release. In the prefill
|
||||
# node, if a request is marked for delayed KV block release and the blocks
|
||||
# are not freed within this timeout, they will be forcibly released.
|
||||
"VLLM_ASCEND_KVCACHE_DELAY_FREE_TIMEOUT":
|
||||
lambda: int(os.getenv("VLLM_ASCEND_KVCACHE_DELAY_FREE_TIMEOUT", 250)),
|
||||
"VLLM_ASCEND_ENABLE_MLAPO":
|
||||
lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLAPO", '0'))),
|
||||
}
|
||||
|
||||
# end-env-vars-definition
|
||||
@@ -157,4 +182,4 @@ def __getattr__(name: str):
|
||||
|
||||
|
||||
def __dir__():
|
||||
return list(env_variables.keys())
|
||||
return list(env_variables.keys())
|
||||
Reference in New Issue
Block a user