init v0.11.0rc0

2025-10-14 10:38:28 +08:00
parent 67afd0ea78
commit 66dc16f966
278 changed files with 28130 additions and 11708 deletions
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -131,6 +131,26 @@ env_variables: Dict[str, Callable[[], Any]] = {
    # this feature is supported in A2, and eager mode will get better performance.
    "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE":
    lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE", '0'))),
+    # Whether to enable FlashComm optimization when tensor parallel is enabled.
+    # This feature will get better performance when concurrency is large.
+    "VLLM_ASCEND_ENABLE_FLASHCOMM":
+    lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_FLASHCOMM", '0'))),
+    # Whether to enable MLP weight prefetch, only used in small concurrency.
+    "VLLM_ASCEND_ENABLE_PREFETCH_MLP":
+    lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_PREFETCH_MLP", '0'))),
+    # buffer size for gate up prefetch
+    "VLLM_ASCEND_MLP_GATE_UP_PREFETCH_SIZE":
+    lambda: int(
+        os.getenv("VLLM_ASCEND_MLP_GATE_UP_PREFETCH_SIZE", 18 * 1024 * 1024)),
+    # buffer size for down proj prefetch
+    "VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE":
+    lambda: int(
+        os.getenv("VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE", 18 * 1024 * 1024)),
+    # Whether to enable dense model and general optimizations for better performance.
+    # Since we modified the base parent class `linear`, this optimization is also applicable to other model types.
+    # However, there might be hidden issues, and it is currently recommended to prioritize its use with dense models.
+    "VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE":
+    lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE", '0'))),
    # Whether to enable mlp optimize when tensor parallel is enabled.
    # this feature in eager mode will get better performance.
    "VLLM_ASCEND_ENABLE_MLP_OPTIMIZE":
@@ -139,11 +159,16 @@ env_variables: Dict[str, Callable[[], Any]] = {
    # caused by the initialization of the Mooncake connector.
    "PHYSICAL_DEVICES":
    lambda: os.getenv("PHYSICAL_DEVICES", None),
+    # Whether to enable msMonitor tool to monitor the performance of vllm-ascend.
+    "MSMONITOR_USE_DAEMON":
+    lambda: bool(int(os.getenv("MSMONITOR_USE_DAEMON", '0'))),
    # Timeout (in seconds) for delayed KVCache block release. In the prefill
    # node, if a request is marked for delayed KV block release and the blocks
    # are not freed within this timeout, they will be forcibly released.
    "VLLM_ASCEND_KVCACHE_DELAY_FREE_TIMEOUT":
    lambda: int(os.getenv("VLLM_ASCEND_KVCACHE_DELAY_FREE_TIMEOUT", 250)),
+    "VLLM_ASCEND_ENABLE_MLAPO":
+    lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLAPO", '0'))),
 }

 # end-env-vars-definition
@@ -157,4 +182,4 @@ def __getattr__(name: str):


 def __dir__():
-    return list(env_variables.keys())
+    return list(env_variables.keys())