From ae394767d4354d02c3fd62754d38b4e555710557 Mon Sep 17 00:00:00 2001 From: fems14 <74094523+fems14@users.noreply.github.com> Date: Thu, 5 Mar 2026 21:04:11 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90main=E3=80=91ADXL/HIXL=20supports=20Fa?= =?UTF-8?q?bricMem=20Mode=20(#6806)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.15.0 - vLLM main: https://github.com/vllm-project/vllm/commit/83b47f67b1dfad505606070ae4d9f83e50ad4ebd --------- Signed-off-by: fems14 <1804143737@qq.com> --- .../pd_colocated_mooncake_multi_instance.md | 2 +- .../pd_disaggregation_mooncake_multi_node.md | 2 +- .../pd_disaggregation_mooncake_single_node.md | 2 +- .../user_guide/feature_guide/kv_pool.md | 11 +++- .../ascend_store/backend/mooncake_backend.py | 62 ++++++++++--------- .../kv_pool/ascend_store/pool_worker.py | 7 --- 6 files changed, 46 insertions(+), 40 deletions(-) diff --git a/docs/source/tutorials/features/pd_colocated_mooncake_multi_instance.md b/docs/source/tutorials/features/pd_colocated_mooncake_multi_instance.md index 9f76687a..e993d4c4 100644 --- a/docs/source/tutorials/features/pd_colocated_mooncake_multi_instance.md +++ b/docs/source/tutorials/features/pd_colocated_mooncake_multi_instance.md @@ -121,7 +121,7 @@ Moonshot AI. 
Installation and compilation guide: First, obtain the Mooncake project using the following command: ```bash -git clone -b v0.3.8.post1 --depth 1 https://github.com/kvcache-ai/Mooncake.git +git clone -b v0.3.9 --depth 1 https://github.com/kvcache-ai/Mooncake.git cd Mooncake git submodule update --init --recursive ``` diff --git a/docs/source/tutorials/features/pd_disaggregation_mooncake_multi_node.md b/docs/source/tutorials/features/pd_disaggregation_mooncake_multi_node.md index 2683d988..71b570ab 100644 --- a/docs/source/tutorials/features/pd_disaggregation_mooncake_multi_node.md +++ b/docs/source/tutorials/features/pd_disaggregation_mooncake_multi_node.md @@ -177,7 +177,7 @@ Mooncake is the serving platform for Kimi, a leading LLM service provided by Moo First, we need to obtain the Mooncake project. Refer to the following command: ```shell -git clone -b v0.3.8.post1 --depth 1 https://github.com/kvcache-ai/Mooncake.git +git clone -b v0.3.9 --depth 1 https://github.com/kvcache-ai/Mooncake.git ``` (Optional) Replace go install url if the network is poor diff --git a/docs/source/tutorials/features/pd_disaggregation_mooncake_single_node.md b/docs/source/tutorials/features/pd_disaggregation_mooncake_single_node.md index e8a49f6f..6e25d3b1 100644 --- a/docs/source/tutorials/features/pd_disaggregation_mooncake_single_node.md +++ b/docs/source/tutorials/features/pd_disaggregation_mooncake_single_node.md @@ -98,7 +98,7 @@ Mooncake is the serving platform for Kimi, a leading LLM service provided by Moo First, we need to obtain the Mooncake project. Refer to the following command: ```shell -git clone -b v0.3.8.post1 --depth 1 https://github.com/kvcache-ai/Mooncake.git +git clone -b v0.3.9 --depth 1 https://github.com/kvcache-ai/Mooncake.git ``` (Optional) Replace go install url if the network is poor. 
diff --git a/docs/source/user_guide/feature_guide/kv_pool.md b/docs/source/user_guide/feature_guide/kv_pool.md index 2c630e7f..a6a49c16 100644 --- a/docs/source/user_guide/feature_guide/kv_pool.md +++ b/docs/source/user_guide/feature_guide/kv_pool.md @@ -42,7 +42,7 @@ export PYTHONHASHSEED=0 First, we need to obtain the Mooncake project. Refer to the following command: ```shell - git clone -b v0.3.7.post2 --depth 1 https://github.com/kvcache-ai/Mooncake.git + git clone -b v0.3.9 --depth 1 https://github.com/kvcache-ai/Mooncake.git ``` (Optional) Replace go install url if the network is poor @@ -85,6 +85,15 @@ export PYTHONHASHSEED=0 export LD_LIBRARY_PATH=/usr/local/lib64/python3.11/site-packages/mooncake:$LD_LIBRARY_PATH ``` +### Environment Variables Description + +`export ASCEND_ENABLE_USE_FABRIC_MEM=1`: Enables the unified memory address direct transmission scheme and can only be used for the 800 I/T A3 series. Required supporting hardware versions are as follows: + + HDK >= 26.0 + CANN >= 9.0 + +`export ASCEND_BUFFER_POOL=4:8`: ASCEND_BUFFER_POOL is the environment variable for configuring the number and size of buffers on the NPU device for aggregation and KV transfer; the value 4:8 means we allocate 4 buffers of size 8MB. It can only be used for the 800 I/T A2 series. 
+ ### Run Mooncake Master #### 1.Configure mooncake.json diff --git a/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/backend/mooncake_backend.py b/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/backend/mooncake_backend.py index 11674b92..a3136760 100644 --- a/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/backend/mooncake_backend.py +++ b/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/backend/mooncake_backend.py @@ -7,16 +7,14 @@ from dataclasses import dataclass import torch # Third Party -from mooncake.store import ReplicateConfig # type: ignore from vllm.config import ParallelConfig from vllm.logger import logger from vllm.utils.network_utils import get_ip from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.backend.backend import Backend from vllm_ascend.distributed.kv_transfer.utils.mooncake_transfer_engine import global_te -from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type -DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200 # 3.125 GiB +DEFAULT_GLOBAL_SEGMENT_SIZE = 1073741824 # 1.0 GiB DEFAULT_LOCAL_BUFFER_SIZE = 1073741824 # 1.0 GiB @@ -35,18 +33,34 @@ class MooncakeBackend(Backend): self.rank = parallel_config.rank if self.config.protocol == "ascend": local_hostname = get_ip() - transfer_engine = global_te.get_transfer_engine(local_hostname, device_name=None) - self.local_seg = local_hostname + ":" + str(transfer_engine.get_rpc_port()) - ret = self.store.setup( - self.local_seg, - self.config.metadata_server, - self.config.global_segment_size, - self.config.local_buffer_size, - self.config.protocol, - self.config.device_name, - self.config.master_server_address, - transfer_engine.get_engine(), - ) + # ASCEND_ENABLE_USE_FABRIC_MEM: Enables the unified memory address direct transmission scheme + # and can only be used for the 800 I/T A3 series. 
+ # Required supporting hardware versions are as follows: + if os.getenv("ASCEND_ENABLE_USE_FABRIC_MEM", "0") != "1": + transfer_engine = global_te.get_transfer_engine(local_hostname, device_name=None) + self.local_seg = local_hostname + ":" + str(transfer_engine.get_rpc_port()) + ret = self.store.setup( + self.local_seg, + self.config.metadata_server, + self.config.global_segment_size, + self.config.local_buffer_size, + self.config.protocol, + self.config.device_name, + self.config.master_server_address, + transfer_engine.get_engine(), + ) + else: + self.local_seg = local_hostname + ret = self.store.setup( + self.local_seg, + self.config.metadata_server, + self.config.global_segment_size, + 0, + self.config.protocol, + self.config.device_name, + self.config.master_server_address, + ) + if ret != 0: msg = "Initialize mooncake failed." logger.error(msg) @@ -57,21 +71,15 @@ class MooncakeBackend(Backend): torch.npu.set_device(device) def register_buffer(self, ptrs: list[int], lengths: list[int]): - global_te.register_buffer(ptrs, lengths) + if os.getenv("ASCEND_ENABLE_USE_FABRIC_MEM", "0") != "1": + global_te.register_buffer(ptrs, lengths) def exists(self, keys: list[str]) -> list[int]: return self.store.batch_is_exist(keys) def put(self, keys: list[str], addrs: list[list[int]], sizes: list[list[int]]): try: - soc_version = get_ascend_device_type() - if soc_version in {AscendDeviceType.A2}: - res = self.store.batch_put_from_multi_buffers(keys, addrs, sizes) - else: - config = ReplicateConfig() - config.preferred_segment = self.local_seg - config.prefer_alloc_in_same_node = True - res = self.store.batch_put_from_multi_buffers(keys, addrs, sizes, config) + res = self.store.batch_put_from_multi_buffers(keys, addrs, sizes) for value in res: if value < 0: logger.error(f"Failed to put key {keys},res:{res}") @@ -80,11 +88,7 @@ class MooncakeBackend(Backend): def get(self, keys: list[str], addrs: list[list[int]], sizes: list[list[int]]): try: - soc_version = 
get_ascend_device_type() - if soc_version in {AscendDeviceType.A2}: - res = self.store.batch_get_into_multi_buffers(keys, addrs, sizes) - else: - res = self.store.batch_get_into_multi_buffers(keys, addrs, sizes, True) + res = self.store.batch_get_into_multi_buffers(keys, addrs, sizes) for value in res: if value < 0: logger.error(f"Failed to get key {keys}, res:{res}") diff --git a/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/pool_worker.py b/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/pool_worker.py index 7dadd5ae..4e6cb0a5 100644 --- a/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/pool_worker.py +++ b/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/pool_worker.py @@ -30,7 +30,6 @@ from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.kv_transfer import KVCacheStoreSendingThread, KVTransferThread, ) -from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type backend_map = { "mooncake": { @@ -98,12 +97,6 @@ class KVPoolWorker: self.head_or_tp_rank = self.tp_rank self.put_step = 1 - soc_version = get_ascend_device_type() - # be removed later - if self.backend == "mooncake" and soc_version in {AscendDeviceType.A3}: - self.head_or_tp_rank = self.tp_rank - self.put_step = 1 - self.metadata = KeyMetadata( model_config.model.rstrip("/").split("/")[-1], self.head_or_tp_rank,