【main】ADXL/HIXL supports FabricMem Mode (#6806)

### What this PR does / why we need it? ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.15.0 - vLLM main: 83b47f67b1 --------- Signed-off-by: fems14 <1804143737@qq.com>
2026-03-05 21:04:11 +08:00
parent 50441e4650
commit ae394767d4
6 changed files with 46 additions and 40 deletions
--- a/docs/source/tutorials/features/pd_colocated_mooncake_multi_instance.md
+++ b/docs/source/tutorials/features/pd_colocated_mooncake_multi_instance.md
@@ -121,7 +121,7 @@ Moonshot AI. Installation and compilation guide:
 First, obtain the Mooncake project using the following command:
 ```bash
-git clone -b v0.3.8.post1 --depth 1 https://github.com/kvcache-ai/Mooncake.git
+git clone -b v0.3.9 --depth 1 https://github.com/kvcache-ai/Mooncake.git
 cd Mooncake
 git submodule update --init --recursive
 ```
--- a/docs/source/tutorials/features/pd_disaggregation_mooncake_multi_node.md
+++ b/docs/source/tutorials/features/pd_disaggregation_mooncake_multi_node.md
@@ -177,7 +177,7 @@ Mooncake is the serving platform for Kimi, a leading LLM service provided by Moo
 First, we need to obtain the Mooncake project. Refer to the following command:
 ```shell
-git clone -b v0.3.8.post1 --depth 1 https://github.com/kvcache-ai/Mooncake.git
+git clone -b v0.3.9 --depth 1 https://github.com/kvcache-ai/Mooncake.git
 ```
 (Optional) Replace the Go install URL if the network is poor.
--- a/docs/source/tutorials/features/pd_disaggregation_mooncake_single_node.md
+++ b/docs/source/tutorials/features/pd_disaggregation_mooncake_single_node.md
@@ -98,7 +98,7 @@ Mooncake is the serving platform for Kimi, a leading LLM service provided by Moo
 First, we need to obtain the Mooncake project. Refer to the following command:
 ```shell
-git clone -b v0.3.8.post1 --depth 1 https://github.com/kvcache-ai/Mooncake.git
+git clone -b v0.3.9 --depth 1 https://github.com/kvcache-ai/Mooncake.git
 ```
 (Optional) Replace go install url if the network is poor.
--- a/docs/source/user_guide/feature_guide/kv_pool.md
+++ b/docs/source/user_guide/feature_guide/kv_pool.md
@@ -42,7 +42,7 @@ export PYTHONHASHSEED=0
        First, we need to obtain the Mooncake project. Refer to the following command:
        ```shell
-        git clone -b v0.3.7.post2 --depth 1 https://github.com/kvcache-ai/Mooncake.git
+        git clone -b v0.3.9 --depth 1 https://github.com/kvcache-ai/Mooncake.git
        ```
        (Optional) Replace the Go install URL if the network is poor.
@@ -85,6 +85,15 @@ export PYTHONHASHSEED=0
        export LD_LIBRARY_PATH=/usr/local/lib64/python3.11/site-packages/mooncake:$LD_LIBRARY_PATH
        ```
 ### Environment Variables Description
 `export ASCEND_ENABLE_USE_FABRIC_MEM=1`: Enables the unified memory address direct transmission scheme. It can only be used on the 800 I/T A3 series. The required supporting hardware versions are as follows:
    HDK >= 26.0
    CANN >= 9.0
 `export ASCEND_BUFFER_POOL=4:8`: ASCEND_BUFFER_POOL is the environment variable for configuring the number and size of buffers on the NPU device for aggregation and KV transfer. The value 4:8 means that 4 buffers of 8 MB each are allocated. It can only be used on the 800 I/T A2 series.
 ### Run Mooncake Master
 #### 1.Configure mooncake.json
--- a/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/backend/mooncake_backend.py
+++ b/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/backend/mooncake_backend.py
@@ -7,16 +7,14 @@ from dataclasses import dataclass
 import torch
 # Third Party
 from mooncake.store import ReplicateConfig  # type: ignore
 from vllm.config import ParallelConfig
 from vllm.logger import logger
 from vllm.utils.network_utils import get_ip
 from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.backend.backend import Backend
 from vllm_ascend.distributed.kv_transfer.utils.mooncake_transfer_engine import global_te
 from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
-DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200  # 3.125 GiB
+DEFAULT_GLOBAL_SEGMENT_SIZE = 1073741824  # 1.0 GiB
 DEFAULT_LOCAL_BUFFER_SIZE = 1073741824  # 1.0 GiB
@@ -35,6 +33,10 @@ class MooncakeBackend(Backend):
        self.rank = parallel_config.rank
        if self.config.protocol == "ascend":
            local_hostname = get_ip()
            # ASCEND_ENABLE_USE_FABRIC_MEM: enables the unified memory address direct
            # transmission scheme; it can only be used on the 800 I/T A3 series.
            # Required supporting versions: HDK >= 26.0, CANN >= 9.0.
            if os.getenv("ASCEND_ENABLE_USE_FABRIC_MEM", "0") != "1":
                transfer_engine = global_te.get_transfer_engine(local_hostname, device_name=None)
                self.local_seg = local_hostname + ":" + str(transfer_engine.get_rpc_port())
                ret = self.store.setup(
@@ -47,6 +49,18 @@ class MooncakeBackend(Backend):
                    self.config.master_server_address,
                    transfer_engine.get_engine(),
                )
            else:
                self.local_seg = local_hostname
                ret = self.store.setup(
                    self.local_seg,
                    self.config.metadata_server,
                    self.config.global_segment_size,
                    0,
                    self.config.protocol,
                    self.config.device_name,
                    self.config.master_server_address,
                )
        if ret != 0:
            msg = "Initialize mooncake failed."
            logger.error(msg)
@@ -57,6 +71,7 @@ class MooncakeBackend(Backend):
        torch.npu.set_device(device)
    def register_buffer(self, ptrs: list[int], lengths: list[int]):
        """Register buffers with the global transfer engine unless FabricMem mode is on.

        When the ``ASCEND_ENABLE_USE_FABRIC_MEM`` environment variable equals
        ``"1"`` this method does nothing; for any other value (or when the
        variable is unset) the arguments are forwarded to
        ``global_te.register_buffer``.

        Args:
            ptrs: Passed through unchanged as the first argument
                (presumably buffer addresses -- confirm against the caller).
            lengths: Passed through unchanged as the second argument
                (presumably the matching buffer sizes -- confirm against the caller).
        """
        fabric_mem_flag = os.getenv("ASCEND_ENABLE_USE_FABRIC_MEM", "0")
        if fabric_mem_flag == "1":
            # FabricMem mode: skip registration with the transfer engine.
            return
        global_te.register_buffer(ptrs, lengths)
    def exists(self, keys: list[str]) -> list[int]:
@@ -64,14 +79,7 @@ class MooncakeBackend(Backend):
    def put(self, keys: list[str], addrs: list[list[int]], sizes: list[list[int]]):
        try:
            soc_version = get_ascend_device_type()
            if soc_version in {AscendDeviceType.A2}:
            res = self.store.batch_put_from_multi_buffers(keys, addrs, sizes)
            else:
                config = ReplicateConfig()
                config.preferred_segment = self.local_seg
                config.prefer_alloc_in_same_node = True
                res = self.store.batch_put_from_multi_buffers(keys, addrs, sizes, config)
            for value in res:
                if value < 0:
                    logger.error(f"Failed to put key {keys},res:{res}")
@@ -80,11 +88,7 @@ class MooncakeBackend(Backend):
    def get(self, keys: list[str], addrs: list[list[int]], sizes: list[list[int]]):
        try:
            soc_version = get_ascend_device_type()
            if soc_version in {AscendDeviceType.A2}:
            res = self.store.batch_get_into_multi_buffers(keys, addrs, sizes)
            else:
                res = self.store.batch_get_into_multi_buffers(keys, addrs, sizes, True)
            for value in res:
                if value < 0:
                    logger.error(f"Failed to get key {keys}, res:{res}")
--- a/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/pool_worker.py
+++ b/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/pool_worker.py
@@ -30,7 +30,6 @@ from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.kv_transfer import
    KVCacheStoreSendingThread,
    KVTransferThread,
 )
 from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
 backend_map = {
    "mooncake": {
@@ -98,12 +97,6 @@ class KVPoolWorker:
            self.head_or_tp_rank = self.tp_rank
            self.put_step = 1
        soc_version = get_ascend_device_type()
        # be removed later
        if self.backend == "mooncake" and soc_version in {AscendDeviceType.A3}:
            self.head_or_tp_rank = self.tp_rank
            self.put_step = 1
        self.metadata = KeyMetadata(
            model_config.model.rstrip("/").split("/")[-1],
            self.head_or_tp_rank,