【main】ADXL/HIXL supports FabricMem Mode (#6806)
### What this PR does / why we need it?
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main:
83b47f67b1
---------
Signed-off-by: fems14 <1804143737@qq.com>
This commit is contained in:
@@ -121,7 +121,7 @@ Moonshot AI. Installation and compilation guide:
|
|||||||
First, obtain the Mooncake project using the following command:
|
First, obtain the Mooncake project using the following command:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone -b v0.3.8.post1 --depth 1 https://github.com/kvcache-ai/Mooncake.git
|
git clone -b v0.3.9 --depth 1 https://github.com/kvcache-ai/Mooncake.git
|
||||||
cd Mooncake
|
cd Mooncake
|
||||||
git submodule update --init --recursive
|
git submodule update --init --recursive
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -177,7 +177,7 @@ Mooncake is the serving platform for Kimi, a leading LLM service provided by Moo
|
|||||||
First, we need to obtain the Mooncake project. Refer to the following command:
|
First, we need to obtain the Mooncake project. Refer to the following command:
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
git clone -b v0.3.8.post1 --depth 1 https://github.com/kvcache-ai/Mooncake.git
|
git clone -b v0.3.9 --depth 1 https://github.com/kvcache-ai/Mooncake.git
|
||||||
```
|
```
|
||||||
|
|
||||||
(Optional) Replace go install url if the network is poor
|
(Optional) Replace go install url if the network is poor
|
||||||
|
|||||||
@@ -98,7 +98,7 @@ Mooncake is the serving platform for Kimi, a leading LLM service provided by Moo
|
|||||||
First, we need to obtain the Mooncake project. Refer to the following command:
|
First, we need to obtain the Mooncake project. Refer to the following command:
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
git clone -b v0.3.8.post1 --depth 1 https://github.com/kvcache-ai/Mooncake.git
|
git clone -b v0.3.9 --depth 1 https://github.com/kvcache-ai/Mooncake.git
|
||||||
```
|
```
|
||||||
|
|
||||||
(Optional) Replace go install url if the network is poor.
|
(Optional) Replace go install url if the network is poor.
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ export PYTHONHASHSEED=0
|
|||||||
First, we need to obtain the Mooncake project. Refer to the following command:
|
First, we need to obtain the Mooncake project. Refer to the following command:
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
git clone -b v0.3.7.post2 --depth 1 https://github.com/kvcache-ai/Mooncake.git
|
git clone -b v0.3.9 --depth 1 https://github.com/kvcache-ai/Mooncake.git
|
||||||
```
|
```
|
||||||
|
|
||||||
(Optional) Replace go install url if the network is poor
|
(Optional) Replace go install url if the network is poor
|
||||||
@@ -85,6 +85,15 @@ export PYTHONHASHSEED=0
|
|||||||
export LD_LIBRARY_PATH=/usr/local/lib64/python3.11/site-packages/mooncake:$LD_LIBRARY_PATH
|
export LD_LIBRARY_PATH=/usr/local/lib64/python3.11/site-packages/mooncake:$LD_LIBRARY_PATH
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Environment Variables Description
|
||||||
|
|
||||||
|
`export ASCEND_ENABLE_USE_FABRIC_MEM=1`: Enables the unified memory address direct transmission scheme. It can only be used on the 800 I/T A3 series. The required supporting hardware versions are as follows:
|
||||||
|
|
||||||
|
HDK >= 26.0
|
||||||
|
CANN >= 9.0
|
||||||
|
|
||||||
|
`export ASCEND_BUFFER_POOL=4:8`: `ASCEND_BUFFER_POOL` is the environment variable that configures the number and size of the buffers on the NPU device used for aggregation and KV transfer. The value `4:8` means that 4 buffers of 8 MB each are allocated. It can only be used on the 800 I/T A2 series.
|
||||||
|
|
||||||
### Run Mooncake Master
|
### Run Mooncake Master
|
||||||
|
|
||||||
#### 1.Configure mooncake.json
|
#### 1.Configure mooncake.json
|
||||||
|
|||||||
@@ -7,16 +7,14 @@ from dataclasses import dataclass
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
# Third Party
|
# Third Party
|
||||||
from mooncake.store import ReplicateConfig # type: ignore
|
|
||||||
from vllm.config import ParallelConfig
|
from vllm.config import ParallelConfig
|
||||||
from vllm.logger import logger
|
from vllm.logger import logger
|
||||||
from vllm.utils.network_utils import get_ip
|
from vllm.utils.network_utils import get_ip
|
||||||
|
|
||||||
from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.backend.backend import Backend
|
from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.backend.backend import Backend
|
||||||
from vllm_ascend.distributed.kv_transfer.utils.mooncake_transfer_engine import global_te
|
from vllm_ascend.distributed.kv_transfer.utils.mooncake_transfer_engine import global_te
|
||||||
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
|
|
||||||
|
|
||||||
DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200 # 3.125 GiB
|
DEFAULT_GLOBAL_SEGMENT_SIZE = 1073741824 # 1.0 GiB
|
||||||
DEFAULT_LOCAL_BUFFER_SIZE = 1073741824 # 1.0 GiB
|
DEFAULT_LOCAL_BUFFER_SIZE = 1073741824 # 1.0 GiB
|
||||||
|
|
||||||
|
|
||||||
@@ -35,18 +33,34 @@ class MooncakeBackend(Backend):
|
|||||||
self.rank = parallel_config.rank
|
self.rank = parallel_config.rank
|
||||||
if self.config.protocol == "ascend":
|
if self.config.protocol == "ascend":
|
||||||
local_hostname = get_ip()
|
local_hostname = get_ip()
|
||||||
transfer_engine = global_te.get_transfer_engine(local_hostname, device_name=None)
|
# ASCEND_ENABLE_USE_FABRIC_MEM: Enable unified memory address direct transmission scheme
|
||||||
self.local_seg = local_hostname + ":" + str(transfer_engine.get_rpc_port())
|
# and can only be used on the 800 I/T A3 series.
|
||||||
ret = self.store.setup(
|
# Required supporting hardware versions: HDK >= 26.0, CANN >= 9.0.
|
||||||
self.local_seg,
|
if os.getenv("ASCEND_ENABLE_USE_FABRIC_MEM", "0") != "1":
|
||||||
self.config.metadata_server,
|
transfer_engine = global_te.get_transfer_engine(local_hostname, device_name=None)
|
||||||
self.config.global_segment_size,
|
self.local_seg = local_hostname + ":" + str(transfer_engine.get_rpc_port())
|
||||||
self.config.local_buffer_size,
|
ret = self.store.setup(
|
||||||
self.config.protocol,
|
self.local_seg,
|
||||||
self.config.device_name,
|
self.config.metadata_server,
|
||||||
self.config.master_server_address,
|
self.config.global_segment_size,
|
||||||
transfer_engine.get_engine(),
|
self.config.local_buffer_size,
|
||||||
)
|
self.config.protocol,
|
||||||
|
self.config.device_name,
|
||||||
|
self.config.master_server_address,
|
||||||
|
transfer_engine.get_engine(),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.local_seg = local_hostname
|
||||||
|
ret = self.store.setup(
|
||||||
|
self.local_seg,
|
||||||
|
self.config.metadata_server,
|
||||||
|
self.config.global_segment_size,
|
||||||
|
0,
|
||||||
|
self.config.protocol,
|
||||||
|
self.config.device_name,
|
||||||
|
self.config.master_server_address,
|
||||||
|
)
|
||||||
|
|
||||||
if ret != 0:
|
if ret != 0:
|
||||||
msg = "Initialize mooncake failed."
|
msg = "Initialize mooncake failed."
|
||||||
logger.error(msg)
|
logger.error(msg)
|
||||||
@@ -57,21 +71,15 @@ class MooncakeBackend(Backend):
|
|||||||
torch.npu.set_device(device)
|
torch.npu.set_device(device)
|
||||||
|
|
||||||
def register_buffer(self, ptrs: list[int], lengths: list[int]):
|
def register_buffer(self, ptrs: list[int], lengths: list[int]):
|
||||||
global_te.register_buffer(ptrs, lengths)
|
if os.getenv("ASCEND_ENABLE_USE_FABRIC_MEM", "0") != "1":
|
||||||
|
global_te.register_buffer(ptrs, lengths)
|
||||||
|
|
||||||
def exists(self, keys: list[str]) -> list[int]:
|
def exists(self, keys: list[str]) -> list[int]:
|
||||||
return self.store.batch_is_exist(keys)
|
return self.store.batch_is_exist(keys)
|
||||||
|
|
||||||
def put(self, keys: list[str], addrs: list[list[int]], sizes: list[list[int]]):
|
def put(self, keys: list[str], addrs: list[list[int]], sizes: list[list[int]]):
|
||||||
try:
|
try:
|
||||||
soc_version = get_ascend_device_type()
|
res = self.store.batch_put_from_multi_buffers(keys, addrs, sizes)
|
||||||
if soc_version in {AscendDeviceType.A2}:
|
|
||||||
res = self.store.batch_put_from_multi_buffers(keys, addrs, sizes)
|
|
||||||
else:
|
|
||||||
config = ReplicateConfig()
|
|
||||||
config.preferred_segment = self.local_seg
|
|
||||||
config.prefer_alloc_in_same_node = True
|
|
||||||
res = self.store.batch_put_from_multi_buffers(keys, addrs, sizes, config)
|
|
||||||
for value in res:
|
for value in res:
|
||||||
if value < 0:
|
if value < 0:
|
||||||
logger.error(f"Failed to put key {keys},res:{res}")
|
logger.error(f"Failed to put key {keys},res:{res}")
|
||||||
@@ -80,11 +88,7 @@ class MooncakeBackend(Backend):
|
|||||||
|
|
||||||
def get(self, keys: list[str], addrs: list[list[int]], sizes: list[list[int]]):
|
def get(self, keys: list[str], addrs: list[list[int]], sizes: list[list[int]]):
|
||||||
try:
|
try:
|
||||||
soc_version = get_ascend_device_type()
|
res = self.store.batch_get_into_multi_buffers(keys, addrs, sizes)
|
||||||
if soc_version in {AscendDeviceType.A2}:
|
|
||||||
res = self.store.batch_get_into_multi_buffers(keys, addrs, sizes)
|
|
||||||
else:
|
|
||||||
res = self.store.batch_get_into_multi_buffers(keys, addrs, sizes, True)
|
|
||||||
for value in res:
|
for value in res:
|
||||||
if value < 0:
|
if value < 0:
|
||||||
logger.error(f"Failed to get key {keys}, res:{res}")
|
logger.error(f"Failed to get key {keys}, res:{res}")
|
||||||
|
|||||||
@@ -30,7 +30,6 @@ from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.kv_transfer import
|
|||||||
KVCacheStoreSendingThread,
|
KVCacheStoreSendingThread,
|
||||||
KVTransferThread,
|
KVTransferThread,
|
||||||
)
|
)
|
||||||
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
|
|
||||||
|
|
||||||
backend_map = {
|
backend_map = {
|
||||||
"mooncake": {
|
"mooncake": {
|
||||||
@@ -98,12 +97,6 @@ class KVPoolWorker:
|
|||||||
self.head_or_tp_rank = self.tp_rank
|
self.head_or_tp_rank = self.tp_rank
|
||||||
self.put_step = 1
|
self.put_step = 1
|
||||||
|
|
||||||
soc_version = get_ascend_device_type()
|
|
||||||
# be removed later
|
|
||||||
if self.backend == "mooncake" and soc_version in {AscendDeviceType.A3}:
|
|
||||||
self.head_or_tp_rank = self.tp_rank
|
|
||||||
self.put_step = 1
|
|
||||||
|
|
||||||
self.metadata = KeyMetadata(
|
self.metadata = KeyMetadata(
|
||||||
model_config.model.rstrip("/").split("/")[-1],
|
model_config.model.rstrip("/").split("/")[-1],
|
||||||
self.head_or_tp_rank,
|
self.head_or_tp_rank,
|
||||||
|
|||||||
Reference in New Issue
Block a user