[Refactor] Refactor of vllm_ascend/distributed module (#5719)
### What this PR does / why we need it?
Based on the RFC: https://github.com/vllm-project/vllm-ascend/issues/5604
This PR refactors vllm_ascend/distributed, moving all kv_transfer-related
code into a dedicated folder, mirroring the layout already used in vLLM.
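The externally visible effect is that the registration entry point moves one level down; the paths below are taken directly from the hunks in this commit:

```python
# Old (removed by this PR):
#   from vllm_ascend.distributed import register_connector
# New:
from vllm_ascend.distributed.kv_transfer import register_connector

# Re-registers the Mooncake, AscendStore and UCM connectors with vLLM's
# KVConnectorFactory under the same public names as before.
register_connector()
```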
### Does this PR introduce _any_ user-facing change?
NA
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main: 2f4e6548ef
---------
Signed-off-by: lty <linhebiwen@gmail.com>
@@ -23,7 +23,7 @@ def register():


 def register_connector():
-    from vllm_ascend.distributed import register_connector
+    from vllm_ascend.distributed.kv_transfer import register_connector
     register_connector()


@@ -1,44 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from vllm.distributed.kv_transfer.kv_connector.factory import \
-    KVConnectorFactory
-
-
-def register_connector():
-    KVConnectorFactory.register_connector(
-        "MooncakeConnectorV1", "vllm_ascend.distributed.mooncake_connector",
-        "MooncakeConnector")
-
-    KVConnectorFactory.register_connector(
-        "MooncakeConnectorStoreV1",
-        "vllm_ascend.distributed.kvpool.ascend_store_connector",
-        "AscendStoreConnector")
-
-    KVConnectorFactory.register_connector(
-        "AscendStoreConnector",
-        "vllm_ascend.distributed.kvpool.ascend_store_connector",
-        "AscendStoreConnector")
-
-    KVConnectorFactory.register_connector(
-        "MooncakeLayerwiseConnector",
-        "vllm_ascend.distributed.mooncake_layerwise_connector",
-        "MooncakeLayerwiseConnector")
-
-    KVConnectorFactory.register_connector(
-        "UCMConnector", "vllm_ascend.distributed.ucm_connector",
-        "UCMConnectorV1")
vllm_ascend/distributed/kv_transfer/__init__.py  (new file, 45 lines)
@@ -0,0 +1,45 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from vllm.distributed.kv_transfer.kv_connector.factory import \
+    KVConnectorFactory
+
+
+def register_connector():
+    KVConnectorFactory.register_connector(
+        "MooncakeConnectorV1",
+        "vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector",
+        "MooncakeConnector")
+
+    KVConnectorFactory.register_connector(
+        "MooncakeConnectorStoreV1",
+        "vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.ascend_store_connector",
+        "AscendStoreConnector")
+
+    KVConnectorFactory.register_connector(
+        "AscendStoreConnector",
+        "vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.ascend_store_connector",
+        "AscendStoreConnector")
+
+    KVConnectorFactory.register_connector(
+        "MooncakeLayerwiseConnector",
+        "vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_layerwise_connector",
+        "MooncakeLayerwiseConnector")
+
+    KVConnectorFactory.register_connector(
+        "UCMConnector", "vllm_ascend.distributed.kv_transfer.ucm_connector",
+        "UCMConnectorV1")
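For context, a hedged sketch of how one of the names registered above is consumed; `KVTransferConfig` is vLLM's config object, and its exact import path and fields can vary across vLLM versions:

```python
from vllm.config import KVTransferConfig

# Selecting a registered connector by name; roughly equivalent to the CLI flag
#   --kv-transfer-config '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer"}'
cfg = KVTransferConfig(kv_connector="MooncakeConnectorV1", kv_role="kv_producer")

# vLLM's KVConnectorFactory later resolves "MooncakeConnectorV1" to the
# (module, class) pair registered in this file, so deployments never import
# vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector directly.
```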
@@ -41,8 +41,8 @@ from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.request import RequestStatus

 from vllm_ascend.ascend_config import get_ascend_config, init_ascend_config
-from vllm_ascend.distributed.mooncake_transfer_engine import global_te
-from vllm_ascend.distributed.utils import get_transfer_timeout_value
+from vllm_ascend.distributed.kv_transfer.utils.mooncake_transfer_engine import global_te
+from vllm_ascend.distributed.kv_transfer.utils.utils import get_transfer_timeout_value
 from vllm_ascend.utils import is_vl_model

 if TYPE_CHECKING:
@@ -32,11 +32,12 @@ from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig

 from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.distributed.mooncake_connector import GET_META_MSG
-from vllm_ascend.distributed.mooncake_transfer_engine import global_te
-from vllm_ascend.distributed.utils import (align_memory,
-                                           get_transfer_timeout_value,
-                                           kv_alltoall_and_rearrange)
+from vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector import \
+    GET_META_MSG
+from vllm_ascend.distributed.kv_transfer.utils.mooncake_transfer_engine import \
+    global_te
+from vllm_ascend.distributed.kv_transfer.utils.utils import (
+    align_memory, get_transfer_timeout_value, kv_alltoall_and_rearrange)
 from vllm_ascend.utils import npu_stream_switch

 if TYPE_CHECKING:
@@ -16,9 +16,10 @@ from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.request import Request
 from vllm.v1.serial_utils import MsgpackDecoder

-from vllm_ascend.distributed.kvpool.pool_scheduler import (
+from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.pool_scheduler import (
     KVPoolScheduler, get_zmq_rpc_path_lookup)
-from vllm_ascend.distributed.kvpool.pool_worker import KVPoolWorker
+from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.pool_worker import \
+    KVPoolWorker


 class AscendStoreConnector(KVConnectorBase_V1):
@@ -5,7 +5,8 @@ import torch
 from vllm.config import ParallelConfig
 from vllm.logger import logger

-from vllm_ascend.distributed.kvpool.backend.backend import Backend
+from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.backend.backend import \
+    Backend
 from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type


@@ -11,8 +11,10 @@ from vllm.config import ParallelConfig
 from vllm.logger import logger
 from vllm.utils.network_utils import get_ip

-from vllm_ascend.distributed.kvpool.backend.backend import Backend
-from vllm_ascend.distributed.mooncake_transfer_engine import global_te
+from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.backend.backend import \
+    Backend
+from vllm_ascend.distributed.kv_transfer.utils.mooncake_transfer_engine import \
+    global_te

 DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200  # 3.125 GiB
 DEFAULT_LOCAL_BUFFER_SIZE = 1073741824  # 1.0 GiB
@@ -7,10 +7,11 @@ from typing import Any
 import torch
 from vllm.logger import logger

-from vllm_ascend.distributed.kvpool.backend.backend import Backend
+from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.backend.backend import \
+    Backend

 # isort: off
-from vllm_ascend.distributed.kvpool.config_data import (
+from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.config_data import (
     ChunkedTokenDatabase,
     LasyerMultiBlockReqMeta,
     ReqMeta,
@@ -14,7 +14,7 @@ from vllm.utils.network_utils import make_zmq_socket
 from vllm.utils.torch_utils import get_dtype_size
 from vllm.v1.kv_cache_interface import AttentionSpec, MLAAttentionSpec

-from vllm_ascend.distributed.cpu_offload_manager.cpu_kv_cache_manager import \
+from vllm_ascend.distributed.kv_transfer.kv_pool.cpu_offload.cpu_kv_cache_manager import \
     CPUKVCacheManager


@@ -13,7 +13,7 @@ from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.request import Request
 from vllm.v1.serial_utils import MsgpackEncoder

-from vllm_ascend.distributed.kvpool.config_data import (
+from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.config_data import (
     AscendConnectorMetadata, LoadSpec, ReqMeta, RequestTracker)


@@ -11,15 +11,16 @@ from vllm.distributed import (get_decode_context_model_parallel_rank,
 from vllm.logger import logger
 from vllm.v1.core.kv_cache_utils import BlockHash

-from vllm_ascend.distributed.kvpool.backend.backend import Backend
-from vllm_ascend.distributed.kvpool.backend.memcache_backend import \
+from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.backend.backend import \
+    Backend
+from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.backend.memcache_backend import \
     MemcacheBackend
-from vllm_ascend.distributed.kvpool.backend.mooncake_backend import \
+from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.backend.mooncake_backend import \
     MooncakeBackend
-from vllm_ascend.distributed.kvpool.config_data import (
+from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.config_data import (
     AscendConnectorMetadata, ChunkedTokenDatabase, KeyMetadata,
     LasyerMultiBlockReqMeta, ReqMeta)
-from vllm_ascend.distributed.kvpool.kv_transfer import (
+from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.kv_transfer import (
     KVCacheStoreLayerRecvingThread, KVCacheStoreLayerSendingThread,
     KVCacheStoreRecvingThread, KVCacheStoreSendingThread, KVTransferThread)

@@ -24,7 +24,7 @@ from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheSpec,
                                         MambaSpec, MLAAttentionSpec)

-from vllm_ascend.distributed.cpu_offload_manager.metadata import (
+from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.metadata import (
     MetadataServer, MetadataServerProc, MLAConfig)

 if TYPE_CHECKING:
vllm_ascend/distributed/kv_transfer/utils/utils.py  (new file, 61 lines)
@@ -0,0 +1,61 @@
+import os
+
+import torch
+import torch.distributed as dist
+
+from vllm_ascend.distributed.parallel_state import get_p_tp_group
+
+
+def kv_alltoall_and_rearrange(pd_tp_ratio: int, key: torch.Tensor,
+                              value: torch.TensorType):
+    if pd_tp_ratio <= 1:
+        return None, None
+    elif key is None or value is None:
+        raise ValueError("key or value is None")
+    k_output = alltoall_and_rearrange(pd_tp_ratio, key)
+    v_output = alltoall_and_rearrange(pd_tp_ratio, value)
+    return k_output, v_output
+
+
+def alltoall_and_rearrange(tp_ratio: int, input_tensor: torch.Tensor):
+    num_kv_heads = input_tensor.size(1)
+    output_tensor = torch.zeros_like(input_tensor)
+    dist.all_to_all_single(output_tensor,
+                           input_tensor,
+                           group=get_p_tp_group().device_group)
+    input_tensor = 0
+    result = rearrange_output(output_tensor, tp_ratio, num_kv_heads)
+    output_tensor = 0
+    return result
+
+
+def rearrange_output(base_output: torch.Tensor, cut_num: int,
+                     num_kv_heads: int):
+    size_0 = base_output.size(0)
+    if size_0 % cut_num != 0:
+        raise ValueError(
+            f"The size of dim 0 [{size_0}] must be divisible by the cut_num [{cut_num}]"
+        )
+    chunk_size = size_0 // cut_num
+    reshaped = base_output.view(cut_num, chunk_size, -1)
+    transposed = reshaped.transpose(0, 1)
+    return transposed.contiguous().view(size_0, num_kv_heads, -1)
+
+
+def align_memory(tensor: torch.Tensor, alignment: int) -> torch.Tensor:
+    data_ptr = tensor.data_ptr()
+    aligned_addr = (data_ptr + alignment - 1) // alignment * alignment
+    offset = (aligned_addr - data_ptr) // tensor.element_size()
+    return tensor[int(offset):]
+
+
+def get_transfer_timeout_value():
+    ascend_transfer_timeout = os.getenv("ASCEND_TRANSFER_TIMEOUT", "")
+    if len(ascend_transfer_timeout) > 0:
+        return int(ascend_transfer_timeout)
+    hccl_rdma_timeout = int(os.getenv('HCCL_RDMA_TIMEOUT',
+                                      '20'))  # type: ignore
+    hccl_rdma_retry_cnt = int(os.getenv('HCCL_RDMA_RETRY_CNT',
+                                        '7'))  # type: ignore
+    return int((4.096 * (2**hccl_rdma_timeout)) * hccl_rdma_retry_cnt // 1000 +
+               3000)
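A CPU-only sketch of what the moved helpers compute; the toy tensor, the inlined copy of `rearrange_output`, and the env-var defaults below are illustrative assumptions (the real call sites run on NPU after `dist.all_to_all_single`):

```python
import torch

# Standalone copy of rearrange_output from the file above
# (no NPU or process group needed to check its semantics).
def rearrange_output(base_output: torch.Tensor, cut_num: int, num_kv_heads: int):
    size_0 = base_output.size(0)
    chunk_size = size_0 // cut_num
    return (base_output.view(cut_num, chunk_size, -1)
            .transpose(0, 1).contiguous().view(size_0, num_kv_heads, -1))

# 4 tokens x 2 kv-heads x 1 head-dim; rows 0-1 arrived from prefill rank 0,
# rows 2-3 from prefill rank 1 (pd_tp_ratio = cut_num = 2).
x = torch.arange(8, dtype=torch.float32).view(4, 2, 1)
y = rearrange_output(x, cut_num=2, num_kv_heads=2)
# The contiguous per-rank chunks are interleaved back into token order:
# input rows [0, 1, 2, 3] become output rows [0, 2, 1, 3].
assert torch.equal(y, x[[0, 2, 1, 3]])

# Worked default for get_transfer_timeout_value with no env vars set:
# HCCL_RDMA_TIMEOUT=20 and HCCL_RDMA_RETRY_CNT=7 follow the usual RDMA
# ack-timeout rule of 4.096 us * 2**timeout per retry, so
# (4.096 * 2**20) * 7 // 1000 + 3000 = 33064, roughly a 33 s budget,
# presumably in milliseconds given the +3000 margin.
assert int((4.096 * (2**20)) * 7 // 1000 + 3000) == 33064
```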
@@ -1 +0,0 @@
-
@@ -1,4 +1,3 @@
-import os
 from typing import Optional

 import torch
@@ -6,63 +5,7 @@ import torch.distributed as dist
 from vllm.distributed.parallel_state import GroupCoordinator, get_dp_group
 from vllm.forward_context import get_forward_context

-from vllm_ascend.distributed.parallel_state import (get_fc3_quant_x_group,
-                                                    get_p_tp_group)
-
-
-def kv_alltoall_and_rearrange(pd_tp_ratio: int, key: torch.Tensor,
-                              value: torch.TensorType):
-    if pd_tp_ratio <= 1:
-        return None, None
-    elif key is None or value is None:
-        raise ValueError("key or value is None")
-    k_output = alltoall_and_rearrange(pd_tp_ratio, key)
-    v_output = alltoall_and_rearrange(pd_tp_ratio, value)
-    return k_output, v_output
-
-
-def alltoall_and_rearrange(tp_ratio: int, input_tensor: torch.Tensor):
-    num_kv_heads = input_tensor.size(1)
-    output_tensor = torch.zeros_like(input_tensor)
-    dist.all_to_all_single(output_tensor,
-                           input_tensor,
-                           group=get_p_tp_group().device_group)
-    input_tensor = 0
-    result = rearrange_output(output_tensor, tp_ratio, num_kv_heads)
-    output_tensor = 0
-    return result
-
-
-def rearrange_output(base_output: torch.Tensor, cut_num: int,
-                     num_kv_heads: int):
-    size_0 = base_output.size(0)
-    if size_0 % cut_num != 0:
-        raise ValueError(
-            f"The size of dim 0 [{size_0}] must be divisible by the cut_num [{cut_num}]"
-        )
-    chunk_size = size_0 // cut_num
-    reshaped = base_output.view(cut_num, chunk_size, -1)
-    transposed = reshaped.transpose(0, 1)
-    return transposed.contiguous().view(size_0, num_kv_heads, -1)
-
-
-def align_memory(tensor: torch.Tensor, alignment: int) -> torch.Tensor:
-    data_ptr = tensor.data_ptr()
-    aligned_addr = (data_ptr + alignment - 1) // alignment * alignment
-    offset = (aligned_addr - data_ptr) // tensor.element_size()
-    return tensor[int(offset):]
-
-
-def get_transfer_timeout_value():
-    ascend_transfer_timeout = os.getenv("ASCEND_TRANSFER_TIMEOUT", "")
-    if len(ascend_transfer_timeout) > 0:
-        return int(ascend_transfer_timeout)
-    hccl_rdma_timeout = int(os.getenv('HCCL_RDMA_TIMEOUT',
-                                      '20'))  # type: ignore
-    hccl_rdma_retry_cnt = int(os.getenv('HCCL_RDMA_RETRY_CNT',
-                                        '7'))  # type: ignore
-    return int((4.096 * (2**hccl_rdma_timeout)) * hccl_rdma_retry_cnt // 1000 +
-               3000)
+from vllm_ascend.distributed.parallel_state import get_fc3_quant_x_group


 def fc3_all_gather_and_maybe_unpad_impl(x: torch.Tensor, ) -> torch.Tensor:
@@ -90,6 +33,7 @@ def fc3_all_gather_and_maybe_unpad_impl(x: torch.Tensor, ) -> torch.Tensor:
             result[offset:offset + num_tokens_dp] = x[idx, :num_tokens_dp]
             offset += num_tokens_dp
         x = result

     return x
+
@@ -23,7 +23,8 @@ from torch.distributed import Backend
 from vllm.distributed.parallel_state import (GroupCoordinator,
                                              _get_unique_name, _register_group)

-from vllm_ascend.distributed.communicator import NPUCommunicator
+from vllm_ascend.distributed.device_communicators.npu_communicator import \
+    NPUCommunicator
 from vllm_ascend.utils import create_hccl_pg_options


@@ -411,7 +411,7 @@ class NPUPlatform(Platform):

     @classmethod
     def get_device_communicator_cls(cls) -> str:
-        return "vllm_ascend.distributed.communicator.NPUCommunicator"
+        return "vllm_ascend.distributed.device_communicators.npu_communicator.NPUCommunicator"

     @classmethod
     def is_pin_memory_available(cls):
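vLLM resolves this dotted string to a class at runtime; a minimal stdlib sketch of that lookup (vLLM uses its own resolver helper, this is only the equivalent pattern):

```python
import importlib

def resolve_qualname(qualname: str):
    # Split "pkg.module.ClassName" at the last dot and fetch the attribute,
    # mirroring how the string from get_device_communicator_cls() is consumed.
    module_name, _, cls_name = qualname.rpartition(".")
    return getattr(importlib.import_module(module_name), cls_name)

# With the path introduced by this PR (requires vllm-ascend to be installed):
# NPUCommunicator = resolve_qualname(
#     "vllm_ascend.distributed.device_communicators.npu_communicator.NPUCommunicator")
```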