[Refactor]Refactor of vllm_ascend/distributed module (#5719)

### What this PR does / why we need it?
Based on the RFC: https://github.com/vllm-project/vllm-ascend/issues/5604

This PR is a refactoring of vllm_ascend/distributed, moving all
kv_transfer related code into a dedicated folder, which has already
been done in vLLM

### Does this PR introduce _any_ user-facing change?
NA

### How was this patch tested?


- vLLM version: v0.13.0
- vLLM main:
2f4e6548ef

---------

Signed-off-by: lty <linhebiwen@gmail.com>
This commit is contained in:
lty
2026-01-15 08:57:40 +08:00
committed by GitHub
parent f34b3b8ee9
commit 295018ec0f
56 changed files with 300 additions and 293 deletions

View File

@@ -23,7 +23,7 @@ def register():
def register_connector():
from vllm_ascend.distributed import register_connector
from vllm_ascend.distributed.kv_transfer import register_connector
register_connector()

View File

@@ -1,44 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from vllm.distributed.kv_transfer.kv_connector.factory import \
KVConnectorFactory
def register_connector():
KVConnectorFactory.register_connector(
"MooncakeConnectorV1", "vllm_ascend.distributed.mooncake_connector",
"MooncakeConnector")
KVConnectorFactory.register_connector(
"MooncakeConnectorStoreV1",
"vllm_ascend.distributed.kvpool.ascend_store_connector",
"AscendStoreConnector")
KVConnectorFactory.register_connector(
"AscendStoreConnector",
"vllm_ascend.distributed.kvpool.ascend_store_connector",
"AscendStoreConnector")
KVConnectorFactory.register_connector(
"MooncakeLayerwiseConnector",
"vllm_ascend.distributed.mooncake_layerwise_connector",
"MooncakeLayerwiseConnector")
KVConnectorFactory.register_connector(
"UCMConnector", "vllm_ascend.distributed.ucm_connector",
"UCMConnectorV1")

View File

@@ -0,0 +1,45 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from vllm.distributed.kv_transfer.kv_connector.factory import \
KVConnectorFactory
def register_connector():
    """Register vllm-ascend's KV connectors with vLLM's KVConnectorFactory.

    Each entry maps a public connector name to the module path and class
    that implements it; registrations are performed in declaration order.
    """
    connectors = (
        ("MooncakeConnectorV1",
         "vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector",
         "MooncakeConnector"),
        ("MooncakeConnectorStoreV1",
         "vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.ascend_store_connector",
         "AscendStoreConnector"),
        ("AscendStoreConnector",
         "vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.ascend_store_connector",
         "AscendStoreConnector"),
        ("MooncakeLayerwiseConnector",
         "vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_layerwise_connector",
         "MooncakeLayerwiseConnector"),
        ("UCMConnector",
         "vllm_ascend.distributed.kv_transfer.ucm_connector",
         "UCMConnectorV1"),
    )
    for name, module_path, class_name in connectors:
        KVConnectorFactory.register_connector(name, module_path, class_name)

View File

@@ -41,8 +41,8 @@ from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.request import RequestStatus
from vllm_ascend.ascend_config import get_ascend_config, init_ascend_config
from vllm_ascend.distributed.mooncake_transfer_engine import global_te
from vllm_ascend.distributed.utils import get_transfer_timeout_value
from vllm_ascend.distributed.kv_transfer.utils.mooncake_transfer_engine import global_te
from vllm_ascend.distributed.kv_transfer.utils.utils import get_transfer_timeout_value
from vllm_ascend.utils import is_vl_model
if TYPE_CHECKING:

View File

@@ -32,11 +32,12 @@ from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.distributed.mooncake_connector import GET_META_MSG
from vllm_ascend.distributed.mooncake_transfer_engine import global_te
from vllm_ascend.distributed.utils import (align_memory,
get_transfer_timeout_value,
kv_alltoall_and_rearrange)
from vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector import \
GET_META_MSG
from vllm_ascend.distributed.kv_transfer.utils.mooncake_transfer_engine import \
global_te
from vllm_ascend.distributed.kv_transfer.utils.utils import (
align_memory, get_transfer_timeout_value, kv_alltoall_and_rearrange)
from vllm_ascend.utils import npu_stream_switch
if TYPE_CHECKING:

View File

@@ -16,9 +16,10 @@ from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.request import Request
from vllm.v1.serial_utils import MsgpackDecoder
from vllm_ascend.distributed.kvpool.pool_scheduler import (
from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.pool_scheduler import (
KVPoolScheduler, get_zmq_rpc_path_lookup)
from vllm_ascend.distributed.kvpool.pool_worker import KVPoolWorker
from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.pool_worker import \
KVPoolWorker
class AscendStoreConnector(KVConnectorBase_V1):

View File

@@ -5,7 +5,8 @@ import torch
from vllm.config import ParallelConfig
from vllm.logger import logger
from vllm_ascend.distributed.kvpool.backend.backend import Backend
from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.backend.backend import \
Backend
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type

View File

@@ -11,8 +11,10 @@ from vllm.config import ParallelConfig
from vllm.logger import logger
from vllm.utils.network_utils import get_ip
from vllm_ascend.distributed.kvpool.backend.backend import Backend
from vllm_ascend.distributed.mooncake_transfer_engine import global_te
from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.backend.backend import \
Backend
from vllm_ascend.distributed.kv_transfer.utils.mooncake_transfer_engine import \
global_te
DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200 # 3.125 GiB
DEFAULT_LOCAL_BUFFER_SIZE = 1073741824 # 1.0 GiB

View File

@@ -7,10 +7,11 @@ from typing import Any
import torch
from vllm.logger import logger
from vllm_ascend.distributed.kvpool.backend.backend import Backend
from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.backend.backend import \
Backend
# isort: off
from vllm_ascend.distributed.kvpool.config_data import (
from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.config_data import (
ChunkedTokenDatabase,
LasyerMultiBlockReqMeta,
ReqMeta,

View File

@@ -14,7 +14,7 @@ from vllm.utils.network_utils import make_zmq_socket
from vllm.utils.torch_utils import get_dtype_size
from vllm.v1.kv_cache_interface import AttentionSpec, MLAAttentionSpec
from vllm_ascend.distributed.cpu_offload_manager.cpu_kv_cache_manager import \
from vllm_ascend.distributed.kv_transfer.kv_pool.cpu_offload.cpu_kv_cache_manager import \
CPUKVCacheManager

View File

@@ -13,7 +13,7 @@ from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.request import Request
from vllm.v1.serial_utils import MsgpackEncoder
from vllm_ascend.distributed.kvpool.config_data import (
from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.config_data import (
AscendConnectorMetadata, LoadSpec, ReqMeta, RequestTracker)

View File

@@ -11,15 +11,16 @@ from vllm.distributed import (get_decode_context_model_parallel_rank,
from vllm.logger import logger
from vllm.v1.core.kv_cache_utils import BlockHash
from vllm_ascend.distributed.kvpool.backend.backend import Backend
from vllm_ascend.distributed.kvpool.backend.memcache_backend import \
from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.backend.backend import \
Backend
from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.backend.memcache_backend import \
MemcacheBackend
from vllm_ascend.distributed.kvpool.backend.mooncake_backend import \
from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.backend.mooncake_backend import \
MooncakeBackend
from vllm_ascend.distributed.kvpool.config_data import (
from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.config_data import (
AscendConnectorMetadata, ChunkedTokenDatabase, KeyMetadata,
LasyerMultiBlockReqMeta, ReqMeta)
from vllm_ascend.distributed.kvpool.kv_transfer import (
from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.kv_transfer import (
KVCacheStoreLayerRecvingThread, KVCacheStoreLayerSendingThread,
KVCacheStoreRecvingThread, KVCacheStoreSendingThread, KVTransferThread)

View File

@@ -24,7 +24,7 @@ from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheSpec,
MambaSpec, MLAAttentionSpec)
from vllm_ascend.distributed.cpu_offload_manager.metadata import (
from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.metadata import (
MetadataServer, MetadataServerProc, MLAConfig)
if TYPE_CHECKING:

View File

@@ -0,0 +1,61 @@
import os
import torch
import torch.distributed as dist
from vllm_ascend.distributed.parallel_state import get_p_tp_group
def kv_alltoall_and_rearrange(pd_tp_ratio: int, key: torch.Tensor,
                              value: torch.TensorType):
    """All-to-all exchange and rearrange both KV tensors.

    Returns (None, None) when no exchange is needed (pd_tp_ratio <= 1).
    Raises ValueError if either tensor is missing while an exchange is
    required.
    """
    if pd_tp_ratio <= 1:
        return None, None
    if key is None or value is None:
        raise ValueError("key or value is None")
    # Key and value are exchanged independently with the same TP ratio.
    return (alltoall_and_rearrange(pd_tp_ratio, key),
            alltoall_and_rearrange(pd_tp_ratio, value))
def alltoall_and_rearrange(tp_ratio: int, input_tensor: torch.Tensor):
    """Run all_to_all over the prefill TP group, then interleave the output.

    The result has the same shape as the input; dim 0 is reordered by
    rearrange_output so per-rank chunks are interleaved.
    """
    heads = input_tensor.size(1)
    gathered = torch.zeros_like(input_tensor)
    dist.all_to_all_single(gathered,
                           input_tensor,
                           group=get_p_tp_group().device_group)
    # Drop the reference to the (potentially large) input buffer early.
    input_tensor = 0
    rearranged = rearrange_output(gathered, tp_ratio, heads)
    gathered = 0
    return rearranged
def rearrange_output(base_output: torch.Tensor, cut_num: int,
                     num_kv_heads: int):
    """Interleave the `cut_num` contiguous chunks of dim 0.

    Row order becomes [c0[0], c1[0], ..., c0[1], c1[1], ...] where ci is
    the i-th chunk; the result is viewed as (dim0, num_kv_heads, -1).
    Raises ValueError when dim 0 is not divisible by cut_num.
    """
    total = base_output.size(0)
    if total % cut_num:
        raise ValueError(
            f"The size of dim 0 [{total}] must be divisible by the cut_num [{cut_num}]"
        )
    rows_per_chunk = total // cut_num
    interleaved = (base_output.view(cut_num, rows_per_chunk, -1)
                   .transpose(0, 1)
                   .contiguous())
    return interleaved.view(total, num_kv_heads, -1)
def align_memory(tensor: torch.Tensor, alignment: int) -> torch.Tensor:
    """Return a view of `tensor` starting at the next `alignment`-byte
    boundary at or after the tensor's base address.

    NOTE(review): the skipped element count is derived via integer
    division, so the view is exactly aligned only when the byte gap is a
    multiple of the element size — presumably guaranteed by callers'
    buffer layouts; confirm.
    """
    base = tensor.data_ptr()
    aligned = -(-base // alignment) * alignment  # round up to a multiple
    skip = (aligned - base) // tensor.element_size()
    return tensor[int(skip):]
def get_transfer_timeout_value():
    """Return the KV transfer timeout in milliseconds.

    A non-empty ASCEND_TRANSFER_TIMEOUT wins outright. Otherwise the value
    is derived from the HCCL RDMA retransmission settings
    (HCCL_RDMA_TIMEOUT, HCCL_RDMA_RETRY_CNT) — 4.096us * 2**timeout per
    retry — plus a 3 s margin.
    """
    override = os.getenv("ASCEND_TRANSFER_TIMEOUT", "")
    if override:
        return int(override)
    rdma_timeout = int(os.getenv('HCCL_RDMA_TIMEOUT', '20'))
    rdma_retry_cnt = int(os.getenv('HCCL_RDMA_RETRY_CNT', '7'))
    return int((4.096 * (2**rdma_timeout)) * rdma_retry_cnt // 1000 + 3000)

View File

@@ -1,4 +1,3 @@
import os
from typing import Optional
import torch
@@ -6,63 +5,7 @@ import torch.distributed as dist
from vllm.distributed.parallel_state import GroupCoordinator, get_dp_group
from vllm.forward_context import get_forward_context
from vllm_ascend.distributed.parallel_state import (get_fc3_quant_x_group,
get_p_tp_group)
def kv_alltoall_and_rearrange(pd_tp_ratio: int, key: torch.Tensor,
value: torch.TensorType):
if pd_tp_ratio <= 1:
return None, None
elif key is None or value is None:
raise ValueError("key or value is None")
k_output = alltoall_and_rearrange(pd_tp_ratio, key)
v_output = alltoall_and_rearrange(pd_tp_ratio, value)
return k_output, v_output
def alltoall_and_rearrange(tp_ratio: int, input_tensor: torch.Tensor):
num_kv_heads = input_tensor.size(1)
output_tensor = torch.zeros_like(input_tensor)
dist.all_to_all_single(output_tensor,
input_tensor,
group=get_p_tp_group().device_group)
input_tensor = 0
result = rearrange_output(output_tensor, tp_ratio, num_kv_heads)
output_tensor = 0
return result
def rearrange_output(base_output: torch.Tensor, cut_num: int,
num_kv_heads: int):
size_0 = base_output.size(0)
if size_0 % cut_num != 0:
raise ValueError(
f"The size of dim 0 [{size_0}] must be divisible by the cut_num [{cut_num}]"
)
chunk_size = size_0 // cut_num
reshaped = base_output.view(cut_num, chunk_size, -1)
transposed = reshaped.transpose(0, 1)
return transposed.contiguous().view(size_0, num_kv_heads, -1)
def align_memory(tensor: torch.Tensor, alignment: int) -> torch.Tensor:
data_ptr = tensor.data_ptr()
aligned_addr = (data_ptr + alignment - 1) // alignment * alignment
offset = (aligned_addr - data_ptr) // tensor.element_size()
return tensor[int(offset):]
def get_transfer_timeout_value():
ascend_transfer_timeout = os.getenv("ASCEND_TRANSFER_TIMEOUT", "")
if len(ascend_transfer_timeout) > 0:
return int(ascend_transfer_timeout)
hccl_rdma_timeout = int(os.getenv('HCCL_RDMA_TIMEOUT',
'20')) # type: ignore
hccl_rdma_retry_cnt = int(os.getenv('HCCL_RDMA_RETRY_CNT',
'7')) # type: ignore
return int((4.096 * (2**hccl_rdma_timeout)) * hccl_rdma_retry_cnt // 1000 +
3000)
from vllm_ascend.distributed.parallel_state import get_fc3_quant_x_group
def fc3_all_gather_and_maybe_unpad_impl(x: torch.Tensor, ) -> torch.Tensor:
@@ -90,6 +33,7 @@ def fc3_all_gather_and_maybe_unpad_impl(x: torch.Tensor, ) -> torch.Tensor:
result[offset:offset + num_tokens_dp] = x[idx, :num_tokens_dp]
offset += num_tokens_dp
x = result
return x

View File

@@ -23,7 +23,8 @@ from torch.distributed import Backend
from vllm.distributed.parallel_state import (GroupCoordinator,
_get_unique_name, _register_group)
from vllm_ascend.distributed.communicator import NPUCommunicator
from vllm_ascend.distributed.device_communicators.npu_communicator import \
NPUCommunicator
from vllm_ascend.utils import create_hccl_pg_options

View File

@@ -411,7 +411,7 @@ class NPUPlatform(Platform):
@classmethod
def get_device_communicator_cls(cls) -> str:
return "vllm_ascend.distributed.communicator.NPUCommunicator"
return "vllm_ascend.distributed.device_communicators.npu_communicator.NPUCommunicator"
@classmethod
def is_pin_memory_available(cls):