[Refactor]Refactor of vllm_ascend/distributed module (#5719)
### What this PR does / why we need it?
Based on the RFC:https://github.com/vllm-project/vllm-ascend/issues/5604
This PR is a refactoring of vllm_ascend/distributed, moving all
kv_transfer realtaed codes into a dedicated folder, which has already
been done in vLLM
### Does this PR introduce _any_ user-facing change?
NA
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
2f4e6548ef
---------
Signed-off-by: lty <linhebiwen@gmail.com>
This commit is contained in:
@@ -295,7 +295,7 @@ This section assumes that you already have a [Kubernetes](https://kubernetes.io/
|
||||
[2025-12-30 11:01:01] INFO multi_node_config.py:348: Resolving cluster IPs via DNS...
|
||||
[2025-12-30 11:01:01] INFO multi_node_config.py:212: Node 0 envs: {'VLLM_USE_MODELSCOPE': 'True', 'OMP_PROC_BIND': 'False', 'OMP_NUM_THREADS': '100', 'HCCL_BUFFSIZE': '1024', 'SERVER_PORT': '8080', 'NUMEXPR_MAX_THREADS': '128', 'DISAGGREGATED_PREFILL_PROXY_SCRIPT': 'examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py', 'HCCL_IF_IP': '10.0.0.102', 'HCCL_SOCKET_IFNAME': 'eth0', 'GLOO_SOCKET_IFNAME': 'eth0', 'TP_SOCKET_IFNAME': 'eth0', 'LOCAL_IP': '10.0.0.102', 'NIC_NAME': 'eth0', 'MASTER_IP': '10.0.0.102'}
|
||||
[2025-12-30 11:01:01] INFO multi_node_config.py:159: Launching proxy: python examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py --host 10.0.0.102 --port 6000 --prefiller-hosts 10.0.0.102 --prefiller-ports 8080 --decoder-hosts 10.0.0.138 --decoder-ports 8080
|
||||
[2025-12-30 11:01:01] INFO conftest.py:107: Starting server with command: vllm serve vllm-ascend/DeepSeek-V3-W8A8 --host 0.0.0.0 --port 8080 --data-parallel-size 2 --data-parallel-size-local 2 --tensor-parallel-size 8 --seed 1024 --enforce-eager --enable-expert-parallel --max-num-seqs 16 --max-model-len 8192 --max-num-batched-tokens 8192 --quantization ascend --trust-remote-code --no-enable-prefix-caching --gpu-memory-utilization 0.9 --kv-transfer-config {"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", "kv_port": "30000", "engine_id": "0", "kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector", "kv_connector_extra_config": {
|
||||
[2025-12-30 11:01:01] INFO conftest.py:107: Starting server with command: vllm serve vllm-ascend/DeepSeek-V3-W8A8 --host 0.0.0.0 --port 8080 --data-parallel-size 2 --data-parallel-size-local 2 --tensor-parallel-size 8 --seed 1024 --enforce-eager --enable-expert-parallel --max-num-seqs 16 --max-model-len 8192 --max-num-batched-tokens 8192 --quantization ascend --trust-remote-code --no-enable-prefix-caching --gpu-memory-utilization 0.9 --kv-transfer-config {"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", "kv_port": "30000", "engine_id": "0", "kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 2,
|
||||
"tp_size": 8
|
||||
|
||||
@@ -326,7 +326,6 @@ vllm serve /weights/DeepSeek-V3.1-w8a8-mtp-QuaRot \
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 2,
|
||||
@@ -402,7 +401,6 @@ vllm serve /weights/DeepSeek-V3.1-w8a8-mtp-QuaRot \
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30100",
|
||||
"engine_id": "1",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 2,
|
||||
@@ -480,7 +478,6 @@ vllm serve /weights/DeepSeek-V3.1-w8a8-mtp-QuaRot \
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30200",
|
||||
"engine_id": "2",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 2,
|
||||
@@ -558,7 +555,6 @@ vllm serve /weights/DeepSeek-V3.1-w8a8-mtp-QuaRot \
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30300",
|
||||
"engine_id": "3",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 2,
|
||||
|
||||
@@ -316,7 +316,6 @@ Before you start, please
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"use_ascend_direct": true,
|
||||
"prefill": {
|
||||
@@ -391,7 +390,6 @@ Before you start, please
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"use_ascend_direct": true,
|
||||
"prefill": {
|
||||
@@ -469,7 +467,6 @@ Before you start, please
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30100",
|
||||
"engine_id": "1",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"use_ascend_direct": true,
|
||||
"prefill": {
|
||||
@@ -548,7 +545,6 @@ Before you start, please
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30100",
|
||||
"engine_id": "1",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"use_ascend_direct": true,
|
||||
"prefill": {
|
||||
|
||||
@@ -450,7 +450,6 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"use_ascend_direct": true,
|
||||
"prefill": {
|
||||
@@ -516,7 +515,6 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30100",
|
||||
"engine_id": "1",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"use_ascend_direct": true,
|
||||
"prefill": {
|
||||
@@ -583,7 +581,6 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30100",
|
||||
"engine_id": "1",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"use_ascend_direct": true,
|
||||
"prefill": {
|
||||
|
||||
@@ -123,7 +123,6 @@ vllm serve /path_to_weight/DeepSeek-V3.1_w8a8mix_mtp \
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"use_ascend_direct": true,
|
||||
"prefill": {
|
||||
@@ -192,7 +191,6 @@ vllm serve /path_to_weight/DeepSeek-V3.1_w8a8mix_mtp \
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "1",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"use_ascend_direct": true,
|
||||
"prefill": {
|
||||
@@ -259,7 +257,6 @@ vllm serve /path_to_weight/DeepSeek-V3.1_w8a8mix_mtp \
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30200",
|
||||
"engine_id": "3",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 1,
|
||||
|
||||
@@ -280,7 +280,6 @@ vllm serve /path_to_weight/DeepSeek-r1_w8a8_mtp \
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_layerwise_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 2,
|
||||
@@ -340,7 +339,6 @@ vllm serve /path_to_weight/DeepSeek-r1_w8a8_mtp \
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30100",
|
||||
"engine_id": "1",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_layerwise_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 2,
|
||||
@@ -401,7 +399,6 @@ vllm serve /path_to_weight/DeepSeek-r1_w8a8_mtp \
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30200",
|
||||
"engine_id": "2",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_layerwise_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 2,
|
||||
@@ -461,7 +458,6 @@ vllm serve /path_to_weight/DeepSeek-r1_w8a8_mtp \
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30200",
|
||||
"engine_id": "2",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_layerwise_connector",
|
||||
"kv_connector_extra_config": {
|
||||
|
||||
"prefill": {
|
||||
@@ -529,7 +525,6 @@ vllm serve /path_to_weight/DeepSeek-r1_w8a8_mtp \
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 2,
|
||||
@@ -589,7 +584,6 @@ vllm serve /path_to_weight/DeepSeek-r1_w8a8_mtp \
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30100",
|
||||
"engine_id": "1",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 2,
|
||||
@@ -650,7 +644,6 @@ vllm serve /path_to_weight/DeepSeek-r1_w8a8_mtp \
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30200",
|
||||
"engine_id": "2",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 2,
|
||||
@@ -710,7 +703,6 @@ vllm serve /path_to_weight/DeepSeek-r1_w8a8_mtp \
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30200",
|
||||
"engine_id": "2",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 2,
|
||||
|
||||
@@ -173,7 +173,6 @@ vllm serve /model/Qwen2.5-VL-7B-Instruct \
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 1,
|
||||
@@ -216,7 +215,6 @@ vllm serve /model/Qwen2.5-VL-7B-Instruct \
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30100",
|
||||
"engine_id": "1",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 1,
|
||||
|
||||
@@ -137,7 +137,7 @@ spec:
|
||||
- "--trust-remote-code"
|
||||
- "--enforce-eager"
|
||||
- "--kv-transfer-config"
|
||||
- '{"kv_connector":"MooncakeConnectorV1","kv_buffer_device":"npu","kv_role":"kv_producer","kv_parallel_size":1,"kv_port":"20001","engine_id":"0","kv_rank":0,"kv_connector_module_path":"vllm_ascend.distributed.mooncake_connector","kv_connector_extra_config":{"prefill":{"dp_size":2,"tp_size":2},"decode":{"dp_size":2,"tp_size":2}}}'
|
||||
- '{"kv_connector":"MooncakeConnectorV1","kv_buffer_device":"npu","kv_role":"kv_producer","kv_parallel_size":1,"kv_port":"20001","engine_id":"0","kv_rank":0,"kv_connector_extra_config":{"prefill":{"dp_size":2,"tp_size":2},"decode":{"dp_size":2,"tp_size":2}}}'
|
||||
imagePullPolicy: Always
|
||||
resources:
|
||||
limits:
|
||||
@@ -240,7 +240,7 @@ spec:
|
||||
- "--no-enable-prefix-caching"
|
||||
- "--enforce-eager"
|
||||
- "--kv-transfer-config"
|
||||
- '{"kv_connector":"MooncakeConnectorV1","kv_buffer_device":"npu","kv_role":"kv_consumer","kv_parallel_size":1,"kv_port":"20002","engine_id":"1","kv_rank":1,"kv_connector_module_path":"vllm_ascend.distributed.mooncake_connector","kv_connector_extra_config":{"prefill":{"dp_size":2,"tp_size":2},"decode":{"dp_size":2,"tp_size":2}}}'
|
||||
- '{"kv_connector":"MooncakeConnectorV1","kv_buffer_device":"npu","kv_role":"kv_consumer","kv_parallel_size":1,"kv_port":"20002","engine_id":"1","kv_rank":1,"kv_connector_extra_config":{"prefill":{"dp_size":2,"tp_size":2},"decode":{"dp_size":2,"tp_size":2}}}'
|
||||
imagePullPolicy: Always
|
||||
resources:
|
||||
limits:
|
||||
|
||||
@@ -163,8 +163,7 @@ vllm serve vllm-ascend/DeepSeek-R1-W8A8 \
|
||||
"kv_role": "kv_producer",
|
||||
"kv_parallel_size": "1",
|
||||
"kv_port": "20001",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector"
|
||||
"engine_id": "0"
|
||||
}'
|
||||
--additional-config '{"enable_weight_nz_layout":true,"enable_prefill_optimizations":true}'
|
||||
```
|
||||
@@ -230,8 +229,7 @@ vllm serve vllm-ascend/DeepSeek-R1-W8A8 \
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_parallel_size": "1",
|
||||
"kv_port": "20001",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector"
|
||||
"engine_id": "0"
|
||||
}' \
|
||||
--additional-config '{"enable_weight_nz_layout":true}'
|
||||
```
|
||||
@@ -435,8 +433,7 @@ In the PD separation scenario, we provide a optimized configuration.
|
||||
"kv_role": "kv_producer",
|
||||
"kv_parallel_size": "1",
|
||||
"kv_port": "20001",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector"
|
||||
"engine_id": "0"
|
||||
}'
|
||||
```
|
||||
|
||||
@@ -458,8 +455,7 @@ In the PD separation scenario, we provide a optimized configuration.
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_parallel_size": "1",
|
||||
"kv_port": "20001",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector"
|
||||
"engine_id": "0"
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
Reference in New Issue
Block a user