[Bugfix] mooncake connector support external dp & update readme (#3579)
### What this PR does / why we need it? mooncake connector support external dp & update readme ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: liziyu <liziyu16@huawei.com>
This commit is contained in:
@@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
vLLM-Ascend now supports prefill-decode (PD) disaggregation with EP (Expert Parallel) options. This guide provides step-by-step instructions to verify these features with constrained resources.
|
vLLM-Ascend now supports prefill-decode (PD) disaggregation with EP (Expert Parallel) options. This guide provides step-by-step instructions to verify these features with constrained resources.
|
||||||
|
|
||||||
Take the Qwen3-235B model as an example: use vllm-ascend v0.11.0rc1 (with vLLM v0.11.0) on 4 Atlas 800T A3 servers to deploy the "2P1D" architecture. Assume the IPs of the prefiller servers are 192.0.0.1 (prefill 1) and 192.0.0.2 (prefill 2), and those of the decoder servers are 192.0.0.3 (decoder 1) and 192.0.0.4 (decoder 2). On each server, use 8 NPUs (16 chips) to deploy one service instance.
|
Take the Qwen3-235B model as an example: use 4 Atlas 800T A3 servers to deploy the "2P1D" architecture. Assume the IPs of the prefiller servers are 192.0.0.1 (prefill 1) and 192.0.0.2 (prefill 2), and those of the decoder servers are 192.0.0.3 (decoder 1) and 192.0.0.4 (decoder 2). On each server, use 8 NPUs (16 chips) to deploy one service instance.
|
||||||
|
|
||||||
## Verify Multi-Node Communication Environment
|
## Verify Multi-Node Communication Environment
|
||||||
|
|
||||||
@@ -30,17 +30,22 @@ for i in {0..15}; do hccn_tool -i $i -net_health -g ; done
|
|||||||
for i in {0..15}; do hccn_tool -i $i -netdetect -g ; done
|
for i in {0..15}; do hccn_tool -i $i -netdetect -g ; done
|
||||||
# View gateway configuration
|
# View gateway configuration
|
||||||
for i in {0..15}; do hccn_tool -i $i -gateway -g ; done
|
for i in {0..15}; do hccn_tool -i $i -gateway -g ; done
|
||||||
# View NPU network configuration
|
```
|
||||||
|
|
||||||
|
2. Check NPU network configuration:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Ensure that the hccn.conf file exists in the environment. If using Docker, mount it into the container.
|
||||||
cat /etc/hccn.conf
|
cat /etc/hccn.conf
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Get NPU IP Addresses
|
3. Get NPU IP Addresses
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
for i in {0..15}; do hccn_tool -i $i -ip -g | grep ipaddr; done
|
for i in {0..15}; do hccn_tool -i $i -ip -g | grep ipaddr; done
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Cross-Node PING Test
|
4. Cross-Node PING Test
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Execute on the target node (replace 'x.x.x.x' with actual npu ip address)
|
# Execute on the target node (replace 'x.x.x.x' with actual npu ip address)
|
||||||
@@ -123,7 +128,7 @@ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packa
|
|||||||
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
--port 8004 \
|
--port 8004 \
|
||||||
--api-server-count 2 \
|
--api-server-count 1 \
|
||||||
--data-parallel-size 2 \
|
--data-parallel-size 2 \
|
||||||
--data-parallel-size-local 2 \
|
--data-parallel-size-local 2 \
|
||||||
--data-parallel-address 192.0.0.1 \
|
--data-parallel-address 192.0.0.1 \
|
||||||
@@ -182,7 +187,7 @@ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packa
|
|||||||
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
--port 8004 \
|
--port 8004 \
|
||||||
--api-server-count 2 \
|
--api-server-count 1 \
|
||||||
--data-parallel-size 2 \
|
--data-parallel-size 2 \
|
||||||
--data-parallel-size-local 2 \
|
--data-parallel-size-local 2 \
|
||||||
--data-parallel-address 192.0.0.2 \
|
--data-parallel-address 192.0.0.2 \
|
||||||
@@ -241,7 +246,7 @@ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packa
|
|||||||
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
--port 8004 \
|
--port 8004 \
|
||||||
--api-server-count 4 \
|
--api-server-count 1 \
|
||||||
--data-parallel-size 32 \
|
--data-parallel-size 32 \
|
||||||
--data-parallel-size-local 16 \
|
--data-parallel-size-local 16 \
|
||||||
--data-parallel-address 192.0.0.3 \
|
--data-parallel-address 192.0.0.3 \
|
||||||
@@ -370,7 +375,7 @@ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packa
|
|||||||
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
--port 8004 \
|
--port 8004 \
|
||||||
--api-server-count 2 \
|
--api-server-count 1 \
|
||||||
--data-parallel-size 2 \
|
--data-parallel-size 2 \
|
||||||
--data-parallel-size-local 2 \
|
--data-parallel-size-local 2 \
|
||||||
--data-parallel-address 192.0.0.1 \
|
--data-parallel-address 192.0.0.1 \
|
||||||
@@ -397,8 +402,8 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
|||||||
"tp_size": 8
|
"tp_size": 8
|
||||||
},
|
},
|
||||||
"decode": {
|
"decode": {
|
||||||
"dp_size": 4,
|
"dp_size": 32,
|
||||||
"tp_size": 8
|
"tp_size": 1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}'
|
}'
|
||||||
@@ -429,7 +434,7 @@ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packa
|
|||||||
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
--port 8004 \
|
--port 8004 \
|
||||||
--api-server-count 2 \
|
--api-server-count 1 \
|
||||||
--data-parallel-size 2 \
|
--data-parallel-size 2 \
|
||||||
--data-parallel-size-local 2 \
|
--data-parallel-size-local 2 \
|
||||||
--data-parallel-address 192.0.0.2 \
|
--data-parallel-address 192.0.0.2 \
|
||||||
@@ -456,8 +461,8 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
|||||||
"tp_size": 8
|
"tp_size": 8
|
||||||
},
|
},
|
||||||
"decode": {
|
"decode": {
|
||||||
"dp_size": 4,
|
"dp_size": 32,
|
||||||
"tp_size": 8
|
"tp_size": 1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}'
|
}'
|
||||||
@@ -488,12 +493,12 @@ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packa
|
|||||||
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
--port 8004 \
|
--port 8004 \
|
||||||
--api-server-count 4 \
|
--api-server-count 1 \
|
||||||
--data-parallel-size 4 \
|
--data-parallel-size 32 \
|
||||||
--data-parallel-size-local 2 \
|
--data-parallel-size-local 16 \
|
||||||
--data-parallel-address 192.0.0.3 \
|
--data-parallel-address 192.0.0.3 \
|
||||||
--data-parallel-rpc-port 5964 \
|
--data-parallel-rpc-port 5964 \
|
||||||
--tensor-parallel-size 8 \
|
--tensor-parallel-size 1 \
|
||||||
--enable-expert-parallel \
|
--enable-expert-parallel \
|
||||||
--seed 1024 \
|
--seed 1024 \
|
||||||
--distributed-executor-backend mp \
|
--distributed-executor-backend mp \
|
||||||
@@ -517,8 +522,8 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
|||||||
"tp_size": 8
|
"tp_size": 8
|
||||||
},
|
},
|
||||||
"decode": {
|
"decode": {
|
||||||
"dp_size": 4,
|
"dp_size": 32,
|
||||||
"tp_size": 8
|
"tp_size": 1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}'
|
}'
|
||||||
@@ -550,12 +555,12 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
|||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
--port 8004 \
|
--port 8004 \
|
||||||
--headless \
|
--headless \
|
||||||
--data-parallel-size 4 \
|
--data-parallel-size 32 \
|
||||||
--data-parallel-size-local 2 \
|
--data-parallel-size-local 16 \
|
||||||
--data-parallel-start-rank 2 \
|
--data-parallel-start-rank 16 \
|
||||||
--data-parallel-address 192.0.0.3 \
|
--data-parallel-address 192.0.0.3 \
|
||||||
--data-parallel-rpc-port 5964 \
|
--data-parallel-rpc-port 5964 \
|
||||||
--tensor-parallel-size 8 \
|
--tensor-parallel-size 1 \
|
||||||
--enable-expert-parallel \
|
--enable-expert-parallel \
|
||||||
--seed 1024 \
|
--seed 1024 \
|
||||||
--distributed-executor-backend mp \
|
--distributed-executor-backend mp \
|
||||||
@@ -579,8 +584,8 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
|||||||
"tp_size": 8
|
"tp_size": 8
|
||||||
},
|
},
|
||||||
"decode": {
|
"decode": {
|
||||||
"dp_size": 4,
|
"dp_size": 32,
|
||||||
"tp_size": 8
|
"tp_size": 1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}'
|
}'
|
||||||
|
|||||||
@@ -499,7 +499,7 @@ class MockVllmConfig:
|
|||||||
self.kv_transfer_config = MagicMock()
|
self.kv_transfer_config = MagicMock()
|
||||||
self.model_config.use_mla = True
|
self.model_config.use_mla = True
|
||||||
self.parallel_config.tensor_parallel_size = 2
|
self.parallel_config.tensor_parallel_size = 2
|
||||||
self.parallel_config.data_parallel_rank_local = 0
|
self.parallel_config.data_parallel_rank = 0
|
||||||
self.parallel_config.data_parallel_size_local = 1
|
self.parallel_config.data_parallel_size_local = 1
|
||||||
self.cache_config.block_size = 16
|
self.cache_config.block_size = 16
|
||||||
self.kv_transfer_config.kv_port = 5000
|
self.kv_transfer_config.kv_port = 5000
|
||||||
@@ -1085,7 +1085,7 @@ class TestMooncakeConnectorWorker(unittest.TestCase):
|
|||||||
|
|
||||||
config.parallel_config = MagicMock()
|
config.parallel_config = MagicMock()
|
||||||
config.parallel_config.tensor_parallel_size = 2
|
config.parallel_config.tensor_parallel_size = 2
|
||||||
config.parallel_config.data_parallel_rank_local = 0
|
config.parallel_config.data_parallel_rank = 0
|
||||||
config.parallel_config.data_parallel_size_local = 1
|
config.parallel_config.data_parallel_size_local = 1
|
||||||
config.kv_transfer_config.kv_port = 8000
|
config.kv_transfer_config.kv_port = 8000
|
||||||
config.kv_transfer_config.kv_role = 'worker'
|
config.kv_transfer_config.kv_role = 'worker'
|
||||||
|
|||||||
@@ -701,7 +701,7 @@ class MooncakeConnectorScheduler:
|
|||||||
# Handshake base port
|
# Handshake base port
|
||||||
self.side_channel_port = (
|
self.side_channel_port = (
|
||||||
vllm_config.kv_transfer_config.kv_port +
|
vllm_config.kv_transfer_config.kv_port +
|
||||||
vllm_config.parallel_config.data_parallel_rank_local *
|
vllm_config.parallel_config.data_parallel_rank *
|
||||||
vllm_config.parallel_config.tensor_parallel_size)
|
vllm_config.parallel_config.tensor_parallel_size)
|
||||||
|
|
||||||
# Requests that need to start recv.
|
# Requests that need to start recv.
|
||||||
@@ -891,7 +891,7 @@ class MooncakeConnectorWorker:
|
|||||||
self.tp_rank = get_tensor_model_parallel_rank()
|
self.tp_rank = get_tensor_model_parallel_rank()
|
||||||
self.tp_size = vllm_config.parallel_config.tensor_parallel_size
|
self.tp_size = vllm_config.parallel_config.tensor_parallel_size
|
||||||
self.tp_group = get_tp_group()
|
self.tp_group = get_tp_group()
|
||||||
self.dp_rank = vllm_config.parallel_config.data_parallel_rank_local
|
self.dp_rank = vllm_config.parallel_config.data_parallel_rank
|
||||||
self.dp_size = vllm_config.parallel_config.data_parallel_size_local
|
self.dp_size = vllm_config.parallel_config.data_parallel_size_local
|
||||||
self.kv_caches: dict[str, torch.Tensor] = {}
|
self.kv_caches: dict[str, torch.Tensor] = {}
|
||||||
self.side_channel_host = get_ip()
|
self.side_channel_host = get_ip()
|
||||||
@@ -902,7 +902,7 @@ class MooncakeConnectorWorker:
|
|||||||
# Handshake base port
|
# Handshake base port
|
||||||
self.side_channel_port = (
|
self.side_channel_port = (
|
||||||
vllm_config.kv_transfer_config.kv_port +
|
vllm_config.kv_transfer_config.kv_port +
|
||||||
vllm_config.parallel_config.data_parallel_rank_local *
|
vllm_config.parallel_config.data_parallel_rank *
|
||||||
vllm_config.parallel_config.tensor_parallel_size)
|
vllm_config.parallel_config.tensor_parallel_size)
|
||||||
self.handshake_port = self.side_channel_port + self.tp_rank
|
self.handshake_port = self.side_channel_port + self.tp_rank
|
||||||
self.sockets: dict = {}
|
self.sockets: dict = {}
|
||||||
|
|||||||
Reference in New Issue
Block a user