[DOC] Qwen3 PD disaggregation user guide (#2751)

### What this PR does / why we need it?
The PR is for the document of the prefiller&decoder disaggregation
deloyment guide.

The scenario of the guide is:
- Use 3 nodes totally and 2 NPUs on each node
- Qwen3-30B-A3B
- 1P2D
- Expert Parallel

The deployment can be used to verify PD Disggregation / Expert Parallel
features with a slightly less resources.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
No.


- vLLM version: v0.10.1.1
- vLLM main:
e599e2c65e

---------

Signed-off-by: paulyu12 <507435917@qq.com>
This commit is contained in:
yupeng
2025-09-07 10:35:37 +08:00
committed by GitHub
parent b2f77d3aa8
commit a746f8274f
4 changed files with 295 additions and 29 deletions

View File

@@ -17,6 +17,10 @@ parser.add_argument("--decode-device-cnt",
type=int,
required=True,
help="number of decode devices")
parser.add_argument("--local-device-ids",
type=str,
required=False,
help="local device ids")
args = parser.parse_args()
local_host = args.local_host
prefill_device_cnt = args.prefill_device_cnt
@@ -54,39 +58,47 @@ chips_per_card = get_cmd_stdout("npu-smi info -l | grep \"Chip Count\"").split(
"\n")[0].split(":")[1].strip()
chips_per_card = int(chips_per_card)
if args.local_device_ids:
local_device_ids = args.local_device_ids.split(',')
else:
local_device_ids = []
for card_id in range(num_cards):
for chip_id in range(chips_per_card):
device_id = card_id * chips_per_card + chip_id
local_device_ids.append(device_id)
# generate local device list for local rank 0, and gather it to all ranks
local_device_list: list[dict[str, str]] = list()
if local_rank == "0":
super_pod_id = "0"
for card_id in range(num_cards):
for chip_id in range(chips_per_card):
device_id = card_id * chips_per_card + chip_id
if soc_info == AscendSocVersion.A3:
device_ip = get_cmd_stdout(
f"{hccn_tool_path} -i {device_id} -vnic -g | grep ipaddr"
).split(":")[1].strip()
super_device_id = get_cmd_stdout(
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep SDID"
).split(":")[1].strip()
super_pod_id = get_cmd_stdout(
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep \"Super Pod ID\""
).split(":")[1].strip()
else:
device_ip = get_cmd_stdout(
f"{hccn_tool_path} -i {device_id} -ip -g | grep ipaddr"
).split(":")[1].strip()
for idx in range(len(local_device_ids)):
device_id = local_device_ids[idx]
if soc_info == AscendSocVersion.A3:
device_ip = get_cmd_stdout(
f"{hccn_tool_path} -i {device_id} -vnic -g | grep ipaddr"
).split(":")[1].strip()
super_device_id = get_cmd_stdout(
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep SDID"
).split(":")[1].strip()
super_pod_id = get_cmd_stdout(
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep \"Super Pod ID\""
).split(":")[1].strip()
else:
device_ip = get_cmd_stdout(
f"{hccn_tool_path} -i {device_id} -ip -g | grep ipaddr"
).split(":")[1].strip()
device_info = {
"server_id": local_host,
"device_id": str(device_id),
"device_ip": str(device_ip),
}
if soc_info == AscendSocVersion.A3:
device_info.update({
"super_pod_id": str(super_pod_id),
"super_device_id": str(super_device_id)
})
local_device_list.append(device_info)
device_info = {
"server_id": local_host,
"device_id": str(device_id),
"device_ip": str(device_ip),
}
if soc_info == AscendSocVersion.A3:
device_info.update({
"super_pod_id": str(super_pod_id),
"super_device_id": str(super_device_id)
})
local_device_list.append(device_info)
dist.init_process_group(backend=dist.Backend.GLOO)
global_device_list = [None] * dist.get_world_size()

View File

@@ -33,6 +33,11 @@ while [[ $# -gt 0 ]]; do
DECODE_DEVICE_CNT="$1"
shift
;;
--local-device-ids)
shift
LOCAL_DEVICE_IDS="$1"
shift
;;
esac
done
LOCAL_HOSTS=($(hostname -I))
@@ -68,6 +73,10 @@ echo "NNODES": $NNODES
echo "NODE_RANK": $NODE_RANK
echo "==============="
if [ -n "$LOCAL_DEVICE_IDS" ]; then
OPTIONAL_SECTION=" --local-device-ids $LOCAL_DEVICE_IDS"
fi
if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then
GLOO_SOCKET_IFNAME=$NETWORK_CARD_NAME torchrun \
--nproc_per_node 1 \
@@ -75,5 +84,5 @@ if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then
--node_rank ${NODE_RANK} \
--master_addr ${MASTER_ADDR} \
--master_port ${MASTER_PORT} \
gen_ranktable.py --local-host $LOCAL_HOST --prefill-device-cnt $PREFILL_DEVICE_CNT --decode-device-cnt $DECODE_DEVICE_CNT
gen_ranktable.py --local-host $LOCAL_HOST --prefill-device-cnt $PREFILL_DEVICE_CNT --decode-device-cnt $DECODE_DEVICE_CNT $OPTIONAL_SECTION
fi