[DP] External dp server starter (#2685)
This PR re-implements external-dp starter based on vllm's support for
external dp.
- vLLM version: v0.10.1.1
- vLLM main:
f38035c123
---------
Signed-off-by: whx-sjtu <2952154980@qq.com>
This commit is contained in:
38
examples/external_online_dp/README.md
Normal file
38
examples/external_online_dp/README.md
Normal file
@@ -0,0 +1,38 @@
|
||||
Here is an example guiding how to use `launch_online_dp.py` to launch external dp server in vllm. User can easily launch external dp server following the steps below:
|
||||
|
||||
### Modify parameters in `run_dp_template.sh`
|
||||
`run_dp_template.sh` is a template script used to launch each DP vLLM instance separately. It is invoked by `launch_online_dp.py` in multiple subprocesses, and most of its configuration is set by `launch_online_dp.py`. Parameters you need to set manually include:
|
||||
|
||||
1. The IP and socket_ifname of your machine. If running on multiple nodes, please make sure the script on each node has been set with the correct IP and socket_ifname of that node.
|
||||
2. vLLM serving related parameters, including model_path and other configurations. Note that the port, DP-related parameters and tp_size are set by `launch_online_dp.py`; all the other vLLM parameters in this file only serve as an example and you are free to modify them according to your purpose.
|
||||
|
||||
### Run `launch_online_dp.py` with command-line arguments
|
||||
All the arguments that can be set by users are:
|
||||
|
||||
1. `--dp-size`: global data parallel size, must be set
|
||||
2. `--tp-size`: tensor parallel size, default 1
|
||||
3. `--dp-size-local`: local data parallel size, defaults to `dp_size`
|
||||
4. `--dp-rank-start`: Starting rank for data parallel, default 0
|
||||
5. `--dp-address`: IP address of data parallel master node
|
||||
6. `--dp-rpc-port`: Port of data parallel master node, default 12345
|
||||
7. `--vllm-start-port`: Starting port of vLLM serving instances, default 9000
|
||||
|
||||
An example of running external DP in one single node:
|
||||
```bash
|
||||
cd examples/external_online_dp
|
||||
# running DP4 TP4 in a node with 16 NPUs
|
||||
python launch_online_dp.py --dp-size 4 --tp-size 4 --dp-size-local 4 --dp-rank-start 0 --dp-address x.x.x.x --dp-rpc-port 12342
|
||||
```
|
||||
|
||||
An example of running external DP in two nodes:
|
||||
```bash
|
||||
cd examples/external_online_dp
|
||||
# running DP4 TP4 in two nodes with 8 NPUs each
|
||||
|
||||
# On node 0:
|
||||
python launch_online_dp.py --dp-size 4 --tp-size 4 --dp-size-local 2 --dp-rank-start 0 --dp-address x.x.x.x --dp-rpc-port 12342
|
||||
|
||||
# On node 1:
|
||||
python launch_online_dp.py --dp-size 4 --tp-size 4 --dp-size-local 2 --dp-rank-start 2 --dp-address x.x.x.x --dp-rpc-port 12342
|
||||
```
|
||||
|
||||
97
examples/external_online_dp/launch_online_dp.py
Normal file
97
examples/external_online_dp/launch_online_dp.py
Normal file
@@ -0,0 +1,97 @@
|
||||
import argparse
|
||||
import multiprocessing
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
def parse_args():
    """Parse command-line arguments for launching external-DP vLLM servers.

    Returns:
        argparse.Namespace with fields: dp_size, tp_size, dp_size_local,
        dp_rank_start, dp_address, dp_rpc_port, vllm_start_port.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dp-size",
        type=int,
        required=True,
        help="Data parallel size."
    )
    parser.add_argument(
        "--tp-size",
        type=int,
        default=1,
        help="Tensor parallel size."
    )
    parser.add_argument(
        "--dp-size-local",
        type=int,
        default=-1,  # -1 is a sentinel meaning "use the global dp_size"
        help="Local data parallel size."
    )
    parser.add_argument(
        "--dp-rank-start",
        type=int,
        default=0,
        help="Starting rank for data parallel."
    )
    parser.add_argument(
        "--dp-address",
        type=str,
        required=True,
        help="IP address for data parallel master node."
    )
    # NOTE: type=str is kept for interface compatibility (the value is passed
    # straight into a shell command line). The default must therefore be a
    # string too: argparse does not convert defaults, and an int default
    # would later crash subprocess.run, which requires str argv elements.
    parser.add_argument(
        "--dp-rpc-port",
        type=str,
        default="12345",
        help="Port for data parallel master node."
    )
    parser.add_argument(
        "--vllm-start-port",
        type=int,
        default=9000,
        help="Starting port for the engine."
    )
    return parser.parse_args()
|
||||
|
||||
# Parse CLI arguments at import time; this module is intended to be run
# directly as a launcher script.
args = parse_args()
dp_size = args.dp_size            # global data-parallel world size
tp_size = args.tp_size            # tensor-parallel size of each instance
dp_size_local = args.dp_size_local
# -1 sentinel: all DP ranks run on this node (single-node launch).
if dp_size_local == -1:
    dp_size_local = dp_size
dp_rank_start = args.dp_rank_start    # first DP rank hosted on this node
dp_address = args.dp_address          # IP of the DP master node
dp_rpc_port = args.dp_rpc_port        # RPC port of the DP master node
vllm_start_port = args.vllm_start_port  # base serving port; instance i uses base + i
|
||||
|
||||
def run_command(visible_devices, dp_rank, vllm_engine_port):
    """Launch one vLLM serving instance through the template shell script.

    Args:
        visible_devices: Comma-separated device IDs for this DP rank
            (becomes ASCEND_RT_VISIBLE_DEVICES in the template).
        dp_rank: Global data-parallel rank of this instance.
        vllm_engine_port: HTTP serving port for this instance.

    Raises:
        subprocess.CalledProcessError: if the launched script exits non-zero
            (check=True).
    """
    command = [
        "bash",
        "./run_dp_template.sh",
        visible_devices,
        str(vllm_engine_port),
        str(dp_size),
        str(dp_rank),
        dp_address,
        # str() guards against a non-string value (e.g. an int default from
        # argparse); subprocess.run requires every argv element to be a str.
        str(dp_rpc_port),
        str(tp_size),
    ]
    subprocess.run(command, check=True)
|
||||
|
||||
if __name__ == "__main__":
    # The launcher drives everything through the template script; refuse to
    # start if it is missing.
    template_path = "./run_dp_template.sh"
    if not os.path.exists(template_path):
        print(f"Template file {template_path} does not exist.")
        sys.exit(1)

    processes = []
    for i in range(dp_size_local):
        dp_rank = dp_rank_start + i          # global DP rank of instance i
        vllm_engine_port = vllm_start_port + i
        # Each local DP rank gets its own contiguous slice of tp_size devices.
        visible_devices = ",".join(
            str(x) for x in range(i * tp_size, (i + 1) * tp_size))
        process = multiprocessing.Process(target=run_command,
                                          args=(visible_devices, dp_rank,
                                                vllm_engine_port))
        processes.append(process)
        process.start()

    # Block until every local vLLM instance exits.
    for process in processes:
        process.join()
|
||||
46
examples/external_online_dp/run_dp_template.sh
Normal file
46
examples/external_online_dp/run_dp_template.sh
Normal file
@@ -0,0 +1,46 @@
|
||||
#!/bin/bash
# Template invoked by launch_online_dp.py once per local DP rank:
#   bash run_dp_template.sh <visible_devices> <port> <dp_size> <dp_rank> \
#                           <dp_address> <dp_rpc_port> <tp_size>
# Fill in the per-node values (IP, socket ifname, rank table path) below
# before use. The model path and serving flags are examples — adjust freely.

# Per-node network configuration (must match this node).
export HCCL_IF_IP=your_ip_here
export GLOO_SOCKET_IFNAME=your_socket_ifname_here
export TP_SOCKET_IFNAME=your_socket_ifname_here
export HCCL_SOCKET_IFNAME=your_socket_ifname_here
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=your_rank_table_path_here
export VLLM_LOGGING_LEVEL="info"
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_DETERMINISTIC=True
export HCCL_BUFFSIZE=1024
export TASK_QUEUE_ENABLE=1

export VLLM_USE_V1=1

# Devices assigned to this DP rank (comma-separated list from the launcher).
# Quoted to avoid word splitting of launcher-supplied arguments.
export ASCEND_RT_VISIBLE_DEVICES="$1"

vllm serve model_path \
    --host 0.0.0.0 \
    --port "$2" \
    --data-parallel-size "$3" \
    --data-parallel-rank "$4" \
    --data-parallel-address "$5" \
    --data-parallel-rpc-port "$6" \
    --tensor-parallel-size "$7" \
    --enable-expert-parallel \
    --seed 1024 \
    --served-model-name dsv3 \
    --max-model-len 3500 \
    --max-num-batched-tokens 3500 \
    --max-num-seqs 28 \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --quantization ascend \
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
    --kv-transfer-config \
    '{"kv_connector": "LLMDataDistCMgrConnector",
    "kv_buffer_device": "npu",
    "kv_role": "kv_consumer",
    "kv_parallel_size": "1",
    "kv_port": "20001",
    "engine_id": "0",
    "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
    }' \
    --additional-config \
    '{"ascend_scheduler_config": {"enabled": true}, "torchair_graph_config":{"enabled":true,"enable_kv_nz":false, "enable_multistream_moe":false, "graph_batch_size":[28]}, "enable_weight_nz_layout":true}'
|
||||
Reference in New Issue
Block a user