diff --git a/examples/external_online_dp/README.md b/examples/external_online_dp/README.md
new file mode 100644
index 0000000..4681de5
--- /dev/null
+++ b/examples/external_online_dp/README.md
@@ -0,0 +1,46 @@
+This example shows how to use `launch_online_dp.py` to launch an external data parallel (DP) server in vLLM. You can bring up an external DP server by following the steps below.
+
+### Modify parameters in `run_dp_template.sh`
+`run_dp_template.sh` is a template script used to launch each DP vLLM instance separately. It is invoked by `launch_online_dp.py` in multiple processes, and most of its configuration is filled in by `launch_online_dp.py`. The parameters you need to set manually are:
+
+1. The IP and socket_ifname of your machine. If running on multiple nodes, make sure the script on each node is set with the correct IP and socket_ifname of that node.
+2. vLLM serving parameters, including the model path. Note that the port, the DP-related parameters, and the TP size are set by `launch_online_dp.py`; all other vLLM parameters in this file only serve as an example, and you are free to modify them for your use case.
+
+### Run `launch_online_dp.py` with command-line arguments
+The arguments that can be set by the user are:
+
+1. `--dp-size`: global data parallel size, must be set
+2. `--tp-size`: tensor parallel size, default 1
+3. `--dp-size-local`: local data parallel size, defaults to `dp_size`
+4. `--dp-rank-start`: starting rank for data parallel, default 0
+5. `--dp-address`: IP address of the data parallel master node, must be set
+6. `--dp-rpc-port`: port of the data parallel master node, default 12345
+7. `--vllm-start-port`: starting port of the vLLM serving instances, default 9000
+
+An example of running external DP on a single node:
+```bash
+cd examples/external_online_dp
+# run DP4 TP4 on a node with 16 NPUs
+python launch_online_dp.py --dp-size 4 --tp-size 4 --dp-size-local 4 --dp-rank-start 0 --dp-address x.x.x.x --dp-rpc-port 12342
+```
+
+An example of running external DP on two nodes:
+```bash
+cd examples/external_online_dp
+# run DP4 TP4 on two nodes with 8 NPUs each
+
+# On node 0:
+python launch_online_dp.py --dp-size 4 --tp-size 4 --dp-size-local 2 --dp-rank-start 0 --dp-address x.x.x.x --dp-rpc-port 12342
+
+# On node 1:
+python launch_online_dp.py --dp-size 4 --tp-size 4 --dp-size-local 2 --dp-rank-start 2 --dp-address x.x.x.x --dp-rpc-port 12342
+```
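+
+Once the instances are up, you can sanity-check any of them through vLLM's OpenAI-compatible API. A minimal sketch, assuming the default `--vllm-start-port 9000` and the example `--served-model-name dsv3` from `run_dp_template.sh`:
+```bash
+# Query the instance of local DP rank 0 (it listens on --vllm-start-port).
+# "dsv3" is the served-model-name set in run_dp_template.sh.
+curl http://localhost:9000/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{"model": "dsv3", "prompt": "Hello", "max_tokens": 16}'
+```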
diff --git a/examples/external_online_dp/launch_online_dp.py b/examples/external_online_dp/launch_online_dp.py
new file mode 100644
index 0000000..0045ecd
--- /dev/null
+++ b/examples/external_online_dp/launch_online_dp.py
@@ -0,0 +1,104 @@
+import argparse
+import multiprocessing
+import os
+import subprocess
+import sys
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--dp-size",
+        type=int,
+        required=True,
+        help="Global data parallel size."
+    )
+    parser.add_argument(
+        "--tp-size",
+        type=int,
+        default=1,
+        help="Tensor parallel size."
+    )
+    parser.add_argument(
+        "--dp-size-local",
+        type=int,
+        default=-1,
+        help="Local data parallel size. Defaults to --dp-size."
+    )
+    parser.add_argument(
+        "--dp-rank-start",
+        type=int,
+        default=0,
+        help="Starting rank for data parallel."
+    )
+    parser.add_argument(
+        "--dp-address",
+        type=str,
+        required=True,
+        help="IP address of the data parallel master node."
+    )
+    parser.add_argument(
+        "--dp-rpc-port",
+        type=int,
+        default=12345,
+        help="Port of the data parallel master node."
+    )
+    parser.add_argument(
+        "--vllm-start-port",
+        type=int,
+        default=9000,
+        help="Starting port for the vLLM serving instances."
+    )
+    return parser.parse_args()
+
+
+args = parse_args()
+dp_size = args.dp_size
+tp_size = args.tp_size
+dp_size_local = args.dp_size_local
+if dp_size_local == -1:
+    dp_size_local = dp_size
+dp_rank_start = args.dp_rank_start
+dp_address = args.dp_address
+dp_rpc_port = args.dp_rpc_port
+vllm_start_port = args.vllm_start_port
+
+
+def run_command(visible_devices, dp_rank, vllm_engine_port):
+    # Launch one vLLM instance through the template script. The positional
+    # arguments must match the order expected by run_dp_template.sh.
+    command = [
+        "bash",
+        "./run_dp_template.sh",
+        visible_devices,
+        str(vllm_engine_port),
+        str(dp_size),
+        str(dp_rank),
+        dp_address,
+        str(dp_rpc_port),
+        str(tp_size),
+    ]
+    subprocess.run(command, check=True)
+
+
+if __name__ == "__main__":
+    template_path = "./run_dp_template.sh"
+    if not os.path.exists(template_path):
+        print(f"Template file {template_path} does not exist.")
+        sys.exit(1)
+
+    processes = []
+    for i in range(dp_size_local):
+        dp_rank = dp_rank_start + i
+        vllm_engine_port = vllm_start_port + i
+        # Each local DP rank gets tp_size consecutive devices.
+        visible_devices = ",".join(
+            str(x) for x in range(i * tp_size, (i + 1) * tp_size))
+        process = multiprocessing.Process(target=run_command,
+                                          args=(visible_devices, dp_rank,
+                                                vllm_engine_port))
+        processes.append(process)
+        process.start()
+
+    for process in processes:
+        process.join()
diff --git a/examples/external_online_dp/run_dp_template.sh b/examples/external_online_dp/run_dp_template.sh
new file mode 100644
index 0000000..661bdfa
--- /dev/null
+++ b/examples/external_online_dp/run_dp_template.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Positional arguments (filled in by launch_online_dp.py):
+#   $1: ASCEND_RT_VISIBLE_DEVICES   $2: serving port   $3: dp-size
+#   $4: dp-rank   $5: dp-address   $6: dp-rpc-port   $7: tp-size
+
+export HCCL_IF_IP=your_ip_here
+export GLOO_SOCKET_IFNAME=your_socket_ifname_here
+export TP_SOCKET_IFNAME=your_socket_ifname_here
+export HCCL_SOCKET_IFNAME=your_socket_ifname_here
+export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=your_rank_table_path_here
+export VLLM_LOGGING_LEVEL="info"
+export OMP_PROC_BIND=false
+export OMP_NUM_THREADS=10
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export HCCL_DETERMINISTIC=True
+export HCCL_BUFFSIZE=1024
+export TASK_QUEUE_ENABLE=1
+
+export VLLM_USE_V1=1
+
+export ASCEND_RT_VISIBLE_DEVICES=$1
+
+vllm serve model_path \
+    --host 0.0.0.0 \
+    --port $2 \
+    --data-parallel-size $3 \
+    --data-parallel-rank $4 \
+    --data-parallel-address $5 \
+    --data-parallel-rpc-port $6 \
+    --tensor-parallel-size $7 \
+    --enable-expert-parallel \
+    --seed 1024 \
+    --served-model-name dsv3 \
+    --max-model-len 3500 \
+    --max-num-batched-tokens 3500 \
+    --max-num-seqs 28 \
+    --trust-remote-code \
+    --gpu-memory-utilization 0.9 \
+    --quantization ascend \
+    --speculative-config '{"num_speculative_tokens": 1, "method": "deepseek_mtp"}' \
+    --kv-transfer-config \
+    '{"kv_connector": "LLMDataDistCMgrConnector",
+      "kv_buffer_device": "npu",
+      "kv_role": "kv_consumer",
+      "kv_parallel_size": "1",
+      "kv_port": "20001",
+      "engine_id": "0",
+      "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
+     }' \
+    --additional-config \
+    '{"ascend_scheduler_config": {"enabled": true}, "torchair_graph_config": {"enabled": true, "enable_kv_nz": false, "enable_multistream_moe": false, "graph_batch_size": [28]}, "enable_weight_nz_layout": true}'
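+
+# For reference, a direct invocation of this template looks like the sketch
+# below; launch_online_dp.py normally builds this call, and the device list,
+# ports, and address here are placeholder values:
+#
+#   bash run_dp_template.sh 0,1,2,3 9000 4 0 x.x.x.x 12342 4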