### What this PR does / why we need it? - This PR proposes a P2P version of Disaggregated Prefill based on llm_datadist, which manages the data transfer. - This solution reworks the previous offline, single-node Disaggregated Prefill solution and now supports multi-node deployment and online serving. - Currently this solution supports the 1P1D scenario of DeepSeek hybrid parallelism (P: TP+EP, D: DP+EP). Note that the xPyD scenario is considered in the solution design and will be supported soon within the v1 engine. --------- Signed-off-by: hw_whx <wanghexiang7@huawei.com> Signed-off-by: ganyi <pleaplusone.gy@gmail.com> Co-authored-by: hw_whx <wanghexiang7@huawei.com> Co-authored-by: ganyi <pleaplusone.gy@gmail.com>
30 lines
789 B
Bash
30 lines
789 B
Bash
# Launches one `vllm serve` instance configured as data-parallel rank 0 of 2.
# Every setting is exported so the `vllm serve` child process inherits it.

# Network settings: one host IP and one NIC name shared by the GLOO, TP and
# HCCL *_SOCKET_IFNAME variables.
# NOTE(review): 2.0.0.0 and enp189s0f0 look like site-specific placeholders;
# confirm and adjust them for the local host before running.
export HCCL_IF_IP=2.0.0.0
export GLOO_SOCKET_IFNAME="enp189s0f0"
export TP_SOCKET_IFNAME="enp189s0f0"
export HCCL_SOCKET_IFNAME="enp189s0f0"

# OpenMP runtime settings.
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100

# Value 0 presumably selects the pre-v1 engine; confirm against the vLLM docs.
export VLLM_USE_V1=0

# Device selection and data-parallel topology for this instance.
export ASCEND_RT_VISIBLE_DEVICES=0,1
export VLLM_DP_SIZE=2
export VLLM_DP_RANK=0
export VLLM_DP_MASTER_IP="2.0.0.0"
export VLLM_DP_MASTER_PORT=40001
export VLLM_DP_PROXY_IP="2.0.0.0"
export VLLM_DP_PROXY_PORT=30002
export VLLM_DP_MONITOR_PORT=30003
# NOTE(review): same value as the --port flag below; presumably the two must
# stay in sync. Verify before changing either one.
export VLLM_HTTP_PORT=20001

# Start the server. The final option deliberately has no trailing backslash,
# so the command ends here instead of continuing onto whatever follows.
vllm serve /data/weights/Qwen2.5-0.5B-Instruct \
  --host 0.0.0.0 \
  --port 20001 \
  --tensor-parallel-size 1 \
  --seed 1024 \
  --served-model-name Qwen \
  --max-model-len 2000 \
  --max-num-batched-tokens 2000 \
  --trust-remote-code \
  --gpu-memory-utilization 0.9