xc-llm-ascend/examples/disaggregated_prefill/run_decode_server.sh

# Start the vLLM server that acts as the "kv_consumer" side of a
# disaggregated-prefill pair, serving HTTP on 0.0.0.0:20002.
# NOTE(review): every IP address, interface name and port in this script is a
# hard-coded literal and looks like a placeholder -- confirm the real values
# for the target hosts before running.

# Network selection: one HCCL IP, and the same NIC name for the GLOO, TP and
# HCCL socket-interface variables.
export HCCL_IF_IP=2.0.0.0
export GLOO_SOCKET_IFNAME="enp189s0f0" \
       TP_SOCKET_IFNAME="enp189s0f0" \
       HCCL_SOCKET_IFNAME="enp189s0f0"

# OpenMP runtime settings.
export OMP_PROC_BIND=false OMP_NUM_THREADS=100

# Presumably selects the legacy (non-V1) vLLM engine -- verify against the
# vLLM version in use.
export VLLM_USE_V1=0

# Eight devices (0-7) are made visible; this count matches
# --tensor-parallel-size 8 and "kv_parallel_size": 8 below.
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# JSON handed verbatim to --kv-transfer-config. It is kept in single quotes
# so the shell performs no expansion inside it. The two device-IP lists each
# hold eight entries.
kv_transfer_config='{"kv_connector": "AscendSimpleConnector",
    "kv_buffer_device": "npu",
    "kv_role": "kv_consumer",
    "kv_parallel_size": 8,
    "kv_port":"21001",
    "kv_connector_extra_config":
    {"prompt_device_ips": ["1.2.3.1", "1.2.3.2", "1.2.3.3", "1.2.3.4", "1.2.3.5", "1.2.3.6", "1.2.3.7", "1.2.3.8"],
    "decode_device_ips": ["1.2.3.9", "1.2.3.10", "1.2.3.11", "1.2.3.12", "1.2.3.13", "1.2.3.14", "1.2.3.15", "1.2.3.16"],
    "llmdatadist_comm_port": 26000,
    "proxy_ip":"3.0.0.0",
    "proxy_port":"30001",
    "http_port": 10002}
    }'

# Flags are passed in a fixed order; related pairs share a line.
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
    --host 0.0.0.0 --port 20002 \
    --tensor-parallel-size 8 \
    --seed 1024 \
    --served-model-name deepseek \
    --max-model-len 2000 --max-num-batched-tokens 2000 \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --kv-transfer-config "${kv_transfer_config}"
[Disaggregated Prefill] P2P Disaggregated Prefill based on llm_datadist (#694) ### What this PR does / why we need it? - This PR proposes a P2P version of Disaggregated Prefill based on llm_datadist, which manages data transfer. - This solution reconstructs the previous offline single-node Disaggregated Prefill solution, and now supports multi-node and online serving. - Currently this solution supports the 1P1D situation of Deepseek hybrid parallelism (P: TP+EP, D: DP+EP). Note that the xPyD situation is considered in the solution design, and will be supported soon within the v1 engine. --------- Signed-off-by: hw_whx <wanghexiang7@huawei.com> Signed-off-by: ganyi <pleaplusone.gy@gmail.com> Co-authored-by: hw_whx <wanghexiang7@huawei.com> Co-authored-by: ganyi <pleaplusone.gy@gmail.com> 2025-05-01 22:31:36 +08:00			`export HCCL_IF_IP=2.0.0.0`
			`export GLOO_SOCKET_IFNAME="enp189s0f0"`
			`export TP_SOCKET_IFNAME="enp189s0f0"`
			`export HCCL_SOCKET_IFNAME="enp189s0f0"`

			`export OMP_PROC_BIND=false`
			`export OMP_NUM_THREADS=100`

			`export VLLM_USE_V1=0`

			`export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7`


			`vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \`
			`--host 0.0.0.0 \`
			`--port 20002 \`
			`--tensor-parallel-size 8 \`
			`--seed 1024 \`
			`--served-model-name deepseek \`
			`--max-model-len 2000 \`
			`--max-num-batched-tokens 2000 \`
			`--trust-remote-code \`
			`--gpu-memory-utilization 0.9 \`
			`--kv-transfer-config \`
			`'{"kv_connector": "AscendSimpleConnector",`
			`"kv_buffer_device": "npu",`
			`"kv_role": "kv_consumer",`
			`"kv_parallel_size": 8,`
			`"kv_port":"21001",`
			`"kv_connector_extra_config":`
			`{"prompt_device_ips": ["1.2.3.1", "1.2.3.2", "1.2.3.3", "1.2.3.4", "1.2.3.5", "1.2.3.6", "1.2.3.7", "1.2.3.8"],`
			`"decode_device_ips": ["1.2.3.9", "1.2.3.10", "1.2.3.11", "1.2.3.12", "1.2.3.13", "1.2.3.14", "1.2.3.15", "1.2.3.16"],`
			`"llmdatadist_comm_port": 26000,`
			`"proxy_ip":"3.0.0.0",`
			`"proxy_port":"30001",`
			`"http_port": 10002}`
			`}'`