Disaggregate prefill for kv cache register style (#950)
### What this PR does / why we need it?
This PR adopts `LLMDataDist` for KV cache registration and a `pull_blocks`-style
disaggregated prefill implementation. The interface implementation
mainly follows the design of the NIXL PR:
https://github.com/vllm-project/vllm/pull/17751/files#diff-7eaad0b7dee0626bf29d10081b0f0c5e3ea15a4af97e7b182a4e0d35f8346953
This PR can be tested with the following steps:
- Generate the rank table for all machines.
- Execute `toy_proxy.py` to launch the disaggregated prefill proxy server,
specifying the prefill IP and port and the decode IP and port.
- Run the prefill server and the decode server.
- Send the request to the disaggregated prefill proxy.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.9.2
- vLLM main:
8d0a01a5f2
---------
Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
Signed-off-by: machenglong <machenglong_yewu@cmss.chinamobile.com>
Signed-off-by: liziyu179 <3475441767@qq.com>
Signed-off-by: underfitc <hucong24@huawei.com>
Signed-off-by: zouyida2052 <zouyida@huawei.com>
Signed-off-by: liziyu <liziyu16@huawei.com>
Signed-off-by: underfituu <hzhucong@163.com>
Co-authored-by: machenglong <machenglong_yewu@cmss.chinamobile.com>
Co-authored-by: liziyu179 <3475441767@qq.com>
Co-authored-by: underfitc <hucong24@huawei.com>
Co-authored-by: zouyida2052 <zouyida@huawei.com>
Co-authored-by: liziyu <liziyu16@huawei.com>
Co-authored-by: underfituu <hzhucong@163.com>
This commit is contained in:
79
examples/disaggregated_prefill_v1/gen_ranktable.sh
Normal file
79
examples/disaggregated_prefill_v1/gen_ranktable.sh
Normal file
@@ -0,0 +1,79 @@
|
||||
#!/bin/bash
#
# Launch gen_ranktable.py through torchrun (one process per node) unless
# ./ranktable.json already exists.
#
# Usage:
#   bash gen_ranktable.sh --ips <ip0> [<ip1> ...] \
#       --prefill-device-cnt <n> --decode-device-cnt <n> \
#       [--npus-per-node <n>] [--network-card-name <ifname>]
#
# Options:
#   --ips                 Addresses of all participating nodes. The first one
#                         is used as the torchrun master address; the index of
#                         the local machine's address in this list becomes its
#                         node rank.
#   --npus-per-node       Devices per node (default: 8).
#   --network-card-name   Exported to torchrun as GLOO_SOCKET_IFNAME.
#   --prefill-device-cnt  Forwarded to gen_ranktable.py.
#   --decode-device-cnt   Forwarded to gen_ranktable.py.
#
# Environment:
#   GEN_RANKTABLE  When non-empty, gen_ranktable.py is run even if
#                  ${PWD}/ranktable.json already exists.
#
# NOTE: strict mode (set -euo pipefail) is intentionally not enabled because
# the sourced Ascend set_env.sh is outside this script's control.

# Print an error message to stderr and abort.
die() {
  echo "[Error] $*" >&2
  exit 1
}

# Abort unless the option in $1 is followed by a value.
# $1 - option name, $2 - number of remaining arguments (option included).
need_value() {
  (( $2 >= 2 )) || die "option \"$1\" requires a value"
}

source /usr/local/Ascend/ascend-toolkit/set_env.sh
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH}

NPUS_PER_NODE=8
# Start from an empty list so that "+=" below never appends to stale content.
IPs=()

while [[ $# -gt 0 ]]; do
  case "$1" in
    --ips)
      shift
      # Consume every following word up to the next "--option".
      while [[ $# -gt 0 && "$1" != --* ]]; do
        IPs+=("$1")
        shift
      done
      ;;
    --npus-per-node)
      need_value "$1" "$#"
      NPUS_PER_NODE="$2"
      shift 2
      ;;
    --network-card-name)
      need_value "$1" "$#"
      NETWORK_CARD_NAME="$2"
      shift 2
      ;;
    --prefill-device-cnt)
      need_value "$1" "$#"
      PREFILL_DEVICE_CNT="$2"
      shift 2
      ;;
    --decode-device-cnt)
      need_value "$1" "$#"
      DECODE_DEVICE_CNT="$2"
      shift 2
      ;;
    *)
      # Without this arm an unrecognised argument is never shifted and the
      # surrounding loop spins forever.
      die "unknown argument \"$1\""
      ;;
  esac
done

(( ${#IPs[@]} > 0 )) || die "\"--ips\" must list at least one address"
[[ "${NPUS_PER_NODE}" =~ ^[1-9][0-9]*$ ]] \
  || die "\"--npus-per-node\" must be a positive integer, got \"${NPUS_PER_NODE}\""

# All addresses assigned to this machine, split on whitespace.
read -r -a LOCAL_HOSTS <<< "$(hostname -I)"
LOCAL_HOST="127.0.0.1"
MASTER_ADDR="${IPs[0]}"
MASTER_PORT=6657
NNODES=${#IPs[@]}
# Left empty until a local address is found in --ips, so the check below can
# actually detect a machine that is not part of the list.
NODE_RANK=""

for i in "${!IPs[@]}"; do
  ip="${IPs[$i]}"
  for local_host in "${LOCAL_HOSTS[@]}"; do
    if [[ "${local_host}" == "${ip}" ]]; then
      LOCAL_HOST="${local_host}"
      NODE_RANK="${i}"
      break 2
    fi
  done
done

if [[ -z "${NODE_RANK}" ]]; then
  die "para \"NODE_RANK\" must be defined: none of the local addresses (${LOCAL_HOSTS[*]}) appears in --ips (${IPs[*]})"
fi

WORLD_SIZE=$(( NPUS_PER_NODE * NNODES ))
RANKSTART=$(( NPUS_PER_NODE * NODE_RANK ))

echo "========>param:"
echo "LOCAL_HOST: ${LOCAL_HOST}"
echo "WORLD_SIZE:  ${WORLD_SIZE}"
echo "RANKSTART: ${RANKSTART}"
echo "NNODES: ${NNODES}"
echo "NODE_RANK: ${NODE_RANK}"
echo "==============="

if [[ -n "${GEN_RANKTABLE:-}" || ! -e "${PWD}/ranktable.json" ]]; then
  # An empty count would make its flag swallow the next option on the
  # gen_ranktable.py command line, so fail early with a clear message.
  [[ -n "${PREFILL_DEVICE_CNT:-}" ]] || die "\"--prefill-device-cnt\" is required"
  [[ -n "${DECODE_DEVICE_CNT:-}" ]] || die "\"--decode-device-cnt\" is required"

  GLOO_SOCKET_IFNAME="${NETWORK_CARD_NAME:-}" torchrun \
    --nproc_per_node 1 \
    --nnodes "${NNODES}" \
    --node_rank "${NODE_RANK}" \
    --master_addr "${MASTER_ADDR}" \
    --master_port "${MASTER_PORT}" \
    gen_ranktable.py \
    --local-host "${LOCAL_HOST}" \
    --prefill-device-cnt "${PREFILL_DEVICE_CNT}" \
    --decode-device-cnt "${DECODE_DEVICE_CNT}"
fi
|
||||
Reference in New Issue
Block a user