Disaggregate prefill for kv cache register style (#950)

### What this PR does / why we need it? This PR adopts `LLMDataDist` for KV cache registration and a `pull_blocks`-style disaggregated prefill implementation. The interface implementation mainly follows the design of the NIXL PR https://github.com/vllm-project/vllm/pull/17751/files#diff-7eaad0b7dee0626bf29d10081b0f0c5e3ea15a4af97e7b182a4e0d35f8346953 . This PR can be tested with the following steps: - Generate the rank table for all machines. - Execute `toy_proxy.py` to launch the disaggregated prefill proxy server, specifying the prefill IP and port and the decode IP and port. - Run the prefill server and the decode server. - Send the request to the disaggregated prefill proxy. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.9.2 - vLLM main: 8d0a01a5f2 --------- Signed-off-by: ganyi <pleaplusone.gy@gmail.com> Signed-off-by: machenglong <machenglong_yewu@cmss.chinamobile.com> Signed-off-by: liziyu179 <3475441767@qq.com> Signed-off-by: underfitc <hucong24@huawei.com> Signed-off-by: zouyida2052 <zouyida@huawei.com> Signed-off-by: liziyu <liziyu16@huawei.com> Signed-off-by: underfituu <hzhucong@163.com> Co-authored-by: machenglong <machenglong_yewu@cmss.chinamobile.com> Co-authored-by: liziyu179 <3475441767@qq.com> Co-authored-by: underfitc <hucong24@huawei.com> Co-authored-by: zouyida2052 <zouyida@huawei.com> Co-authored-by: liziyu <liziyu16@huawei.com> Co-authored-by: underfituu <hzhucong@163.com>
2025-07-26 17:15:47 +08:00
parent 17a430f7b8
commit df0ec55162
28 changed files with 2833 additions and 144 deletions
--- a/examples/disaggregated_prefill_v1/gen_ranktable.sh
+++ b/examples/disaggregated_prefill_v1/gen_ranktable.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH}
+
+NPUS_PER_NODE=8
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --ips)
+            shift
+            while [[ $# -gt 0 && ! "$1" == --* ]]; do
+                IPs+=("$1")
+                shift
+            done
+            ;;
+        --npus-per-node)
+            shift
+            NPUS_PER_NODE="$1"
+            shift
+            ;;
+        --network-card-name)
+            shift
+            NETWORK_CARD_NAME="$1"
+            shift
+            ;;
+        --prefill-device-cnt)
+            shift
+            PREFILL_DEVICE_CNT="$1"
+            shift
+            ;;
+        --decode-device-cnt)
+            shift
+            DECODE_DEVICE_CNT="$1"
+            shift
+            ;;
+    esac
+done
+LOCAL_HOSTS=($(hostname -I))
+LOCAL_HOST="127.0.0.1"
+MASTER_ADDR=${IPs[0]}
+MASTER_PORT=6657
+NNODES=${#IPs[@]}
+NODE_RANK="8"
+for i in "${!IPs[@]}"; do
+    ip="${IPs[$i]}"
+    for local_host in "${LOCAL_HOSTS[@]}"; do
+        if [[ "$local_host" == "$ip" ]]; then
+            LOCAL_HOST=$local_host
+            NODE_RANK=$i
+            break 2
+        fi
+    done
+done
+
+if [[ $NODE_RANK == "" ]];then
+    echo "[Error] para \"NODE_RANK\" must be defined"
+    exit 1
+fi
+
+WORLD_SIZE=$(($NPUS_PER_NODE * $NNODES))
+RANKSTART=`expr $NPUS_PER_NODE \* $NODE_RANK`
+
+echo "========>param:"
+echo "LOCAL_HOST": $LOCAL_HOST
+echo "WORLD_SIZE: " $WORLD_SIZE
+echo "RANKSTART": $RANKSTART
+echo "NNODES": $NNODES
+echo "NODE_RANK": $NODE_RANK
+echo "==============="
+
+if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then
+    GLOO_SOCKET_IFNAME=$NETWORK_CARD_NAME torchrun \
+        --nproc_per_node 1 \
+        --nnodes ${NNODES} \
+        --node_rank ${NODE_RANK} \
+        --master_addr ${MASTER_ADDR} \
+        --master_port ${MASTER_PORT} \
+        gen_ranktable.py --local-host $LOCAL_HOST --prefill-device-cnt $PREFILL_DEVICE_CNT --decode-device-cnt $DECODE_DEVICE_CNT
+fi