xc-llm-ascend/examples/disaggregated_prefill_v1/gen_ranktable.sh

#!/bin/bash
#
# Generate the rank table for a multi-node disaggregated-prefill deployment.
#
# The script works out this node's rank from its position in the --ips list
# (matched against the addresses reported by `hostname -I`) and then runs
# gen_ranktable.py under torchrun with one process per node.
#
# Usage:
#   gen_ranktable.sh --ips <ip> [<ip> ...]
#                    [--npus-per-node <n>]          (default: 8)
#                    [--network-card-name <ifname>] (exported as GLOO_SOCKET_IFNAME)
#                    [--prefill-device-cnt <n>]
#                    [--decode-device-cnt <n>]
#
# Environment:
#   GEN_RANKTABLE  when non-empty, regenerate even if ./ranktable.json exists.
#
# Exit status: 1 on invalid arguments or when this host is not in --ips;
# otherwise the status of torchrun (or 0 when generation is skipped).

# NOTE: `set -eu` is deliberately not enabled here because the sourced
# environment script is external to this file and may not tolerate it.
source /usr/local/Ascend/ascend-toolkit/set_env.sh
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH}

# Print an error message to stderr and abort the script.
die() {
    echo "[Error] $*" >&2
    exit 1
}

NPUS_PER_NODE=8
IPs=()
while [[ $# -gt 0 ]]; do
    case "$1" in
        --ips)
            shift
            # Consume every following word up to the next --option.
            while [[ $# -gt 0 && "$1" != --* ]]; do
                IPs+=("$1")
                shift
            done
            ;;
        --npus-per-node)
            [[ $# -ge 2 ]] || die "option \"$1\" requires a value"
            NPUS_PER_NODE="$2"
            shift 2
            ;;
        --network-card-name)
            [[ $# -ge 2 ]] || die "option \"$1\" requires a value"
            NETWORK_CARD_NAME="$2"
            shift 2
            ;;
        --prefill-device-cnt)
            [[ $# -ge 2 ]] || die "option \"$1\" requires a value"
            PREFILL_DEVICE_CNT="$2"
            shift 2
            ;;
        --decode-device-cnt)
            [[ $# -ge 2 ]] || die "option \"$1\" requires a value"
            DECODE_DEVICE_CNT="$2"
            shift 2
            ;;
        *)
            # Without this arm an unrecognised word is never shifted and the
            # enclosing loop would spin forever.
            die "unknown argument \"$1\""
            ;;
    esac
done

[[ ${#IPs[@]} -gt 0 ]] || die "para \"--ips\" must list at least one address"
[[ "${NPUS_PER_NODE}" =~ ^[0-9]+$ ]] \
    || die "para \"--npus-per-node\" must be a non-negative integer, got \"${NPUS_PER_NODE}\""

# Addresses of this machine; read -a splits on whitespace without globbing.
read -r -a LOCAL_HOSTS <<< "$(hostname -I)"

MASTER_ADDR="${IPs[0]}"
MASTER_PORT=6657
NNODES=${#IPs[@]}

# The node rank is the index of the first --ips entry that is also one of
# this machine's addresses. Both stay empty when nothing matches so that the
# check below can actually detect the failure.
LOCAL_HOST=""
NODE_RANK=""
for i in "${!IPs[@]}"; do
    ip="${IPs[$i]}"
    for local_host in "${LOCAL_HOSTS[@]}"; do
        if [[ "${local_host}" == "${ip}" ]]; then
            LOCAL_HOST="${local_host}"
            NODE_RANK="${i}"
            break 2
        fi
    done
done

if [[ -z "${NODE_RANK}" ]]; then
    die "para \"NODE_RANK\" must be defined: none of this host's addresses (${LOCAL_HOSTS[*]}) appears in --ips (${IPs[*]})"
fi

WORLD_SIZE=$(( NPUS_PER_NODE * NNODES ))
RANKSTART=$(( NPUS_PER_NODE * NODE_RANK ))

echo "========>param:"
echo "LOCAL_HOST: ${LOCAL_HOST}"
echo "WORLD_SIZE:  ${WORLD_SIZE}"
echo "RANKSTART: ${RANKSTART}"
echo "NNODES: ${NNODES}"
echo "NODE_RANK: ${NODE_RANK}"
echo "==============="

# Generate only when forced via GEN_RANKTABLE or when no rank table exists yet.
if [[ -n "${GEN_RANKTABLE}" || ! -e "${PWD}/ranktable.json" ]]; then
    # Both counts are forwarded unconditionally below, so fail early with a
    # clear message instead of handing the Python script a flag with no value.
    [[ -n "${PREFILL_DEVICE_CNT}" ]] || die "para \"--prefill-device-cnt\" must be defined"
    [[ -n "${DECODE_DEVICE_CNT}" ]] || die "para \"--decode-device-cnt\" must be defined"

    GLOO_SOCKET_IFNAME="${NETWORK_CARD_NAME}" torchrun \
        --nproc_per_node 1 \
        --nnodes "${NNODES}" \
        --node_rank "${NODE_RANK}" \
        --master_addr "${MASTER_ADDR}" \
        --master_port "${MASTER_PORT}" \
        gen_ranktable.py \
        --local-host "${LOCAL_HOST}" \
        --prefill-device-cnt "${PREFILL_DEVICE_CNT}" \
        --decode-device-cnt "${DECODE_DEVICE_CNT}"
fi
Disaggregate prefill for kv cache register style (#950) ### What this PR does / why we need it? This PR adopts `LLMDataDist` for kv cache register and `pull_blocks` style disaggregate prefill implementation. The interface implementation mainly follows the design of NIXL PR https://github.com/vllm-project/vllm/pull/17751/files#diff-7eaad0b7dee0626bf29d10081b0f0c5e3ea15a4af97e7b182a4e0d35f8346953 . This PR can be tested with the following steps: - Generate the rank table for all machines. - Execute `toy_proxy.py` to launch the disaggregate prefill proxy server, specifying the prefill IP and port and the decode IP and port. - Run the prefill server and decode server. - Send the request to the disaggregate prefill proxy. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/8d0a01a5f2b53794e4bc6b734d7b63cb8a9b7d7d --------- Signed-off-by: ganyi <pleaplusone.gy@gmail.com> Signed-off-by: machenglong <machenglong_yewu@cmss.chinamobile.com> Signed-off-by: liziyu179 <3475441767@qq.com> Signed-off-by: underfitc <hucong24@huawei.com> Signed-off-by: zouyida2052 <zouyida@huawei.com> Signed-off-by: liziyu <liziyu16@huawei.com> Signed-off-by: underfituu <hzhucong@163.com> Co-authored-by: machenglong <machenglong_yewu@cmss.chinamobile.com> Co-authored-by: liziyu179 <3475441767@qq.com> Co-authored-by: underfitc <hucong24@huawei.com> Co-authored-by: zouyida2052 <zouyida@huawei.com> Co-authored-by: liziyu <liziyu16@huawei.com> Co-authored-by: underfituu <hzhucong@163.com> 2025-07-26 17:15:47 +08:00			`#!/bin/bash`

			`source /usr/local/Ascend/ascend-toolkit/set_env.sh`
			`export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH}`

			`NPUS_PER_NODE=8`
			`while [[ $# -gt 0 ]]; do`
			`case "$1" in`
			`--ips)`
			`shift`
			`while [[ $# -gt 0 && ! "$1" == --* ]]; do`
			`IPs+=("$1")`
			`shift`
			`done`
			`;;`
			`--npus-per-node)`
			`shift`
			`NPUS_PER_NODE="$1"`
			`shift`
			`;;`
			`--network-card-name)`
			`shift`
			`NETWORK_CARD_NAME="$1"`
			`shift`
			`;;`
			`--prefill-device-cnt)`
			`shift`
			`PREFILL_DEVICE_CNT="$1"`
			`shift`
			`;;`
			`--decode-device-cnt)`
			`shift`
			`DECODE_DEVICE_CNT="$1"`
			`shift`
			`;;`
			`esac`
			`done`
			`LOCAL_HOSTS=($(hostname -I))`
			`LOCAL_HOST="127.0.0.1"`
			`MASTER_ADDR=${IPs[0]}`
			`MASTER_PORT=6657`
			`NNODES=${#IPs[@]}`
			`NODE_RANK="8"`
			`for i in "${!IPs[@]}"; do`
			`ip="${IPs[$i]}"`
			`for local_host in "${LOCAL_HOSTS[@]}"; do`
			`if [[ "$local_host" == "$ip" ]]; then`
			`LOCAL_HOST=$local_host`
			`NODE_RANK=$i`
			`break 2`
			`fi`
			`done`
			`done`

			`if [[ $NODE_RANK == "" ]];then`
			`echo "[Error] para \"NODE_RANK\" must be defined"`
			`exit 1`
			`fi`

			`WORLD_SIZE=$(($NPUS_PER_NODE * $NNODES))`
			RANKSTART=`expr $NPUS_PER_NODE \* $NODE_RANK`

			`echo "========>param:"`
			`echo "LOCAL_HOST": $LOCAL_HOST`
			`echo "WORLD_SIZE: " $WORLD_SIZE`
			`echo "RANKSTART": $RANKSTART`
			`echo "NNODES": $NNODES`
			`echo "NODE_RANK": $NODE_RANK`
			`echo "==============="`

			`if [[ -n "${GEN_RANKTABLE}" \|\| ! -e ${PWD}/ranktable.json ]]; then`
			`GLOO_SOCKET_IFNAME=$NETWORK_CARD_NAME torchrun \`
			`--nproc_per_node 1 \`
			`--nnodes ${NNODES} \`
			`--node_rank ${NODE_RANK} \`
			`--master_addr ${MASTER_ADDR} \`
			`--master_port ${MASTER_PORT} \`
			`gen_ranktable.py --local-host $LOCAL_HOST --prefill-device-cnt $PREFILL_DEVICE_CNT --decode-device-cnt $DECODE_DEVICE_CNT`
[CI] Enable linux-aarch64-a2 (64GB) and tp2 * 2 max-parallel to speed up CI (#2065) ### What this PR does / why we need it? Currently our workflow run takes about 3 hours in total, which seriously affects the developer experience, so an optimization is urgently needed. After this PR, the running time of the full CI is expected to be shortened to 1h40min. - Enable linux-aarch64-a2 (64GB) to replace linux-arm64-npu (32GB) - Change TP4 ---> TP2 * 2 max-parallel - Move DeepSeek-V2-Lite-W8A8 to single card test ### Does this PR introduce _any_ user-facing change? No - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/a2480251ec92ba2a849464dde48db8a2b7f6ef81 --------- Signed-off-by: wangli <wangli858794774@gmail.com> 2025-07-29 18:59:05 +08:00			`fi`