### What this PR does / why we need it?
Currently our full workflow takes about 3 hours to run, which seriously
hurts the developer experience, so an optimization is urgently needed.
After this PR, the running time of the full CI is expected to be
shortened to 1h40min.
- Enable linux-aarch64-a2 (64GB) to replace linux-arm64-npu (32GB)
- Change TP4 ---> TP2 * 2 max-parallel
- Move DeepSeek-V2-Lite-W8A8 to single card test
### Does this PR introduce _any_ user-facing change?
No
- vLLM version: v0.10.0
- vLLM main:
a2480251ec
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
80 lines
2.0 KiB
Bash
80 lines
2.0 KiB
Bash
#!/bin/bash
#
# Runs gen_ranktable.py through torchrun (one process per node) to produce
# ranktable.json in the current directory.
#
# Options: --ips <ip>... --npus-per-node <n> --network-card-name <name>
#          --prefill-device-cnt <n> --decode-device-cnt <n>
# Environment: GEN_RANKTABLE (non-empty forces regeneration).

# Source the Ascend toolkit's environment script into this shell.
# NOTE(review): its contents are not part of this file; presumably it exports
# the toolkit's PATH/LD_LIBRARY_PATH entries -- confirm.
source /usr/local/Ascend/ascend-toolkit/set_env.sh

# Prepend the custom op_api library directory to the loader search path.
# The ":+" expansion appends the previous value only when it is non-empty, so
# an unset LD_LIBRARY_PATH does not leave a trailing ":" behind (an empty
# entry, which the dynamic loader interprets as the current directory).
export LD_LIBRARY_PATH="/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
# Default number of NPUs per node; overridden by --npus-per-node.
NPUS_PER_NODE=8
# Node IP list filled from --ips; initialised so it is always an (empty) array.
IPs=()

#######################################
# Parse command-line options into global variables.
# Globals written:
#   IPs                 --ips <ip>...  (every following word up to the next
#                                       word starting with "--" is appended)
#   NPUS_PER_NODE       --npus-per-node <n>
#   NETWORK_CARD_NAME   --network-card-name <name>
#   PREFILL_DEVICE_CNT  --prefill-device-cnt <n>
#   DECODE_DEVICE_CNT   --decode-device-cnt <n>
# Arguments:
#   The argument list to parse (normally "$@").
# Exits:
#   1 on an unknown argument or when a single-value option has no value.
#######################################
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --ips)
                shift
                # Consume addresses until the next option (or end of input).
                while [[ $# -gt 0 && "$1" != --* ]]; do
                    IPs+=("$1")
                    shift
                done
                ;;
            --npus-per-node | --network-card-name | --prefill-device-cnt | --decode-device-cnt)
                if [[ $# -lt 2 ]]; then
                    echo "[Error] option \"$1\" requires a value" >&2
                    exit 1
                fi
                case "$1" in
                    --npus-per-node) NPUS_PER_NODE="$2" ;;
                    --network-card-name) NETWORK_CARD_NAME="$2" ;;
                    --prefill-device-cnt) PREFILL_DEVICE_CNT="$2" ;;
                    --decode-device-cnt) DECODE_DEVICE_CNT="$2" ;;
                esac
                shift 2
                ;;
            *)
                # Without a default arm an unrecognised word is never shifted
                # away and the surrounding loop would spin forever.
                echo "[Error] unknown argument: \"$1\"" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"
# Work out which entry of IPs belongs to this machine; its index becomes the
# node rank handed to torchrun.

# Addresses reported by `hostname -I`, split on whitespace into an array
# (read -a avoids the glob expansion an unquoted $(...) would be subject to).
read -r -a LOCAL_HOSTS <<< "$(hostname -I)"
LOCAL_HOST="127.0.0.1"
# The first address given to --ips is used as the torchrun master address.
MASTER_ADDR=${IPs[0]}
MASTER_PORT=6657
NNODES=${#IPs[@]}
# Stays empty until a local address matches an entry of IPs, so the check
# below can actually detect the "not found" case.
NODE_RANK=""
for i in "${!IPs[@]}"; do
    ip="${IPs[$i]}"
    for local_host in "${LOCAL_HOSTS[@]}"; do
        if [[ "$local_host" == "$ip" ]]; then
            LOCAL_HOST=$local_host
            NODE_RANK=$i
            # First match wins: leave both loops at once.
            break 2
        fi
    done
done

# Abort when none of this machine's addresses appears in the --ips list.
if [[ -z "$NODE_RANK" ]]; then
    echo "[Error] para \"NODE_RANK\" must be defined" >&2
    exit 1
fi
#######################################
# Derive the global rank layout from the per-node NPU count.
# Globals read:
#   NPUS_PER_NODE, NNODES, NODE_RANK
# Globals written:
#   WORLD_SIZE  NPUS_PER_NODE * NNODES
#   RANKSTART   NPUS_PER_NODE * NODE_RANK
#######################################
compute_rank_layout() {
    # Shell arithmetic replaces the former backtick `expr` call (no extra
    # process is spawned).
    WORLD_SIZE=$((NPUS_PER_NODE * NNODES))
    RANKSTART=$((NPUS_PER_NODE * NODE_RANK))
}

compute_rank_layout

# Dump the resolved parameters for troubleshooting.
echo "========>param:"
echo "LOCAL_HOST: ${LOCAL_HOST}"
echo "WORLD_SIZE:  ${WORLD_SIZE}"
echo "RANKSTART: ${RANKSTART}"
echo "NNODES: ${NNODES}"
echo "NODE_RANK: ${NODE_RANK}"
echo "==============="
# Run gen_ranktable.py when GEN_RANKTABLE is set to a non-empty value, or when
# no ranktable.json exists in the current directory yet.
if [[ -n "${GEN_RANKTABLE}" || ! -e "${PWD}/ranktable.json" ]]; then
    # torchrun starts a single gen_ranktable.py process on this node; the
    # network card name is exported to that process only, as GLOO_SOCKET_IFNAME.
    # Every expansion is quoted so that an empty or space-containing value
    # cannot drop or split an argument and shift the ones that follow.
    GLOO_SOCKET_IFNAME="${NETWORK_CARD_NAME}" torchrun \
        --nproc_per_node 1 \
        --nnodes "${NNODES}" \
        --node_rank "${NODE_RANK}" \
        --master_addr "${MASTER_ADDR}" \
        --master_port "${MASTER_PORT}" \
        gen_ranktable.py \
        --local-host "${LOCAL_HOST}" \
        --prefill-device-cnt "${PREFILL_DEVICE_CNT}" \
        --decode-device-cnt "${DECODE_DEVICE_CNT}"
fi