# forked from EngineX-Cambricon/enginex-mlu370-vllm
#!/bin/bash
# Launch a vLLM OpenAI-compatible API server for a SmoothQuant DeepSeek-V2
# checkpoint on 8 devices. Server output is logged to output/server/server.log.

# Recreate the log directory from scratch so each run starts clean.
# (Original used `rm output/server -rf`, which relies on GNU-style option
# reordering; options-first with `--` is portable and safe.)
rm -rf -- output/server
mkdir -p output/server

# --- Configuration ---
PORT=32345   # HTTP port for the OpenAI-compatible API server
use_ray=0    # 1: run workers under Ray (--worker-use-ray)
use_pp=1     # 1: pipeline parallelism across 8 devices; 0: tensor parallelism
use_eager=0  # 1: disable graph capture (--enforce-eager)

eager_option=""
# Build the optional eager-mode flag.
# "${use_eager:-0}" is quoted and defaulted so the test is well-formed even
# if the variable is empty/unset (the bare `$use_eager` form breaks `[ ]`).
if [ "${use_eager:-0}" -gt 0 ]; then
  eager_option="--enforce-eager"
fi
# Build the optional Ray worker flag. When Ray is requested, tear down any
# stale Ray cluster first so the server starts with a clean Ray state.
# "${use_ray:-0}" is quoted and defaulted so the test is well-formed even
# if the variable is empty/unset.
ray_option=""
if [ "${use_ray:-0}" -gt 0 ]; then
  ray_option="--worker-use-ray"
  ray stop --force
fi
# Abort a hung engine iteration after 180 s instead of blocking forever.
export VLLM_ENGINE_ITERATION_TIMEOUT_S=180

# SmoothQuant (per-token / per-channel) quantized DeepSeek-V2 checkpoint.
MODEL_PATH="/data/vllm/sq_per_token_per_channel/deepseek_v2_temp"
# Select the parallelism strategy across the 8 devices: pipeline parallelism
# when use_pp is set, tensor parallelism otherwise.
# "${use_pp:-0}" is quoted and defaulted so the test is well-formed even
# if the variable is empty/unset.
if [ "${use_pp:-0}" -gt 0 ]; then
  parallel_option="--pipeline-parallel-size=8"
else
  parallel_option="--tensor-parallel-size=8"
fi
# Launch the OpenAI-compatible API server on 8 devices (PP8 or TP8 per
# ${parallel_option}; the original "# TP8" comment was misleading since
# use_pp=1 selects pipeline parallelism by default).
#
# ${parallel_option}, ${eager_option} and ${ray_option} are deliberately
# left unquoted: empty values must expand to nothing rather than to an
# empty "" argument.
# shellcheck disable=SC2086
python -m vllm.entrypoints.openai.api_server \
    --disable-log-requests \
    --port "${PORT}" \
    --model "${MODEL_PATH}" \
    --trust-remote-code \
    --swap-space 16 \
    ${parallel_option} \
    --max-num-batched-tokens=40960 \
    --max-model-len=1034 \
    --block-size=16 \
    --dtype=bfloat16 \
    --max-seq-len-to-capture=1034 \
    --max-num-seqs=40 \
    --quantization=smoothquant \
    ${eager_option} \
    ${ray_option} \
    2>&1 | tee output/server/server.log
# NOTE(review): without `set -o pipefail` the script's exit status is tee's,
# not the server's — add pipefail if callers need the server's status.