forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
@@ -0,0 +1,48 @@
|
||||
#!/bin/bash
#
# Launch a vLLM OpenAI-compatible API server for the SmoothQuant-quantized
# DeepSeek-V2 checkpoint at MODEL_PATH, logging to output/server/server.log.
#
# Toggles (0/1):
#   use_ray   - distribute workers with Ray (--worker-use-ray); also restarts Ray
#   use_pp    - 1: pipeline parallelism across 8 devices; 0: tensor parallelism
#   use_eager - disable graph capture (--enforce-eager)
set -euo pipefail

# Start from a clean log directory.
rm -rf output/server
mkdir -p output/server

PORT=32345
use_ray=0
use_pp=1
use_eager=0

eager_option=""
if [ "$use_eager" -gt 0 ]; then
  eager_option="--enforce-eager"
fi

ray_option=""
if [ "$use_ray" -gt 0 ]; then
  ray_option="--worker-use-ray"
  # Kill any stale Ray cluster so the server starts from a known state.
  ray stop --force
fi

# Abort an engine iteration that stalls for more than 3 minutes.
export VLLM_ENGINE_ITERATION_TIMEOUT_S=180

MODEL_PATH="/data/vllm/sq_per_token_per_channel/deepseek_v2_temp"

if [ "$use_pp" -gt 0 ]; then
  parallel_option="--pipeline-parallel-size=8"
else
  parallel_option="--tensor-parallel-size=8"
fi

# NOTE: ${parallel_option}/${eager_option}/${ray_option} are intentionally
# unquoted — when empty they must expand to nothing, not to an empty argument.
python -m vllm.entrypoints.openai.api_server \
  --disable-log-requests \
  --port "${PORT}" \
  --model "${MODEL_PATH}" \
  --trust-remote-code \
  --swap-space 16 \
  ${parallel_option} \
  --max-num-batched-tokens=40960 \
  --max-model-len=1034 \
  --block-size=16 \
  --dtype=bfloat16 \
  --max-seq-len-to-capture=1034 \
  --max-num-seqs=40 \
  --quantization=smoothquant \
  ${eager_option} \
  ${ray_option} \
  2>&1 | tee output/server/server.log
|
||||
Reference in New Issue
Block a user