#!/bin/bash

# Launch a vLLM OpenAI-compatible API server for a SmoothQuant-quantized
# DeepSeek-V2 model, parallelized across 8 GPUs via either pipeline
# parallelism (use_pp=1) or tensor parallelism (use_pp=0).

rm -rf output/server
mkdir -p output/server

PORT=32345
use_ray=0
use_pp=1
use_eager=0

# Optionally disable CUDA graph capture and run in eager mode.
eager_option=""
if [ "$use_eager" -gt 0 ]; then
    eager_option="--enforce-eager"
fi

# Optionally use Ray as the distributed executor backend; stop any
# stale Ray cluster first so workers start from a clean state.
ray_option=""
if [ "$use_ray" -gt 0 ]; then
    ray_option="--worker-use-ray"
    ray stop --force
fi

export VLLM_ENGINE_ITERATION_TIMEOUT_S=180

MODEL_PATH="/data/vllm/sq_per_token_per_channel/deepseek_v2_temp"

if [ "$use_pp" -gt 0 ]; then
    parallel_option="--pipeline-parallel-size=8"  # PP8
else
    parallel_option="--tensor-parallel-size=8"    # TP8
fi

python -m vllm.entrypoints.openai.api_server \
    --disable-log-requests \
    --port ${PORT} \
    --model ${MODEL_PATH} \
    --trust-remote-code \
    --swap-space 16 \
    ${parallel_option} \
    --max-num-batched-tokens=40960 \
    --max-model-len=1034 \
    --block-size=16 \
    --dtype=bfloat16 \
    --max-seq-len-to-capture=1034 \
    --max-num-seqs=40 \
    --quantization=smoothquant \
    ${eager_option} \
    ${ray_option} \
    2>&1 | tee output/server/server.log
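
# Once the server logs that it is ready, it can be smoke-tested from another
# shell via the OpenAI-compatible completions endpoint. This is a minimal
# sketch, not part of the original launch flow: the prompt and max_tokens are
# arbitrary example values, and the model name is simply MODEL_PATH, which is
# what vLLM uses as the served model name by default.
#
# curl http://localhost:32345/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{
#           "model": "/data/vllm/sq_per_token_per_channel/deepseek_v2_temp",
#           "prompt": "Hello, my name is",
#           "max_tokens": 32
#         }'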