#!/bin/bash
# Sweep vLLM serving benchmarks over a grid of input/output lengths and
# max-concurrency (batch) sizes; each run's output is tee'd to a log file
# under output/client/ named <model>_<input>_<output>_bs_<batch>.log.
set -euo pipefail

# export EXPERT_PARALLEL_EN=True
# export VLLM_LATENCY_DEBUG=True

rm -rf output/client
mkdir -p output/client

PORT=32345
MODEL_PATH="/data/vllm/sq_per_token_per_channel/deepseek_v2_temp"

input_sizes=(1024)
output_sizes=(1)
# batch_sizes=(1 2 4 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40)
batch_sizes=(32)

# Model name for log-file prefixes; derived from MODEL_PATH.
# (Original read the unset ${HF_MODEL}, yielding empty names like
# "_1024_1_bs_32.log". Also loop-invariant, so compute it once.)
model_name=$(basename "${MODEL_PATH}")

for input_size in "${input_sizes[@]}"; do
  for output_size in "${output_sizes[@]}"; do
    for batch_size in "${batch_sizes[@]}"; do
      LOG_FILE="output/client/${model_name}_${input_size}_${output_size}_bs_${batch_size}.log"
      # --random-input-len spelled with hyphens to match the other vLLM
      # benchmark flags (original mixed --random_input_len underscores).
      python benchmarks/benchmark_serving.py \
        --backend vllm \
        --model "${MODEL_PATH}" \
        --trust-remote-code \
        --dataset-name random \
        --num-prompts 1000 \
        --port "${PORT}" \
        --request-rate inf \
        --random-input-len "${input_size}" \
        --random-output-len "${output_size}" \
        --max-concurrency "${batch_size}" \
        2>&1 | tee "${LOG_FILE}"
    done
  done
done