Files
xc-llm-kunlun/ci/scripts/server/start_vllm.sh
1916hcc 7c2966a98c [CI/Build] Add CI end-to-end (E2E) tests (#139)
* [CI/Build] Add CI end-to-end (E2E) tests
Signed-off-by: Chenchao Hu <huchenchao@example.com>
2026-01-28 19:30:55 +08:00

39 lines
1.3 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# Launch the vLLM OpenAI-compatible API server inside the Docker container
# named by DOCKER_NAME. The server is started detached (docker exec -d), so
# this script returns as soon as the command has been issued; it does not wait
# for the server to come up.
#
# The common/ helpers are sourced by relative path, so run this script from
# the directory that contains ci/.
#
# Variables read (none are assigned in this script; presumably they come from
# ci/scripts/common/env.sh or the CI environment -- confirm):
#   DOCKER_NAME           container to exec into
#   GITHUB_WORKSPACE      prefix of vLLM-Kunlun/setup_env.sh (path is resolved
#                         inside the container)
#   VLLM_LOG              file that receives the server's stdout and stderr
#   XPU_VISIBLE_DEVICES   exported unchanged into the server's environment
#   VLLM_HOST, VLLM_PORT  values for --host / --port
#   MODEL_PATH            value for --model
#   SERVED_MODEL_NAME     value for --served-model-name
set -euo pipefail

source ci/scripts/common/env.sh
# log() is not defined in this file; presumably provided by log.sh -- confirm.
source ci/scripts/common/log.sh

log "Starting vLLM server in container ${DOCKER_NAME}"

# Value for --compilation-config. Kept in single quotes on the host so the
# JSON needs no backslash escaping.
compilation_config='{"splitting_ops": ["vllm.unified_attention","vllm.unified_attention_with_output","vllm.unified_attention_with_output_kunlun","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention","vllm.sparse_attn_indexer"]}'

# The command string below is parsed a second time by bash inside the
# container. Shell-quote every interpolated value with %q so that whitespace,
# globs, quotes and other metacharacters reach the container as one literal
# word instead of being split or executed there.
printf -v q_setup_env '%q' "${GITHUB_WORKSPACE}/vLLM-Kunlun/setup_env.sh"
printf -v q_log '%q' "${VLLM_LOG}"
printf -v q_devices '%q' "${XPU_VISIBLE_DEVICES}"
printf -v q_host '%q' "${VLLM_HOST}"
printf -v q_port '%q' "${VLLM_PORT}"
printf -v q_model '%q' "${MODEL_PATH}"
printf -v q_served_name '%q' "${SERVED_MODEL_NAME}"
printf -v q_compilation_config '%q' "${compilation_config}"

# Every ${q_*} is expanded here on the host; the backslash-newline pairs are
# also consumed by the host shell, so the container sees the python invocation
# as a single line.
docker exec -d "${DOCKER_NAME}" bash -lc "
set -e
chmod +x ${q_setup_env}
source ${q_setup_env}
rm -f -- ${q_log}
export XPU_VISIBLE_DEVICES=${q_devices}
python -u -m vllm.entrypoints.openai.api_server \
  --host ${q_host} \
  --port ${q_port} \
  --model ${q_model} \
  --gpu-memory-utilization 0.9 \
  --trust-remote-code \
  --max-model-len 32768 \
  --tensor-parallel-size 1 \
  --dtype float16 \
  --max_num_seqs 128 \
  --max_num_batched_tokens 32768 \
  --block-size 128 \
  --no-enable-prefix-caching \
  --no-enable-chunked-prefill \
  --distributed-executor-backend mp \
  --served-model-name ${q_served_name} \
  --compilation-config ${q_compilation_config} \
  2>&1 | tee ${q_log}
"

log "vLLM start command issued (running in background)"