xc-llm-kunlun/ci/scripts/server/start_vllm.sh

#!/usr/bin/env bash
#
# Start the vLLM OpenAI-compatible API server inside a container via a
# detached `docker exec`, mirroring the server output into ${VLLM_LOG}.
#
# Variables read by this script (from the caller's environment and/or the
# sourced env.sh -- NOTE(review): confirm where each one is defined):
#   DOCKER_NAME          container to exec into
#   GITHUB_WORKSPACE     checkout root; setup_env.sh is looked up under
#                        ${GITHUB_WORKSPACE}/vLLM-Kunlun/
#   VLLM_LOG             log file path, resolved inside the container
#   XPU_VISIBLE_DEVICES  exported inside the container before launch
#   VLLM_HOST, VLLM_PORT address passed to --host / --port
#   MODEL_PATH           value passed to --model
#   SERVED_MODEL_NAME    value passed to --served-model-name
#
# The helper scripts are sourced by relative path, so the working directory
# must contain ci/scripts/common/.
set -euo pipefail

source ci/scripts/common/env.sh
source ci/scripts/common/log.sh

# `set -u` only catches *unset* variables. An empty value would still produce
# a malformed command inside the container, and because the exec is detached
# that failure would never surface here. Reject empty values up front.
# XPU_VISIBLE_DEVICES is intentionally not listed: an empty value is passed
# through unchanged, as before.
: "${DOCKER_NAME:?DOCKER_NAME must be set and non-empty}"
: "${GITHUB_WORKSPACE:?GITHUB_WORKSPACE must be set and non-empty}"
: "${VLLM_LOG:?VLLM_LOG must be set and non-empty}"
: "${VLLM_HOST:?VLLM_HOST must be set and non-empty}"
: "${VLLM_PORT:?VLLM_PORT must be set and non-empty}"
: "${MODEL_PATH:?MODEL_PATH must be set and non-empty}"
: "${SERVED_MODEL_NAME:?SERVED_MODEL_NAME must be set and non-empty}"

log "Starting vLLM server in container ${DOCKER_NAME}"

# The command string below is expanded by THIS shell before being handed to
# the container's bash, so every ${...} is a host-side value. Each one is
# wrapped in escaped double quotes so the inner shell does not word-split or
# glob it (e.g. a model path containing spaces). Values containing ", $,
# backticks or backslashes are still interpreted by the inner shell.
#
# The trailing backslashes are consumed by the outer double-quoted string, so
# the python invocation reaches the container as a single line; do not place
# comments between those lines.
docker exec -d "${DOCKER_NAME}" bash -lc "
  set -e

  chmod +x \"${GITHUB_WORKSPACE}/vLLM-Kunlun/setup_env.sh\"
  source \"${GITHUB_WORKSPACE}/vLLM-Kunlun/setup_env.sh\"

  rm -f -- \"${VLLM_LOG}\"
  export XPU_VISIBLE_DEVICES=\"${XPU_VISIBLE_DEVICES}\"

  python -u -m vllm.entrypoints.openai.api_server \
    --host \"${VLLM_HOST}\" \
    --port \"${VLLM_PORT}\" \
    --model \"${MODEL_PATH}\" \
    --gpu-memory-utilization 0.9 \
    --trust-remote-code \
    --max-model-len 32768 \
    --tensor-parallel-size 1 \
    --dtype float16 \
    --max_num_seqs 128 \
    --max_num_batched_tokens 32768 \
    --block-size 128 \
    --no-enable-prefix-caching \
    --no-enable-chunked-prefill \
    --distributed-executor-backend mp \
    --served-model-name \"${SERVED_MODEL_NAME}\" \
    --compilation-config '{\"splitting_ops\": [\"vllm.unified_attention\",\"vllm.unified_attention_with_output\",\"vllm.unified_attention_with_output_kunlun\",\"vllm.mamba_mixer2\",\"vllm.mamba_mixer\",\"vllm.short_conv\",\"vllm.linear_attention\",\"vllm.plamo2_mamba_mixer\",\"vllm.gdn_attention\",\"vllm.sparse_attn_indexer\"]}' \
    2>&1 | tee \"${VLLM_LOG}\"
"

# `docker exec -d` returns as soon as the command is dispatched; reaching
# this line does not mean the server is up or healthy.
log "vLLM start command issued (running in background)"