#!/bin/bash
#
# Container entrypoint for a vLLM server.
#
# Steps:
#   1. Run /opt/fix_tokenizer.py (any failure aborts the entrypoint via set -e).
#   2. Run /opt/detect_head_size.py; when it exits with status 2 and prints a
#      value, export VLLM_USE_FLASH_ATTN_PA=0 before launching the server.
#   3. exec `vllm serve`, forwarding every argument given to this script.
#
# Environment:
#   MODEL_DIR  Model path handed to `vllm serve` (default: /model).
#
# NOTE(review): FIX_TOKENIZER_DIR is presumably the directory that
# fix_tokenizer.py writes its output to -- confirm against that script.
set -e

MODEL_DIR=${MODEL_DIR:-/model}
FIX_TOKENIZER_DIR=/tmp/fixed_tokenizer

echo "[entrypoint] fixing tokenizer..."
python /opt/fix_tokenizer.py

echo "[entrypoint] checking head_size..."
# A non-zero status from the detector is an expected outcome, so capture it
# with `|| RC=$?` rather than switching errexit off and on around the call.
RC=0
HEAD_OUT=$(python /opt/detect_head_size.py) || RC=$?

if [ "$RC" = "2" ] && [ -n "$HEAD_OUT" ]; then
  # Status 2 together with a printed head size is the detector's signal that
  # the size is outside the FlashAttention whitelist.
  export VLLM_USE_FLASH_ATTN_PA=0
  echo "[entrypoint] head_size=$HEAD_OUT not in FlashAttention whitelist, switching to Triton backend (VLLM_USE_FLASH_ATTN_PA=0)"
elif [ "$RC" != "0" ]; then
  # Any other failure (or status 2 with no output) stays non-fatal, but is
  # reported on stderr instead of being silently dropped.
  echo "[entrypoint] warning: detect_head_size.py exited with status $RC (output: '$HEAD_OUT'); leaving VLLM_USE_FLASH_ATTN_PA unchanged" >&2
fi

echo "[entrypoint] starting vllm..."
# exec replaces this shell so that vllm becomes the container's main process.
exec vllm serve "$MODEL_DIR" \
  --tokenizer "$FIX_TOKENIZER_DIR" \
  "$@"