#!/bin/bash
# Container entrypoint (K100-vLLM-Patched-v2.0/entrypoint.sh).
#
# Steps:
#   1. Run /opt/fix_tokenizer.py.
#   2. Run /opt/detect_head_size.py; when it exits with status 2 and prints a
#      value, export VLLM_USE_FLASH_ATTN_PA=0 before starting the server.
#   3. Replace this process with `vllm serve`.
#
# Environment:
#   MODEL_DIR  model path handed to `vllm serve` (default: /model)
#
# Arguments:
#   "$@"  forwarded verbatim to `vllm serve`, after the --tokenizer option.
set -euo pipefail

MODEL_DIR=${MODEL_DIR:-/model}
# NOTE(review): presumably the directory /opt/fix_tokenizer.py writes to -
# confirm against that script.
FIX_TOKENIZER_DIR=/tmp/fixed_tokenizer

echo "[entrypoint] fixing tokenizer..."
python /opt/fix_tokenizer.py

echo "[entrypoint] checking head_size..."
# A non-zero exit from the detector is a signal, not necessarily a failure, so
# capture it with `||` instead of letting `set -e` abort the script.
detect_rc=0
head_size=$(python /opt/detect_head_size.py) || detect_rc=$?

case "$detect_rc" in
  0)
    # Nothing to change.
    ;;
  2)
    if [[ -n "$head_size" ]]; then
      export VLLM_USE_FLASH_ATTN_PA=0
      echo "[entrypoint] head_size=$head_size not in FlashAttention whitelist, switching to Triton backend (VLLM_USE_FLASH_ATTN_PA=0)"
    else
      # Status 2 without a value: do not guess, but do not stay silent either.
      echo "[entrypoint] warning: detect_head_size.py exited with status 2 but printed no head_size; leaving VLLM_USE_FLASH_ATTN_PA unchanged" >&2
    fi
    ;;
  *)
    # Unexpected status (e.g. the detector crashed). Detection stays
    # best-effort: report it on stderr and still start the server.
    echo "[entrypoint] warning: detect_head_size.py exited with status $detect_rc; leaving VLLM_USE_FLASH_ATTN_PA unchanged" >&2
    ;;
esac

echo "[entrypoint] starting vllm..."
# exec so that vllm takes over this process instead of running as a child.
exec vllm serve "$MODEL_DIR" \
  --tokenizer "$FIX_TOKENIZER_DIR" \
  "$@"