#!/bin/bash
#
# Container entrypoint: prepares the tokenizer, probes the model's attention
# head size, then replaces this shell with `vllm serve`.
#
# Steps:
#   1. Run /opt/fix_tokenizer.py (a failure aborts the entrypoint).
#   2. Run /opt/detect_head_size.py. Exit status 2 together with non-empty
#      stdout means the head size is not in the FlashAttention whitelist; in
#      that case VLLM_USE_FLASH_ATTN_PA=0 is exported for the server process.
#      Any other non-zero status is reported on stderr and otherwise ignored.
#   3. exec `vllm serve "$MODEL_DIR" --tokenizer "$FIX_TOKENIZER_DIR" "$@"`.
#
# Environment:
#   MODEL_DIR  model path handed to `vllm serve` (default: /model)
#
# Arguments:
#   All positional arguments are forwarded verbatim to `vllm serve`.

set -euo pipefail

MODEL_DIR=${MODEL_DIR:-/model}
# Passed to vllm as --tokenizer. Presumably the directory fix_tokenizer.py
# writes its output to -- TODO confirm against that script.
FIX_TOKENIZER_DIR=/tmp/fixed_tokenizer

echo "[entrypoint] fixing tokenizer..."
python /opt/fix_tokenizer.py

echo "[entrypoint] checking head_size..."
# Capture the probe's exit status without tripping `set -e`: the `||` branch
# records the status instead of aborting the script.
RC=0
HEAD_OUT=$(python /opt/detect_head_size.py) || RC=$?

if [[ "$RC" == "2" && -n "$HEAD_OUT" ]]; then
  export VLLM_USE_FLASH_ATTN_PA=0
  echo "[entrypoint] head_size=$HEAD_OUT not in FlashAttention whitelist, switching to Triton backend (VLLM_USE_FLASH_ATTN_PA=0)"
elif [[ "$RC" != "0" ]]; then
  # Best effort: the probe failing must not block startup, but it should not
  # go unnoticed either.
  echo "[entrypoint] warning: detect_head_size.py exited with status $RC; keeping default attention backend" >&2
fi

echo "[entrypoint] starting vllm..."
# exec so vllm replaces this shell and receives signals directly.
exec vllm serve "$MODEL_DIR" \
  --tokenizer "$FIX_TOKENIZER_DIR" \
  "$@"