添加 K100-vLLM-Patched-v2.0/entrypoint.sh
This commit is contained in:
24
K100-vLLM-Patched-v2.0/entrypoint.sh
Normal file
24
K100-vLLM-Patched-v2.0/entrypoint.sh
Normal file
@@ -0,0 +1,24 @@
|
||||
#!/bin/bash
# Container entrypoint: run the tokenizer fix-up script, probe the model's
# attention head size, then replace this process with `vllm serve`.
#
# Environment:
#   MODEL_DIR  model directory handed to `vllm serve` (default: /model)
# Arguments:
#   All command-line arguments are appended verbatim to `vllm serve`.
set -e

MODEL_DIR=${MODEL_DIR:-/model}
# Directory passed to vllm via --tokenizer.
# NOTE(review): presumably /opt/fix_tokenizer.py writes its result here, and
# the helper scripts pick up MODEL_DIR from the environment (it is not
# exported when the default is used) -- confirm against the helpers.
FIX_TOKENIZER_DIR=/tmp/fixed_tokenizer

echo "[entrypoint] fixing tokenizer..."
# A failure here aborts the entrypoint (set -e).
python /opt/fix_tokenizer.py

echo "[entrypoint] checking head_size..."
# Capture the detector's exit status with `|| rc=$?` so errexit never has to
# be switched off: a non-zero status is expected here and must not abort.
rc=0
head_out=$(python /opt/detect_head_size.py) || rc=$?

if [[ "$rc" == "2" && -n "$head_out" ]]; then
  # Exit status 2 plus non-empty stdout is treated as "unsupported head
  # size"; stdout is reported as the head_size value in the log line below.
  export VLLM_USE_FLASH_ATTN_PA=0
  echo "[entrypoint] head_size=$head_out not in FlashAttention whitelist, switching to Triton backend (VLLM_USE_FLASH_ATTN_PA=0)"
elif [[ "$rc" != "0" ]]; then
  # Detection stays best-effort: keep going, but do not hide the failure.
  echo "[entrypoint] WARNING: detect_head_size.py exited with status $rc; leaving VLLM_USE_FLASH_ATTN_PA unchanged" >&2
fi

echo "[entrypoint] starting vllm..."
# exec so vllm replaces this shell as the running process.
exec vllm serve "$MODEL_DIR" \
  --tokenizer "$FIX_TOKENIZER_DIR" \
  "$@"
|
||||
Reference in New Issue
Block a user