From 08aaffe144bbe3f8d0db1f224c43cfe971392ca3 Mon Sep 17 00:00:00 2001
From: i-peixingyu
Date: Tue, 19 May 2026 18:37:15 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20K100-vLLM-Patched-v2.0/ent?=
 =?UTF-8?q?rypoint.sh?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Reviewer notes (this section is ignored when the patch is applied):
 - detector exit statuses other than 0 and 2 are now logged to stderr
   instead of being dropped silently
 - errexit is no longer switched off around the detector call
 - the script now ends with a newline
 - the blob id on the "index" line predates these changes; regenerate the
   patch with git format-patch to refresh it

 K100-vLLM-Patched-v2.0/entrypoint.sh | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 K100-vLLM-Patched-v2.0/entrypoint.sh

diff --git a/K100-vLLM-Patched-v2.0/entrypoint.sh b/K100-vLLM-Patched-v2.0/entrypoint.sh
new file mode 100644
index 0000000..1e0eb51
--- /dev/null
+++ b/K100-vLLM-Patched-v2.0/entrypoint.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Container entrypoint: runs the tokenizer fix-up, checks the model's
+# head_size, then replaces this shell with `vllm serve`.
+#
+# Environment:
+#   MODEL_DIR  model path passed to `vllm serve` (default: /model)
+# Arguments:
+#   all positional arguments are forwarded unchanged to `vllm serve`
+set -e
+
+MODEL_DIR=${MODEL_DIR:-/model}
+# NOTE(review): presumably where /opt/fix_tokenizer.py writes its output;
+# the path is not passed to that script, so confirm both sides agree.
+FIX_TOKENIZER_DIR=/tmp/fixed_tokenizer
+
+echo "[entrypoint] fixing tokenizer..."
+python /opt/fix_tokenizer.py
+
+echo "[entrypoint] checking head_size..."
+# Record the detector's exit status via `|| RC=$?` so errexit stays on;
+# a non-zero status is inspected below instead of aborting the script.
+RC=0
+HEAD_OUT=$(python /opt/detect_head_size.py) || RC=$?
+
+case "$RC" in
+    0)
+        ;;
+    2)
+        # Status 2 together with a printed value is what triggers the
+        # backend switch; status 2 with no output is only reported.
+        if [ -n "$HEAD_OUT" ]; then
+            export VLLM_USE_FLASH_ATTN_PA=0
+            echo "[entrypoint] head_size=$HEAD_OUT not in FlashAttention whitelist, switching to Triton backend (VLLM_USE_FLASH_ATTN_PA=0)"
+        else
+            echo "[entrypoint] warning: detect_head_size.py exited 2 without reporting a head_size, keeping default attention backend" >&2
+        fi
+        ;;
+    *)
+        # Any other status used to be ignored silently; still continue with
+        # the default backend, but leave a trace on stderr.
+        echo "[entrypoint] warning: detect_head_size.py exited with status $RC, keeping default attention backend" >&2
+        ;;
+esac
+
+echo "[entrypoint] starting vllm..."
+exec vllm serve "$MODEL_DIR" \
+    --tokenizer "$FIX_TOKENIZER_DIR" \
+    "$@"