From 08aaffe144bbe3f8d0db1f224c43cfe971392ca3 Mon Sep 17 00:00:00 2001
From: i-peixingyu <i-peixingyu@4paradigm.com>
Date: Tue, 19 May 2026 18:37:15 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20K100-vLLM-Patched-v2.0/ent?=
 =?UTF-8?q?rypoint.sh?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 K100-vLLM-Patched-v2.0/entrypoint.sh | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 K100-vLLM-Patched-v2.0/entrypoint.sh

diff --git a/K100-vLLM-Patched-v2.0/entrypoint.sh b/K100-vLLM-Patched-v2.0/entrypoint.sh
new file mode 100644
index 0000000..1e0eb51
--- /dev/null
+++ b/K100-vLLM-Patched-v2.0/entrypoint.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Entrypoint: run fix_tokenizer.py, choose the attention backend via detect_head_size.py, then exec `vllm serve`.
+set -e
+
+MODEL_DIR=${MODEL_DIR:-/model}  # model path passed to `vllm serve`; overridable via the environment
+FIX_TOKENIZER_DIR=/tmp/fixed_tokenizer  # presumably populated by fix_tokenizer.py -- TODO confirm
+
+echo "[entrypoint] fixing tokenizer..."
+python /opt/fix_tokenizer.py  # a non-zero exit aborts startup here because of `set -e`
+
+echo "[entrypoint] checking head_size..."
+RC=0  # `|| RC=$?` below records the detector's exit status without tripping `set -e`
+HEAD_OUT=$(python /opt/detect_head_size.py) || RC=$?
+if [ "$RC" = "2" ] && [ -n "$HEAD_OUT" ]; then  # status 2 plus a printed head_size: fall back to Triton
+    export VLLM_USE_FLASH_ATTN_PA=0
+    echo "[entrypoint] head_size=$HEAD_OUT not in FlashAttention whitelist, switching to Triton backend (VLLM_USE_FLASH_ATTN_PA=0)"
+elif [ "$RC" != "0" ]; then  # any other failure used to be swallowed silently; surface it in the logs
+    echo "[entrypoint] WARNING: detect_head_size.py exited with status $RC; leaving VLLM_USE_FLASH_ATTN_PA unchanged" >&2
+fi
+
+echo "[entrypoint] starting vllm..."
+exec vllm serve "$MODEL_DIR" \
+  --tokenizer "$FIX_TOKENIZER_DIR" \
+  "$@"
\ No newline at end of file