From 0cd1996eae2499a200036f4ab1d12bf7611eca21 Mon Sep 17 00:00:00 2001
From: Xinyuan Tong <115166877+JustinTong0323@users.noreply.github.com>
Date: Sun, 5 Oct 2025 21:13:17 -0700
Subject: [PATCH] feat: add shortcut detection for multimodal templates in
 Jinja format (#11209)

---
 python/sglang/srt/parser/jinja_template_utils.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/python/sglang/srt/parser/jinja_template_utils.py b/python/sglang/srt/parser/jinja_template_utils.py
index be7d44097..088c3eb91 100644
--- a/python/sglang/srt/parser/jinja_template_utils.py
+++ b/python/sglang/srt/parser/jinja_template_utils.py
@@ -89,6 +89,12 @@ def detect_jinja_template_content_format(chat_template: str) -> str:
     - If template has loops like {%- for content in message['content'] -%} → 'openai'
     - Otherwise → 'string'
     """
+    # Shortcut: a template that mentions any multimodal keyword is treated as 'openai'
+    if any(
+        keyword in chat_template for keyword in ["image", "audio", "video", "vision"]
+    ):
+        return "openai"
+
     jinja_ast = _try_extract_ast(chat_template)
     if jinja_ast is None:
         return "string"