From 9d8ec2e67e36117ac6da0c82e597d6dbf587d578 Mon Sep 17 00:00:00 2001 From: XinyuanTong <115166877+JustinTong0323@users.noreply.github.com> Date: Sat, 10 May 2025 09:14:09 -0700 Subject: [PATCH] Fix and Clean up chat-template requirement for VLM (#6114) Signed-off-by: Xinyuan Tong --- benchmark/mmmu/README.md | 2 +- benchmark/mmmu/bench_sglang.py | 2 +- docs/backend/openai_api_vision.ipynb | 30 +--- docs/backend/sampling_params.md | 2 +- docs/supported_models/embedding_models.md | 3 +- .../vision_language_models.md | 5 - .../engine/offline_batch_inference_vlm.py | 2 +- .../http_llava_onevision_test.py | 2 +- examples/runtime/multimodal_embedding.py | 2 +- python/sglang/lang/chat_template.py | 135 ++++++++---------- python/sglang/srt/conversation.py | 71 ++++----- test/srt/models/test_vlm_models.py | 20 +-- test/srt/test_bench_serving.py | 4 - test/srt/test_bnb.py | 12 +- test/srt/test_openai_server.py | 1 - test/srt/test_vision_openai_server.py | 6 +- 16 files changed, 104 insertions(+), 195 deletions(-) diff --git a/benchmark/mmmu/README.md b/benchmark/mmmu/README.md index e39bdd3c4..54acee52a 100644 --- a/benchmark/mmmu/README.md +++ b/benchmark/mmmu/README.md @@ -5,7 +5,7 @@ Host the VLM: ``` -python -m sglang.launch_server --model-path Qwen/Qwen2-VL-7B-Instruct --chat-template qwen2-vl --port 30000 +python -m sglang.launch_server --model-path Qwen/Qwen2-VL-7B-Instruct --port 30000 ``` It's recommended to reduce the memory usage by appending something like `--mem-fraction-static 0.6` to the command above. diff --git a/benchmark/mmmu/bench_sglang.py b/benchmark/mmmu/bench_sglang.py index b2a2e2acd..58a4039ef 100644 --- a/benchmark/mmmu/bench_sglang.py +++ b/benchmark/mmmu/bench_sglang.py @@ -2,7 +2,7 @@ Bench the sglang-hosted vLM with benchmark MMMU Usage: - Host the VLM: python -m sglang.launch_server --model-path Qwen/Qwen2-VL-7B-Instruct --chat-template qwen2-vl --port 30000 + Host the VLM: python -m sglang.launch_server --model-path Qwen/Qwen2-VL-7B-Instruct --port 30000 Benchmark: python benchmark/mmmu/bench_sglang.py --port 30000 --concurrency 16 diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb index af52fcfb8..a0af07501 100644 --- a/docs/backend/openai_api_vision.ipynb +++ b/docs/backend/openai_api_vision.ipynb @@ -27,11 +27,7 @@ "source": [ "## Launch A Server\n", "\n", - "Launch the server in your terminal and wait for it to initialize.\n", - "\n", - "**Remember to add** `--chat-template` **for example** `--chat-template=qwen2-vl` **to specify the [vision chat template](https://docs.sglang.ai/backend/openai_api_vision.html#Chat-Template), otherwise, the server will only support text (images won’t be passed in), which can lead to degraded performance.**\n", - "\n", - "We need to specify `--chat-template` for vision language models because the chat template provided in Hugging Face tokenizer only supports text." + "Launch the server in your terminal and wait for it to initialize." 
] }, { @@ -51,8 +47,7 @@ "\n", "vision_process, port = launch_server_cmd(\n", " \"\"\"\n", - "python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct \\\n", - " --chat-template=qwen2-vl\n", + "python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct\n", "\"\"\"\n", ")\n", "\n", @@ -250,27 +245,6 @@ "source": [ "terminate_process(vision_process)" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Chat Template\n", - "\n", - "As mentioned before, if you do not specify a vision model's `--chat-template`, the server uses Hugging Face's default template, which only supports text.\n", - "\n", - "We list popular vision models with their chat templates:\n", - "\n", - "- [meta-llama/Llama-3.2-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) uses `llama_3_vision`.\n", - "- [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) uses `qwen2-vl`.\n", - "- [google/gemma-3-4b-it](https://huggingface.co/google/gemma-3-4b-it) uses `gemma-it`.\n", - "- [openbmb/MiniCPM-V](https://huggingface.co/openbmb/MiniCPM-V) uses `minicpmv`.\n", - "- [deepseek-ai/deepseek-vl2](https://huggingface.co/deepseek-ai/deepseek-vl2) uses `deepseek-vl2`.\n", - "- [LlaVA-OneVision](https://huggingface.co/lmms-lab/llava-onevision-qwen2-7b-ov) uses `chatml-llava`.\n", - "- [LLaVA-NeXT](https://huggingface.co/collections/lmms-lab/llava-next-6623288e2d61edba3ddbf5ff) uses `chatml-llava`.\n", - "- [Llama3-LLaVA-NeXT](https://huggingface.co/lmms-lab/llama3-llava-next-8b) uses `llava_llama_3`.\n", - "- [LLaVA-v1.5 / 1.6](https://huggingface.co/liuhaotian/llava-v1.6-34b) uses `vicuna_v1.1`." - ] } ], "metadata": { diff --git a/docs/backend/sampling_params.md b/docs/backend/sampling_params.md index eab488bc3..9423ab06d 100644 --- a/docs/backend/sampling_params.md +++ b/docs/backend/sampling_params.md @@ -136,7 +136,7 @@ Detailed example in [openai compatible api](https://docs.sglang.ai/backend/opena Launch a server: ```bash -python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --chat-template chatml-llava +python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov ``` Download an image: diff --git a/docs/supported_models/embedding_models.md b/docs/supported_models/embedding_models.md index b6a096790..48ea379c5 100644 --- a/docs/supported_models/embedding_models.md +++ b/docs/supported_models/embedding_models.md @@ -3,7 +3,7 @@ SGLang provides robust support for embedding models by integrating efficient serving mechanisms with its flexible programming interface. This integration allows for streamlined handling of embedding tasks, facilitating faster and more accurate retrieval and semantic search operations. SGLang's architecture enables better resource utilization and reduced latency in embedding model deployment. 
```{important} -They are executed with `--is-embedding` and some may require `--trust-remote-code` and/or `--chat-template` +They are executed with `--is-embedding` and some may require `--trust-remote-code` ``` ## Example launch Command @@ -13,7 +13,6 @@ python3 -m sglang.launch_server \ --model-path Alibaba-NLP/gme-Qwen2-VL-2B-Instruct \ # example HF/local path --is-embedding \ --host 0.0.0.0 \ - --chat-template gme-qwen2-vl \ # set chat template --port 30000 \ ``` diff --git a/docs/supported_models/vision_language_models.md b/docs/supported_models/vision_language_models.md index 5e150a4be..a9f4a8197 100644 --- a/docs/supported_models/vision_language_models.md +++ b/docs/supported_models/vision_language_models.md @@ -2,16 +2,11 @@ These models accept multi-modal inputs (e.g., images and text) and generate text output. They augment language models with visual encoders and require a specific chat template for handling vision prompts. -```{important} -We need to specify `--chat-template` for VLMs because the chat template provided in HuggingFace tokenizer only supports text. If you do not specify a vision model’s `--chat-template`, the server uses HuggingFace’s default template, which only supports text and the images won’t be passed in. -``` - ## Example launch Command ```shell python3 -m sglang.launch_server \ --model-path meta-llama/Llama-3.2-11B-Vision-Instruct \ # example HF/local path - --chat-template llama_3_vision \ # required chat template --host 0.0.0.0 \ --port 30000 \ ``` diff --git a/examples/runtime/engine/offline_batch_inference_vlm.py b/examples/runtime/engine/offline_batch_inference_vlm.py index 4063136f9..459a048cc 100644 --- a/examples/runtime/engine/offline_batch_inference_vlm.py +++ b/examples/runtime/engine/offline_batch_inference_vlm.py @@ -1,6 +1,6 @@ """ Usage: -python offline_batch_inference_vlm.py --model-path Qwen/Qwen2-VL-7B-Instruct --chat-template=qwen2-vl +python offline_batch_inference_vlm.py --model-path Qwen/Qwen2-VL-7B-Instruct """ import argparse diff --git a/examples/runtime/llava_onevision/http_llava_onevision_test.py b/examples/runtime/llava_onevision/http_llava_onevision_test.py index ce59f4061..5c895007f 100644 --- a/examples/runtime/llava_onevision/http_llava_onevision_test.py +++ b/examples/runtime/llava_onevision/http_llava_onevision_test.py @@ -1,7 +1,7 @@ """ Usage: -python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava +python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 python3 http_llava_onevision_test.py """ diff --git a/examples/runtime/multimodal_embedding.py b/examples/runtime/multimodal_embedding.py index 813c3c72c..a924b381e 100644 --- a/examples/runtime/multimodal_embedding.py +++ b/examples/runtime/multimodal_embedding.py @@ -1,5 +1,5 @@ # launch server -# python -m sglang.launch_server --model-path Alibaba-NLP/gme-Qwen2-VL-2B-Instruct --is-embedding --chat-template gme-qwen2-vl +# python -m sglang.launch_server --model-path Alibaba-NLP/gme-Qwen2-VL-2B-Instruct --is-embedding import requests diff --git a/python/sglang/lang/chat_template.py b/python/sglang/lang/chat_template.py index 293f3d934..8609f3e58 100644 --- a/python/sglang/lang/chat_template.py +++ b/python/sglang/lang/chat_template.py @@ -1,3 +1,4 @@ +import re from dataclasses import dataclass from enum import Enum, auto from typing import Callable, Dict, List, Tuple @@ -71,9 +72,9 @@ def get_chat_template(name): def 
get_chat_template_by_model_path(model_path): for matching_func in matching_function_registry: - template = matching_func(model_path) - if template is not None: - return template + template_name = matching_func(model_path) + if template_name is not None: + return get_chat_template(template_name) return get_chat_template("default") @@ -479,134 +480,112 @@ register_chat_template( @register_chat_template_matching_function def match_deepseek(model_path: str): - if ( - "deepseek-v3" in model_path.lower() or "deepseek-r1" in model_path.lower() - ) and "base" not in model_path.lower(): - return get_chat_template("deepseek-v3") + if re.search(r"deepseek-(v3|r1)", model_path, re.IGNORECASE) and not re.search( + r"base", model_path, re.IGNORECASE + ): + return "deepseek-v3" @register_chat_template_matching_function def match_deepseek_janus_pro(model_path: str): - if "janus" in model_path.lower(): - return get_chat_template("janus-pro") + if re.search(r"janus", model_path, re.IGNORECASE): + return "janus-pro" @register_chat_template_matching_function def match_dbrx(model_path: str): - if "dbrx" in model_path.lower() and "instruct" in model_path.lower(): - return get_chat_template("dbrx-instruct") + if re.search(r"dbrx", model_path, re.IGNORECASE) and re.search( + r"instruct", model_path, re.IGNORECASE + ): + return "dbrx-instruct" @register_chat_template_matching_function def match_vicuna(model_path: str): - if "vicuna" in model_path.lower(): - return get_chat_template("vicuna_v1.1") - if "llava-v1.5" in model_path.lower(): - return get_chat_template("vicuna_v1.1") - if "llava-next-video-7b" in model_path.lower(): - return get_chat_template("vicuna_v1.1") + if re.search(r"vicuna|llava-v1\.5|llava-next-video-7b", model_path, re.IGNORECASE): + return "vicuna_v1.1" @register_chat_template_matching_function def match_llama2_chat(model_path: str): - model_path = model_path.lower() - if "llama-2" in model_path and "chat" in model_path: - return get_chat_template("llama-2-chat") - if ( - "mistral" in model_path or "mixtral" in model_path - ) and "instruct" in model_path: - return get_chat_template("llama-2-chat") - if "codellama" in model_path and "instruct" in model_path: - return get_chat_template("llama-2-chat") + if re.search( + r"llama-2.*chat|(mistral|mixtral).*instruct|codellama.*instruct", + model_path, + re.IGNORECASE, + ): + return "llama-2-chat" @register_chat_template_matching_function def match_llama3_instruct(model_path: str): - model_path = model_path.lower() - if "llama-3" in model_path and "instruct" in model_path: - return get_chat_template("llama-3-instruct") + if re.search(r"llama-3.*instruct", model_path, re.IGNORECASE): + return "llama-3-instruct" @register_chat_template_matching_function def match_chat_ml(model_path: str): - # import pdb;pdb.set_trace() - model_path = model_path.lower() - if "tinyllama" in model_path: - return get_chat_template("chatml") - # Now the suffix for qwen2 chat model is "instruct" - if "qwen" in model_path and "vl" in model_path: - return get_chat_template("qwen2-vl") - if "qwen" in model_path: - if "vl" in model_path: - return get_chat_template("qwen2-vl") - if ("chat" in model_path or "instruct" in model_path) and ( - "llava" not in model_path - ): - return get_chat_template("qwen") - if ( - "llava-v1.6-34b" in model_path - or "llava-v1.6-yi-34b" in model_path - or "llava-next-video-34b" in model_path - or "llava-onevision-qwen2" in model_path + if re.search(r"tinyllama", model_path, re.IGNORECASE): + return "chatml" + if re.search(r"qwen.*vl", model_path, 
re.IGNORECASE): + return "qwen2-vl" + if re.search(r"qwen.*(chat|instruct)", model_path, re.IGNORECASE) and not re.search( + r"llava", model_path, re.IGNORECASE ): - return get_chat_template("chatml-llava") + return "qwen" + if re.search( + r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2", + model_path, + re.IGNORECASE, + ): + return "chatml-llava" @register_chat_template_matching_function def match_chat_yi(model_path: str): - model_path = model_path.lower() - if "yi-vl" in model_path and "llava" not in model_path: - return get_chat_template("yi-vl") - elif "yi-1.5" in model_path and "chat" in model_path: - return get_chat_template("yi-1.5") + if re.search(r"yi-vl", model_path, re.IGNORECASE) and not re.search( + r"llava", model_path, re.IGNORECASE + ): + return "yi-vl" + elif re.search(r"yi-1\.5.*chat", model_path, re.IGNORECASE): + return "yi-1.5" @register_chat_template_matching_function def match_gemma_it(model_path: str): - model_path = model_path.lower() - if "gemma" in model_path and "it" in model_path: - return get_chat_template("gemma-it") + if re.search(r"gemma.*it", model_path, re.IGNORECASE): + return "gemma-it" @register_chat_template_matching_function def match_openbmb_minicpm(model_path: str): - model_path = model_path.lower() - if "minicpm-v" in model_path: - return get_chat_template("minicpmv") - elif "minicpm-o" in model_path: - return get_chat_template("minicpmo") + if re.search(r"minicpm-v", model_path, re.IGNORECASE): + return "minicpmv" + elif re.search(r"minicpm-o", model_path, re.IGNORECASE): + return "minicpmo" @register_chat_template_matching_function def match_c4ai_command_r(model_path: str): - model_path = model_path.lower() - if "c4ai-command-r" in model_path: - return get_chat_template("c4ai-command-r") + if re.search(r"c4ai-command-r", model_path, re.IGNORECASE): + return "c4ai-command-r" @register_chat_template_matching_function def match_granite_instruct(model_path: str): - model_path = model_path.lower() - # When future versions of Granite are released, this code may - # need to be updated. For now, assume that the Granite 3.0 - # template works across the board. 
- if "granite" in model_path and "instruct" in model_path: - return get_chat_template("granite-3-instruct") + if re.search(r"granite.*instruct", model_path, re.IGNORECASE): + return "granite-3-instruct" @register_chat_template_matching_function def match_gemma3_instruct(model_path: str): - model_path = model_path.lower() - if "gemma-3" in model_path and "1b" not in model_path: - # gemma-3-1b-it is completion model - return get_chat_template("gemma-it") + if re.search(r"gemma-3", model_path, re.IGNORECASE): + return "gemma-it" @register_chat_template_matching_function def match_internvl_chat(model_path: str): - model_path = model_path.lower() - if "internvl" in model_path: - return get_chat_template("internvl-2-5") + if re.search(r"internvl2_5", model_path, re.IGNORECASE): + return "internvl-2-5" if __name__ == "__main__": diff --git a/python/sglang/srt/conversation.py b/python/sglang/srt/conversation.py index 9492cd87e..e931bc64a 100644 --- a/python/sglang/srt/conversation.py +++ b/python/sglang/srt/conversation.py @@ -16,6 +16,7 @@ # Adapted from # https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py import dataclasses +import re from enum import IntEnum, auto from typing import Callable, Dict, List, Optional, Tuple, Union @@ -852,91 +853,75 @@ register_conv_template( ) +@register_conv_template_matching_function +def match_internvl(model_path: str): + if re.search(r"internvl2_5", model_path, re.IGNORECASE): + return "internvl-2-5" + + @register_conv_template_matching_function def match_llama_3_vision(model_path: str): - if ( - "llama" in model_path.lower() - and "3.2" in model_path.lower() - and "vision" in model_path.lower() - ): + if re.search(r"llama.*3\.2.*vision", model_path, re.IGNORECASE): return "llama_3_vision" @register_conv_template_matching_function def match_deepseek_janus_pro(model_path: str): - if "janus" in model_path.lower(): + if re.search(r"janus", model_path, re.IGNORECASE): return "janus-pro" @register_conv_template_matching_function def match_vicuna(model_path: str): - if "vicuna" in model_path.lower(): - return "vicuna_v1.1" - if "llava-v1.5" in model_path.lower(): - return "vicuna_v1.1" - if "llava-next-video-7b" in model_path.lower(): + if re.search(r"vicuna|llava-v1\.5|llava-next-video-7b", model_path, re.IGNORECASE): return "vicuna_v1.1" @register_conv_template_matching_function def match_llama2_chat(model_path: str): - model_path = model_path.lower() - if "llama-2" in model_path and "chat" in model_path: - return "llama-2" - if ( - "mistral" in model_path or "mixtral" in model_path - ) and "instruct" in model_path: - return "llama-2" - if "codellama" in model_path and "instruct" in model_path: + if re.search( + r"llama-2.*chat|(mistral|mixtral).*instruct|codellama.*instruct", + model_path, + re.IGNORECASE, + ): return "llama-2" @register_conv_template_matching_function def match_deepseek_vl(model_path: str): - model_path = model_path.lower() - if "deepseek" in model_path and "vl2" in model_path: + if re.search(r"deepseek.*vl2", model_path, re.IGNORECASE): return "deepseek-vl2" @register_conv_template_matching_function -def match_chat_ml(model_path: str): - # import pdb;pdb.set_trace() - model_path = model_path.lower() - # Now the suffix for qwen2 chat model is "instruct" - if "gme" in model_path and "qwen" in model_path and "vl" in model_path: +def match_qwen_chat_ml(model_path: str): + if re.search(r"gme.*qwen.*vl", model_path, re.IGNORECASE): return "gme-qwen2-vl" - if "qwen" in model_path and "vl" in model_path: + if re.search(r"qwen.*vl", 
model_path, re.IGNORECASE): return "qwen2-vl" - if ( - "llava-v1.6-34b" in model_path - or "llava-v1.6-yi-34b" in model_path - or "llava-next-video-34b" in model_path - or "llava-onevision-qwen2" in model_path + if re.search( + r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2", + model_path, + re.IGNORECASE, ): return "chatml-llava" @register_conv_template_matching_function -def match_gemma_it(model_path: str): - model_path = model_path.lower() - if "gemma" in model_path and "it" in model_path: - return "gemma-it" - if "gemma-3" in model_path and "1b" not in model_path: - # gemma-3-1b-it is completion model +def match_gemma3_instruct(model_path: str): + if re.search(r"gemma-3.*it", model_path, re.IGNORECASE): return "gemma-it" @register_conv_template_matching_function def match_openbmb_minicpm(model_path: str): - model_path = model_path.lower() - if "minicpm-v" in model_path: + if re.search(r"minicpm-v", model_path, re.IGNORECASE): return "minicpmv" - elif "minicpm-o" in model_path: + elif re.search(r"minicpm-o", model_path, re.IGNORECASE): return "minicpmo" @register_conv_template_matching_function def match_moonshot_kimivl(model_path: str): - model_path = model_path.lower() - if "kimi" in model_path and "vl" in model_path: + if re.search(r"kimi.*vl", model_path, re.IGNORECASE): return "kimi-vl" diff --git a/test/srt/models/test_vlm_models.py b/test/srt/models/test_vlm_models.py index e9a42ef6e..c55e98da2 100644 --- a/test/srt/models/test_vlm_models.py +++ b/test/srt/models/test_vlm_models.py @@ -19,17 +19,12 @@ from sglang.test.test_utils import ( # VLM models for testing MODELS = [ - SimpleNamespace( - model="google/gemma-3-27b-it", chat_template="gemma-it", mmmu_accuracy=0.45 - ), + SimpleNamespace(model="google/gemma-3-27b-it", mmmu_accuracy=0.45), SimpleNamespace( model="Qwen/Qwen2.5-VL-3B-Instruct", - chat_template="qwen2-vl", mmmu_accuracy=0.4, ), - SimpleNamespace( - model="openbmb/MiniCPM-V-2_6", chat_template="minicpmv", mmmu_accuracy=0.4 - ), + SimpleNamespace(model="openbmb/MiniCPM-V-2_6", mmmu_accuracy=0.4), ] @@ -50,7 +45,6 @@ class TestVLMModels(CustomTestCase): def run_mmmu_eval( self, model_version: str, - chat_template: str, output_path: str, *, env: dict | None = None, @@ -69,11 +63,7 @@ class TestVLMModels(CustomTestCase): os.makedirs(output_path, exist_ok=True) # -------- compose --model_args -------- - model_args = ( - f'model_version="{model_version}",' - f'chat_template="{chat_template}",' - f"tp={tp}" - ) + model_args = f'model_version="{model_version}",' f"tp={tp}" # -------- build command list -------- cmd = [ @@ -122,8 +112,6 @@ class TestVLMModels(CustomTestCase): timeout=self.time_out, api_key=self.api_key, other_args=[ - "--chat-template", - model.chat_template, "--trust-remote-code", "--cuda-graph-max-bs", "32", @@ -134,7 +122,7 @@ class TestVLMModels(CustomTestCase): ) # Run evaluation - self.run_mmmu_eval(model.model, model.chat_template, "./logs") + self.run_mmmu_eval(model.model, "./logs") # Get the result file result_file_path = glob.glob("./logs/*.json")[0] diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py index d0bdf1416..d86f2d81b 100644 --- a/test/srt/test_bench_serving.py +++ b/test/srt/test_bench_serving.py @@ -156,8 +156,6 @@ class TestBenchServing(CustomTestCase): num_prompts=200, request_rate=float("inf"), other_server_args=[ - "--chat-template", - DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST, "--mem-fraction-static", "0.7", ], @@ -181,8 +179,6 @@ class TestBenchServing(CustomTestCase): 
num_prompts=50, request_rate=1, other_server_args=[ - "--chat-template", - DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST, "--mem-fraction-static", "0.7", ], diff --git a/test/srt/test_bnb.py b/test/srt/test_bnb.py index 4a117e249..fd4050888 100644 --- a/test/srt/test_bnb.py +++ b/test/srt/test_bnb.py @@ -29,10 +29,10 @@ from sglang.test.test_utils import ( ) VISION_MODELS = [ - ("unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit", "qwen2-vl"), - ("unsloth/Qwen2-VL-7B-Instruct-bnb-4bit", "qwen2-vl"), - ("unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit", "llama_3_vision"), - ("unsloth/Llama-3.2-11B-Vision-bnb-4bit", "llama_3_vision"), + "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit", + "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit", + "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit", + "unsloth/Llama-3.2-11B-Vision-bnb-4bit", ] LANGUAGE_MODELS = [ "unsloth/Qwen2.5-7B-Instruct-bnb-4bit", @@ -249,11 +249,9 @@ class TestVisionModel(CustomTestCase): if is_in_ci(): models_to_test = [random.choice(VISION_MODELS)] - for model, template in models_to_test: + for model in models_to_test: with self.subTest(model=model): other_args = [ - "--chat-template", - template, "--mem-fraction-static", "0.6", "--load-format", diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index 662ca0bde..e90690f10 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -688,7 +688,6 @@ class TestOpenAIServerIgnoreEOS(CustomTestCase): cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, api_key=cls.api_key, - other_args=["--chat-template=llama_3_vision"], ) cls.base_url += "/v1" cls.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST) diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py index 69d5c9b46..9bd074829 100644 --- a/test/srt/test_vision_openai_server.py +++ b/test/srt/test_vision_openai_server.py @@ -614,7 +614,7 @@ class TestInternVL2_5Server(TestOpenAIVisionServer): cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=["--trust-remote-code", "--chat-template", "internvl-2-5"], + other_args=["--trust-remote-code"], ) cls.base_url += "/v1" @@ -676,8 +676,6 @@ class TestDeepseekVL2TinyServer(TestOpenAIVisionServer): timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", - "--chat-template", - "deepseek-vl2", "--context-length", "4096", ], @@ -775,8 +773,6 @@ class TestKimiVLServer(TestOpenAIVisionServer): timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", - "--chat-template", - "kimi-vl", "--context-length", "4096", "--dtype",
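
Note on the pattern above: the core of this patch changes every `match_*` function to return a template *name* (a string, or `None`) rather than a template object, moves the single `get_chat_template(...)` lookup into `get_chat_template_by_model_path`, and collapses the chained `"..." in model_path.lower()` checks into one case-insensitive `re.search` per matcher. Below is a minimal, self-contained sketch of that registry pattern; the registry contents and template values are placeholders for illustration, not the real entries in `python/sglang/lang/chat_template.py`.

```python
import re
from typing import Callable, Dict, List, Optional

# Stand-ins for the real registries in sglang.lang.chat_template.
chat_template_registry: Dict[str, str] = {
    "default": "<default template>",
    "qwen2-vl": "<qwen2-vl template>",
    "llama-3-instruct": "<llama-3-instruct template>",
}
matching_function_registry: List[Callable[[str], Optional[str]]] = []


def register_chat_template_matching_function(func):
    matching_function_registry.append(func)
    return func


def get_chat_template(name: str) -> str:
    return chat_template_registry[name]


def get_chat_template_by_model_path(model_path: str) -> str:
    # Matchers return a template *name*; the registry lookup now happens
    # in exactly one place instead of inside every matcher.
    for matching_func in matching_function_registry:
        template_name = matching_func(model_path)
        if template_name is not None:
            return get_chat_template(template_name)
    return get_chat_template("default")


@register_chat_template_matching_function
def match_qwen_vl(model_path: str) -> Optional[str]:
    # One case-insensitive regex replaces the old pair of
    # '"qwen" in model_path.lower()' / '"vl" in model_path.lower()' checks.
    if re.search(r"qwen.*vl", model_path, re.IGNORECASE):
        return "qwen2-vl"
    return None


@register_chat_template_matching_function
def match_llama3_instruct(model_path: str) -> Optional[str]:
    if re.search(r"llama-3.*instruct", model_path, re.IGNORECASE):
        return "llama-3-instruct"
    return None


if __name__ == "__main__":
    # A matched path resolves through the named template; unknown paths fall back.
    print(get_chat_template_by_model_path("Qwen/Qwen2.5-VL-7B-Instruct"))  # <qwen2-vl template>
    print(get_chat_template_by_model_path("some/unknown-model"))           # <default template>
```

Returning names keeps each matcher free of registry lookups, so the `"default"` fallback lives in one function, and in this sketch a stale name fails loudly with a `KeyError` inside `get_chat_template` instead of silently handing back a mismatched template object.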