diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb
index 2e8fdb5af..2ce921f50 100644
--- a/docs/backend/openai_api_vision.ipynb
+++ b/docs/backend/openai_api_vision.ipynb
@@ -24,7 +24,8 @@
     "\n",
     "Launch the server in your terminal and wait for it to initialize.\n",
     "\n",
-    "Remember to add `--chat-template llama_3_vision` to specify the vision chat template, otherwise the server only supports text.\n",
+    "**Remember to add `--chat-template llama_3_vision` to specify the vision chat template; otherwise, the server only supports text and performance may degrade.**\n",
+    "\n",
     "We need to specify `--chat-template` for vision language models because the chat template provided in Hugging Face tokenizer only supports text."
    ]
   },
diff --git a/docs/backend/server_arguments.md b/docs/backend/server_arguments.md
index c195e3ae2..8d8e71de9 100644
--- a/docs/backend/server_arguments.md
+++ b/docs/backend/server_arguments.md
@@ -56,6 +56,9 @@ Please consult the documentation below to learn more about the parameters you ma
 * `json_model_override_args`: Override model config with the provided JSON.
 * `delete_ckpt_after_loading`: Delete the model checkpoint after loading the model.
 
+> [!IMPORTANT]
+> **Make sure the correct `chat_template` is passed, or performance degradation may occur.**
+
 ## Serving: HTTP & API
 
 ### HTTP Server configuration
diff --git a/python/sglang/lang/chat_template.py b/python/sglang/lang/chat_template.py
index a2c91c561..08cc80931 100644
--- a/python/sglang/lang/chat_template.py
+++ b/python/sglang/lang/chat_template.py
@@ -353,7 +353,6 @@ register_chat_template(
     )
 )
 
-
 register_chat_template(
     ChatTemplate(
         name="deepseek-v3",
@@ -428,12 +427,13 @@ def match_chat_ml(model_path: str):
     if "tinyllama" in model_path:
         return get_chat_template("chatml")
     # Now the suffix for qwen2 chat model is "instruct"
-    if (
-        "qwen" in model_path
-        and ("chat" in model_path or "instruct" in model_path)
-        and ("llava" not in model_path)
-    ):
-        return get_chat_template("qwen")
+    if "qwen" in model_path:
+        if "vl" in model_path:
+            return get_chat_template("qwen2-vl")
+        if ("chat" in model_path or "instruct" in model_path) and (
+            "llava" not in model_path
+        ):
+            return get_chat_template("qwen")
     if (
         "llava-v1.6-34b" in model_path
         or "llava-v1.6-yi-34b" in model_path
@@ -459,6 +459,13 @@ def match_gemma_it(model_path: str):
         return get_chat_template("gemma-it")
 
 
+@register_chat_template_matching_function
+def match_openbmb_minicpm(model_path: str):
+    model_path = model_path.lower()
+    if "minicpm" in model_path:
+        return get_chat_template("minicpmv")
+
+
 @register_chat_template_matching_function
 def match_c4ai_command_r(model_path: str):
     model_path = model_path.lower()
diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py
index 93bd184c6..b35b656d6 100644
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -438,7 +438,9 @@ def _launch_subprocesses(server_args: ServerArgs) -> Tuple[TokenizerManager, Dic
     # Launch tokenizer process
     tokenizer_manager = TokenizerManager(server_args, port_args)
     if server_args.chat_template:
-        load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)
+        load_chat_template_for_openai_api(
+            tokenizer_manager, server_args.chat_template, server_args.model_path
+        )
 
     # Wait for the model to finish loading
     scheduler_infos = []
diff --git a/python/sglang/srt/models/llava.py b/python/sglang/srt/models/llava.py
index c8ce9302b..a1f06e186 100644
--- a/python/sglang/srt/models/llava.py
+++ b/python/sglang/srt/models/llava.py
@@ -449,7 +449,8 @@ class LlavaBaseForCausalLM(nn.Module):
         projector_weights = {
             "model.mm_projector.0": "multi_modal_projector.linear_1",
             "model.mm_projector.2": "multi_modal_projector.linear_2",
-            "model.vision_tower.vision_tower": "vision_tower",  # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
+            "model.vision_tower.vision_tower": "vision_tower",
+            # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
             "model.image_newline": "language_model.model.image_newline",
         }
         params_dict = dict(self.named_parameters())
diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py
index 6687a4c01..8972d42bc 100644
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -20,12 +20,14 @@ import os
 import time
 import uuid
 from http import HTTPStatus
-from typing import Dict, List, Optional
+from typing import Dict, List
 
 from fastapi import HTTPException, Request, UploadFile
 from fastapi.responses import ORJSONResponse, StreamingResponse
 from pydantic import ValidationError
 
+from sglang.lang.chat_template import get_chat_template_by_model_path
+
 try:
     from outlines.fsm.json_schema import convert_json_schema_to_str
 except ImportError:
@@ -92,7 +94,6 @@ file_id_response: Dict[str, FileResponse] = {}
 # map file id to file path in SGLang backend
 file_id_storage: Dict[str, str] = {}
 
-
 # backend storage directory
 storage_dir = None
 
@@ -116,12 +117,13 @@ def create_streaming_error_response(
     return json_str
 
 
-def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg):
+def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg, model_path):
     global chat_template_name
 
     logger.info(
         f"Use chat template for the OpenAI-compatible API server: {chat_template_arg}"
     )
+
     if not chat_template_exists(chat_template_arg):
         if not os.path.exists(chat_template_arg):
             raise RuntimeError(
@@ -163,6 +165,18 @@ def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg):
     else:
         chat_template_name = chat_template_arg
 
+    # Check the user-supplied chat template against the model's registered one.
+    chat_template = get_chat_template_by_model_path(model_path)
+    if chat_template is not None:
+        official_chat_template = chat_template.name
+        used_chat_template = chat_template_name
+        if official_chat_template != used_chat_template:
+            logger.warning(
+                f"Using chat template '{used_chat_template}', "
+                f"which differs from the official chat template '{official_chat_template}'. "
+                f"This discrepancy may lead to performance degradation."
+            )
+
 
 async def v1_files_create(file: UploadFile, purpose: str, file_storage_pth: str = None):
     try:
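
As a quick sanity check of the matcher change above, the sketch below reproduces the new mismatch warning outside the server. It is illustrative only: the model path and the `requested` value are made-up inputs, and it assumes the patched `match_chat_ml` together with the already-registered `qwen2-vl` template.

```python
from sglang.lang.chat_template import get_chat_template_by_model_path

# With the patched match_chat_ml, a Qwen2-VL checkpoint should resolve to the
# multimodal "qwen2-vl" template rather than the text-only "qwen" one.
template = get_chat_template_by_model_path("Qwen/Qwen2-VL-7B-Instruct")
if template is not None:
    print(template.name)  # expected: "qwen2-vl"

# Mirror of the new check in load_chat_template_for_openai_api: warn when the
# template the user passed via --chat-template differs from the matched one.
requested = "qwen"  # hypothetical user-supplied --chat-template value
if template is not None and template.name != requested:
    print(
        f"Using chat template '{requested}', which differs from the official "
        f"chat template '{template.name}'. This discrepancy may lead to "
        f"performance degradation."
    )
```

Note that the check only logs a warning rather than raising, so existing launch commands keep working even when the template names disagree.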