doc: emphasize and notify the usage of chat_template (#3589)
Co-authored-by: Chayenne <zhaochen20@outlook.com>
This commit is contained in:
@@ -24,7 +24,8 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"Launch the server in your terminal and wait for it to initialize.\n",
|
"Launch the server in your terminal and wait for it to initialize.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Remember to add `--chat-template llama_3_vision` to specify the vision chat template, otherwise the server only supports text.\n",
|
"**Remember to add `--chat-template llama_3_vision` to specify the vision chat template; otherwise, the server only supports text and performance degradation may occur.**\n",
|
||||||
|
"\n",
|
||||||
"We need to specify `--chat-template` for vision language models because the chat template provided in the Hugging Face tokenizer only supports text."
|
"We need to specify `--chat-template` for vision language models because the chat template provided in the Hugging Face tokenizer only supports text."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -56,6 +56,9 @@ Please consult the documentation below to learn more about the parameters you ma
|
|||||||
* `json_model_override_args`: Override model config with the provided JSON.
|
* `json_model_override_args`: Override model config with the provided JSON.
|
||||||
* `delete_ckpt_after_loading`: Delete the model checkpoint after loading the model.
|
* `delete_ckpt_after_loading`: Delete the model checkpoint after loading the model.
|
||||||
|
|
||||||
|
> [!IMPORTANT]
|
||||||
|
> **Make sure the correct `chat_template` is passed, or performance degradation may occur.**
|
||||||
|
|
||||||
## Serving: HTTP & API
|
## Serving: HTTP & API
|
||||||
|
|
||||||
### HTTP Server configuration
|
### HTTP Server configuration
|
||||||
|
|||||||
@@ -353,7 +353,6 @@ register_chat_template(
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
register_chat_template(
|
register_chat_template(
|
||||||
ChatTemplate(
|
ChatTemplate(
|
||||||
name="deepseek-v3",
|
name="deepseek-v3",
|
||||||
@@ -428,12 +427,13 @@ def match_chat_ml(model_path: str):
|
|||||||
if "tinyllama" in model_path:
|
if "tinyllama" in model_path:
|
||||||
return get_chat_template("chatml")
|
return get_chat_template("chatml")
|
||||||
# Now the suffix for qwen2 chat model is "instruct"
|
# Now the suffix for qwen2 chat model is "instruct"
|
||||||
if (
|
if "qwen" in model_path:
|
||||||
"qwen" in model_path
|
if "vl" in model_path:
|
||||||
and ("chat" in model_path or "instruct" in model_path)
|
return get_chat_template("qwen2-vl")
|
||||||
and ("llava" not in model_path)
|
if ("chat" in model_path or "instruct" in model_path) and (
|
||||||
):
|
"llava" not in model_path
|
||||||
return get_chat_template("qwen")
|
):
|
||||||
|
return get_chat_template("qwen")
|
||||||
if (
|
if (
|
||||||
"llava-v1.6-34b" in model_path
|
"llava-v1.6-34b" in model_path
|
||||||
or "llava-v1.6-yi-34b" in model_path
|
or "llava-v1.6-yi-34b" in model_path
|
||||||
@@ -459,6 +459,13 @@ def match_gemma_it(model_path: str):
|
|||||||
return get_chat_template("gemma-it")
|
return get_chat_template("gemma-it")
|
||||||
|
|
||||||
|
|
||||||
|
@register_chat_template_matching_function
|
||||||
|
def match_openbmb_minicpm(model_path: str):
|
||||||
|
model_path = model_path.lower()
|
||||||
|
if "minicpm" in model_path:
|
||||||
|
return get_chat_template("minicpmv")
|
||||||
|
|
||||||
|
|
||||||
@register_chat_template_matching_function
|
@register_chat_template_matching_function
|
||||||
def match_c4ai_command_r(model_path: str):
|
def match_c4ai_command_r(model_path: str):
|
||||||
model_path = model_path.lower()
|
model_path = model_path.lower()
|
||||||
|
|||||||
@@ -438,7 +438,9 @@ def _launch_subprocesses(server_args: ServerArgs) -> Tuple[TokenizerManager, Dic
|
|||||||
# Launch tokenizer process
|
# Launch tokenizer process
|
||||||
tokenizer_manager = TokenizerManager(server_args, port_args)
|
tokenizer_manager = TokenizerManager(server_args, port_args)
|
||||||
if server_args.chat_template:
|
if server_args.chat_template:
|
||||||
load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)
|
load_chat_template_for_openai_api(
|
||||||
|
tokenizer_manager, server_args.chat_template, server_args.model_path
|
||||||
|
)
|
||||||
|
|
||||||
# Wait for the model to finish loading
|
# Wait for the model to finish loading
|
||||||
scheduler_infos = []
|
scheduler_infos = []
|
||||||
|
|||||||
@@ -449,7 +449,8 @@ class LlavaBaseForCausalLM(nn.Module):
|
|||||||
projector_weights = {
|
projector_weights = {
|
||||||
"model.mm_projector.0": "multi_modal_projector.linear_1",
|
"model.mm_projector.0": "multi_modal_projector.linear_1",
|
||||||
"model.mm_projector.2": "multi_modal_projector.linear_2",
|
"model.mm_projector.2": "multi_modal_projector.linear_2",
|
||||||
"model.vision_tower.vision_tower": "vision_tower", # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
|
"model.vision_tower.vision_tower": "vision_tower",
|
||||||
|
# Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
|
||||||
"model.image_newline": "language_model.model.image_newline",
|
"model.image_newline": "language_model.model.image_newline",
|
||||||
}
|
}
|
||||||
params_dict = dict(self.named_parameters())
|
params_dict = dict(self.named_parameters())
|
||||||
|
|||||||
@@ -20,12 +20,14 @@ import os
|
|||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
from http import HTTPStatus
|
from http import HTTPStatus
|
||||||
from typing import Dict, List, Optional
|
from typing import Dict, List
|
||||||
|
|
||||||
from fastapi import HTTPException, Request, UploadFile
|
from fastapi import HTTPException, Request, UploadFile
|
||||||
from fastapi.responses import ORJSONResponse, StreamingResponse
|
from fastapi.responses import ORJSONResponse, StreamingResponse
|
||||||
from pydantic import ValidationError
|
from pydantic import ValidationError
|
||||||
|
|
||||||
|
from sglang.lang.chat_template import get_chat_template_by_model_path
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from outlines.fsm.json_schema import convert_json_schema_to_str
|
from outlines.fsm.json_schema import convert_json_schema_to_str
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@@ -92,7 +94,6 @@ file_id_response: Dict[str, FileResponse] = {}
|
|||||||
# map file id to file path in SGLang backend
|
# map file id to file path in SGLang backend
|
||||||
file_id_storage: Dict[str, str] = {}
|
file_id_storage: Dict[str, str] = {}
|
||||||
|
|
||||||
|
|
||||||
# backend storage directory
|
# backend storage directory
|
||||||
storage_dir = None
|
storage_dir = None
|
||||||
|
|
||||||
@@ -116,12 +117,13 @@ def create_streaming_error_response(
|
|||||||
return json_str
|
return json_str
|
||||||
|
|
||||||
|
|
||||||
def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg):
|
def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg, model_path):
|
||||||
global chat_template_name
|
global chat_template_name
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Use chat template for the OpenAI-compatible API server: {chat_template_arg}"
|
f"Use chat template for the OpenAI-compatible API server: {chat_template_arg}"
|
||||||
)
|
)
|
||||||
|
|
||||||
if not chat_template_exists(chat_template_arg):
|
if not chat_template_exists(chat_template_arg):
|
||||||
if not os.path.exists(chat_template_arg):
|
if not os.path.exists(chat_template_arg):
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
@@ -163,6 +165,18 @@ def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg):
|
|||||||
else:
|
else:
|
||||||
chat_template_name = chat_template_arg
|
chat_template_name = chat_template_arg
|
||||||
|
|
||||||
|
# check chat-template
|
||||||
|
chat_template = get_chat_template_by_model_path(model_path)
|
||||||
|
if chat_template is not None:
|
||||||
|
official_chat_template = chat_template.name
|
||||||
|
used_chat_template = chat_template_name
|
||||||
|
if official_chat_template != used_chat_template:
|
||||||
|
logger.warning(
|
||||||
|
f"Using a chat_template: '{used_chat_template}', "
|
||||||
|
f"which is different from official chat template: '{official_chat_template}', "
|
||||||
|
f"This discrepancy may lead to performance degradation."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
async def v1_files_create(file: UploadFile, purpose: str, file_storage_pth: str = None):
|
async def v1_files_create(file: UploadFile, purpose: str, file_storage_pth: str = None):
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user