diff --git a/docs/references/deepseek.md b/docs/references/deepseek.md index 7fe934f17..1486a2c54 100644 --- a/docs/references/deepseek.md +++ b/docs/references/deepseek.md @@ -163,6 +163,35 @@ When using FlashInfer MLA wrapper (`--attention-backend flashinfer`) with specul See [Separate Reasoning](https://docs.sglang.ai/backend/separate_reasoning.html). + +### Function calling for DeepSeek Models + +Add the argument `--tool-call-parser deepseekv3` to enable this feature. For example (running on 1 * H20 node): + +``` +python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --port 30000 --host 0.0.0.0 --mem-fraction-static 0.9 --disable-cuda-graph --tool-call-parser deepseekv3 +``` + +Sample Request: + +``` +curl "http://127.0.0.1:30000/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-d '{"temperature": 0, "max_tokens": 100, "model": "deepseek-ai/DeepSeek-V3-0324", "tools": [{"type": "function", "function": {"name": "query_weather", "description": "Get weather of a city, the user should supply a city first", "parameters": {"type": "object", "properties": {"city": {"type": "string", "description": "The city, e.g. Beijing"}}, "required": ["city"]}}}], "messages": [{"role": "user", "content": "What's the weather like in Qingdao today"}]}' +``` + +Expected Response + +``` +{"id": "62af80528930423a82c806651ec66e7c", "object": "chat.completion", "created": 1744431333, "model": "deepseek-ai/DeepSeek-V3-0324", "choices": [{"index": 0, "message": {"role": "assistant", "content": null, "reasoning_content": null, "tool_calls": [{"id": "0", "type": "function", "function": {"name": "query_weather", "arguments": "{\\"city\\": \\"Qingdao\\"}"}}]}, "logprobs": null, "finish_reason": "tool_calls", "matched_stop": null}], "usage": {"prompt_tokens": 118, "total_tokens": 140, "completion_tokens": 22, "prompt_tokens_details": null}} + +``` + +Important Notes: +1. Use a lower `"temperature"` value for better results. +2. 
Currently, the function calling implementation for deepseek is incompatible with streaming requests. + + ## FAQ 1. **Question**: What should I do if model loading takes too long and NCCL timeout occurs? diff --git a/python/sglang/srt/function_call_parser.py b/python/sglang/srt/function_call_parser.py index 092cf2b4b..484f39490 100644 --- a/python/sglang/srt/function_call_parser.py +++ b/python/sglang/srt/function_call_parser.py @@ -25,6 +25,7 @@ TOOLS_TAG_LIST = [ "", "<|python_tag|>", "[TOOL_CALLS]", + "<|tool▁calls▁begin|>", ] @@ -477,6 +478,65 @@ class Llama32Detector(BaseFormatDetector): ) + +class DeepSeekV3Detector(BaseFormatDetector): + """ + Detector for DeepSeek models. + Assumes function call format: + '<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n```json\n{"location": "Tokyo"}\n```<|tool▁call▁end|>\n<|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n```json\n{"location": "Paris"}\n```<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|> + """ + + def __init__(self): + super().__init__() + self.bot_token = "<|tool▁calls▁begin|>" + self.eot_token = "<|tool▁calls▁end|>" + # NOTE: the literal "|" in the special tokens must be escaped ("\|") — an unescaped "|" is the regex alternation operator, which would split these patterns into useless fragments like "<" and ">". + self.func_call_regex = r"<\|tool▁call▁begin\|>.*?<\|tool▁call▁end\|>" + self.func_detail_regex = r"<\|tool▁call▁begin\|>(.*)<\|tool▁sep\|>(.*)\n```json\n(.*)\n```<\|tool▁call▁end\|>" + + def has_tool_call(self, text: str) -> bool: + """Check if the text contains a deepseek format tool call.""" + return self.bot_token in text + + def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult: + """ + One-time parsing: Detects and parses tool calls in the provided text. + + :param text: The complete text to parse. + :param tools: List of available tools. + :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls. 
+ """ + idx = text.find(self.bot_token) + normal_text = text[:idx].strip() if idx != -1 else text + if self.bot_token not in text: + return StreamingParseResult(normal_text=normal_text, calls=[]) + match_result_list = re.findall(self.func_call_regex, text, re.DOTALL) + calls = [] + try: + for match_result in match_result_list: + # Get function name + func_detail = re.search(self.func_detail_regex, match_result, re.DOTALL) + func_name = func_detail.group(2) + func_args = func_detail.group(3) + func_args = json.loads(func_args) + # construct match_result for parse_base_json + match_result = {"name": func_name, "parameters": func_args} + calls.extend(self.parse_base_json(match_result, tools)) + return StreamingParseResult(normal_text=normal_text, calls=calls) + except Exception as e: + logger.error(f"Error in detect_and_parse: {e}") + # return the normal text if parsing fails + return StreamingParseResult(normal_text=text) + + def structure_info(self) -> _GetInfoFunc: + return lambda name: StructureInfo( + begin="<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>" + + name + + "\n```json\n", + end="\n```<|tool▁call▁end|><|tool▁calls▁end|>", + trigger="<|tool▁calls▁begin|>", + ) + + class MultiFormatParser: def __init__(self, detectors: List[BaseFormatDetector]): """ @@ -543,6 +602,7 @@ class FunctionCallParser: "llama3": Llama32Detector, "qwen25": Qwen25Detector, "mistral": MistralDetector, + "deepseekv3": DeepSeekV3Detector, } def __init__(self, tools: List[Tool], tool_call_parser: str): diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 304a67ac1..e1c120055 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -938,6 +938,35 @@ def v1_chat_generate_request( if chat_template_name is None: openai_compatible_messages = [] + if ( + tools + and tokenizer_manager.server_args.tool_call_parser == "deepseekv3" + ): + # add function call prompt to deepseekv3 + 
openai_compatible_messages.append( + { + "role": "system", + "content": """You are a helpful Assistant. + ## Tools + ### Function + You have the following functions available: + """ + + "".join( + [ + f""" + - `{tool['name']}`: + ```json + {json.dumps(tool)} + ``` + """ + for tool in tools + ] + ), + } + ) + # TODO fix the compatible issues with xgrammar + strict_tag = None + for message in request.messages: if isinstance(message.content, str): openai_compatible_messages.append( diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 73d8db5a1..e1768b52e 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -1087,7 +1087,7 @@ class ServerArgs: parser.add_argument( "--tool-call-parser", type=str, - choices=["qwen25", "mistral", "llama3"], + choices=["qwen25", "mistral", "llama3", "deepseekv3"], default=ServerArgs.tool_call_parser, - help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', and 'llama3'.", + help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', and 'deepseekv3'.", )