Files

219 lines
8.6 KiB
C
Raw Permalink Normal View History

// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
#pragma once
#include "common.h"
#include <functional>
#include <chrono>
#include <string>
#include <vector>
#include <map>
struct common_chat_templates;
struct common_chat_tool_call {
std::string name;
std::string arguments;
std::string id;
bool operator==(const common_chat_tool_call & other) const {
return name == other.name && arguments == other.arguments && id == other.id;
}
};
struct common_chat_msg_content_part {
std::string type;
std::string text;
bool operator==(const common_chat_msg_content_part & other) const {
return type == other.type && text == other.text;
}
};
struct common_chat_msg {
std::string role;
std::string content;
std::vector<common_chat_msg_content_part> content_parts;
std::vector<common_chat_tool_call> tool_calls;
std::string reasoning_content;
std::string tool_name;
std::string tool_call_id;
template <class T> T to_json_oaicompat() const;
bool empty() const {
return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
}
void set_tool_call_ids(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
for (auto i = 0u; i < tool_calls.size(); i++) {
if (ids_cache.size() <= i) {
auto id = tool_calls[i].id;
if (id.empty()) {
id = gen_tool_call_id();
}
ids_cache.push_back(id);
}
tool_calls[i].id = ids_cache[i];
}
}
bool operator==(const common_chat_msg & other) const {
return role == other.role
&& content == other.content
&& content_parts == other.content_parts
&& tool_calls == other.tool_calls
&& reasoning_content == other.reasoning_content
&& tool_name == other.tool_name
&& tool_call_id == other.tool_call_id;
}
bool operator!=(const common_chat_msg & other) const {
return !(*this == other);
}
};
struct common_chat_msg_diff {
std::string reasoning_content_delta;
std::string content_delta;
size_t tool_call_index = std::string::npos;
common_chat_tool_call tool_call_delta;
static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
bool operator==(const common_chat_msg_diff & other) const {
return content_delta == other.content_delta
&& tool_call_index == other.tool_call_index
&& tool_call_delta == other.tool_call_delta;
}
};
struct common_chat_tool {
std::string name;
std::string description;
std::string parameters;
};
enum common_chat_tool_choice {
COMMON_CHAT_TOOL_CHOICE_AUTO,
COMMON_CHAT_TOOL_CHOICE_REQUIRED,
COMMON_CHAT_TOOL_CHOICE_NONE,
};
enum common_chat_format {
COMMON_CHAT_FORMAT_CONTENT_ONLY,
COMMON_CHAT_FORMAT_GENERIC,
COMMON_CHAT_FORMAT_MISTRAL_NEMO,
COMMON_CHAT_FORMAT_MAGISTRAL,
COMMON_CHAT_FORMAT_LLAMA_3_X,
COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
COMMON_CHAT_FORMAT_DEEPSEEK_R1,
COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
chat : Deepseek V3.1 reasoning and tool calling support (OpenAI Style) (#15533) * Add DeepSeek V3.1 thinking mode support - Added COMMON_CHAT_FORMAT_DEEPSEEK_V3_1 enum value - Created common_chat_params_init_deepseek_v3_1() function (currently uses R1 implementation) - Created common_chat_parse_deepseek_v3_1() function that handles V3.1 thinking format: - Extracts reasoning content before '</think>' tag into reasoning_content - Extracts regular content after '</think>' tag into content - No opening '<think>' tag in V3.1 format - Added detection logic for V3.1 templates based on pattern: 'message['prefix'] is defined and message['prefix'] and thinking' - Added V3.1 case to parsing switch statement This addresses the issue where V3.1 outputs reasoning content followed by '</think>' and then regular content without the opening '<think>' tag. * Another attempt by V3.1 non-thinking * Fix test, but it's not asserting anything. * Ignore vim swap files in tests dir * Update the test * Try using try_find_literal instead of regex * passing test * Revert "Try using try_find_literal instead of regex" This reverts commit c50d887ec2780dd9e6b8b397e92347d3db8d5575. * Remove unnecessary change * Remove comment * Add code to handle non-thinking mode. * Try to set message['prefix'] when thinking is enabled. * This fixes reasoning, but breaks normal content. We need state in the chat parser. * DeepSeek V3.1 thinking is now the default. Disable with `--reasoning-budget 0`. * Simplify (DeepSeek V3.1 reasoning) * Fix sign inversion bug * Add some tool calling code (not working). * Tool calls working in non-reasoning mode. * Attempt a unit test for tool call parsing. * Passing test * Add tests for both happy path and broken fenced DeepSeek V3.1 tool call variants. * Passing DeepSeek V3.1 tool call tests, but model is not working. * Revert assistance response prefill change. Not my monkeys. * Add fenced_thinking unit test variant. Passes, but thinking tool calling still isn't working for some reason. * Tests pass in reasoning mode. Also e2e tool test passes. * Make a copy of the parse_json_tool_calls function for deepseek-v3.1 so as to not accidentally introduce regressions. * Fix thinking_forced_open logic. tool calling broken. Need to add another test case. * That's what I get for cargo culting a newline. * Add multi tool call test for deepseek v3.1 non-reasoning * Move test, remove .gitignore change * Place deepseek-v3.1 reasoning test directly into existing reasoning function per CISC's request. * Address whitespace CI failure. * Merge two assert_equals per CISC's request. * Add DeepSeek-V3.1 tests to tests/test-chat.cpp per CISC's request. * Merge deepseek V3.1 and regular parse_json_tool_calls() function behaviors by adding optional update_cursor argument. * Update tests/test-chat-parser.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update tests/test-chat-parser.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update tests/test-chat-parser.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update tests/test-chat-parser.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update tests/test-chat-parser.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update tests/test-chat-parser.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update tests/test-chat-parser.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update tests/test-chat-parser.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update tests/test-chat-parser.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * DeepSeek V3.1 fix reasoning_format none * Strip grammar down to strictly what we expect based on model card. Throw out parts we cargo culted from R1 that don't make sense. * Update tests/test-chat-parser.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * DeepSeek V3.1 - Add edge case where thinking is forced open, there is tool calling in the reasoning content, but then the model just stops the output without closing the </think> tag, so it's not a partial. In this case, use the tool call in the reasoning content. * DeepSeek V3.1 - simplify update_cursor * Update common/chat.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update common/chat.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update common/chat.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Fix indent --------- Co-authored-by: openhands <openhands@all-hands.dev> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-09-08 10:59:48 -04:00
COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
COMMON_CHAT_FORMAT_HERMES_2_PRO,
COMMON_CHAT_FORMAT_COMMAND_R7B,
COMMON_CHAT_FORMAT_GRANITE,
llama : add gpt-oss (#15091) * oai moe * compat with new checkpoint * add attn sink impl * add rope scaling yarn * logits match with latest transformers code * wip chat template * rm trailing space * use ggml_scale_bias * rm redundant is_swa_all * convert interleaved gate_up * graph : fix activation function to match reference (#7) * vocab : handle o200k_harmony special tokens * ggml : add attention sinks support (#1) * llama : add attn sinks * ggml : add attn sinks * cuda : add attn sinks * vulkan : add support for sinks in softmax remove unnecessary return * ggml : add fused swiglu_oai op (#11) * ggml : add fused swiglu_oai op * Update ggml/src/ggml-cpu/ops.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * update CUDA impl * cont : metal impl * add vulkan impl * test-backend-ops : more test cases, clean up * llama : remove unfused impl * remove extra lines --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: slaren <slarengh@gmail.com> * repack mxfp4 upon conversion * clean up a bit * enable thinking * add quick hack to render only some special tokens * fix bf16 conversion * remove vocab hack * webui ok * support chat parsing for gpt-oss * fix webui * direct mapping mxfp4, FINALLY * force using mxfp4 * properly use lazy tensor * ggml : add mxfp4 ggml : use e8m0 conversion instead of powf Co-authored-by: Diego Devesa <slarengh@gmail.com> change kvalues_mxfp4 table to match e2m1 (#6) metal : remove quantization for now (not used) cuda : fix disabled CUDA graphs due to ffn moe bias vulkan : add support for mxfp4 cont : add cm2 dequant * ggml : add ggml_add_id (#13) * ggml : add ggml_add_id * add cuda impl * llama : add weight support check for add_id * perf opt * add vulkan impl * rename cuda files * add metal impl * allow in-place ggml_add_id * llama : keep biases on CPU with --cpu-moe * llama : fix compile error ggml-ci * cuda : add fallback for __nv_cvt_e8m0_to_bf16raw ggml-ci * cleanup ggml-ci * sycl : fix supports_op for MXFP4 ggml-ci * fix Unknown reasoning format * ggml-cpu : fix AVX build ggml-ci * fix hip build ggml-ci * cuda : add mxfp4 dequantization support for cuBLAS ggml-ci * ggml-cpu : fix mxfp4 fallback definitions for some architectures ggml-ci * cuda : fix version required for __nv_cvt_e8m0_to_bf16raw --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co> Co-authored-by: slaren <slarengh@gmail.com>
2025-08-05 22:10:36 +03:00
COMMON_CHAT_FORMAT_GPT_OSS,
COMMON_CHAT_FORMAT_SEED_OSS,
COMMON_CHAT_FORMAT_NEMOTRON_V2,
model : Apertus model implementation (#15852) * First attempt * No permute during convert (fixes qk tensors), proper norm application. * RoPE = NeoX * Coherence! * Migrate xielu params from tensors to hyperparameters * Simple CUDA kernel * Revert stupid LLM refactorings * Chat template support * configchecker / flake8 errors * Reorder unary.cu * I do conclude that LLMs are, in fact, stupid. * Fix after merge * Final newline * Make xIELU an UNARY_OP * Final newline * Correctly account for parameter shift * Argh. * Update ggml/src/ggml-cpu/unary-ops.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Refactor: remove unused methods, inline and factorize softplus, add const modifiers * Revert CUDA changes, implement xIELU as a separate OP * Pesky newline * Add float2half / half2float for F16 inputs/outputs * CUDA variants, attempt 2 * Actually, attempt 3 * Update ggml/src/ggml-cuda/unary.cu Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * Missing convert header * Proper formula and reference for xIELU in the comments. * Modify unary-ops.cpp to add the functor-based logic besides the template system to retain optimizations * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Add tensor mappings for Apertus to global list instead * Fix lazy on scalars * Update ggml/src/ggml-cuda/unary.cu Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * Add comment about the constraints on positive/negative alpha * Change `softplus` to `ggml_softplus` --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: Johannes Gäßler <johannesg@5d6.de> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-10-02 19:43:22 +02:00
COMMON_CHAT_FORMAT_APERTUS,
COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS,
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
struct common_chat_templates_inputs {
std::vector<common_chat_msg> messages;
std::string grammar;
std::string json_schema;
bool add_generation_prompt = true;
bool use_jinja = true;
// Parameters below only supported when use_jinja is true
std::vector<common_chat_tool> tools;
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
bool parallel_tool_calls = false;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
bool enable_thinking = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
std::map<std::string, std::string> chat_template_kwargs;
bool add_bos = false;
bool add_eos = false;
};
struct common_chat_params {
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
std::string prompt;
std::string grammar;
bool grammar_lazy = false;
bool thinking_forced_open = false;
std::vector<common_grammar_trigger> grammar_triggers;
std::vector<std::string> preserved_tokens;
std::vector<std::string> additional_stops;
};
struct common_chat_syntax {
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
// Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
bool reasoning_in_content = false;
bool thinking_forced_open = false;
bool parse_tool_calls = true;
};
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
void common_chat_templates_free(struct common_chat_templates * tmpls);
struct common_chat_templates_deleter { void operator()(common_chat_templates * tmpls) { common_chat_templates_free(tmpls); } };
typedef std::unique_ptr<struct common_chat_templates, common_chat_templates_deleter> common_chat_templates_ptr;
common_chat_templates_ptr common_chat_templates_init(
const struct llama_model * model,
const std::string & chat_template_override,
const std::string & bos_token_override = "",
const std::string & eos_token_override = "");
bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr);
struct common_chat_params common_chat_templates_apply(
const struct common_chat_templates * tmpls,
const struct common_chat_templates_inputs & inputs);
// Format single message, while taking into account the position of that message in chat history
std::string common_chat_format_single(
const struct common_chat_templates * tmpls,
const std::vector<common_chat_msg> & past_msg,
const common_chat_msg & new_msg,
bool add_ass,
bool use_jinja);
// Returns an example of formatted chat
std::string common_chat_format_example(
const struct common_chat_templates * tmpls,
bool use_jinja,
const std::map<std::string, std::string> & chat_template_kwargs);
const char* common_chat_format_name(common_chat_format format);
const char* common_reasoning_format_name(common_reasoning_format format);
common_reasoning_format common_reasoning_format_from_name(const std::string & format);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
Thinking model disabled assistant prefill (#15404) * feat: Set enable_thinking IFF not disabled and supported Branch: gabe-l-hart/thinking-model-disabled-agent-prefill Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Fix inverted logic condition for prefill error Branch: gabe-l-hart/thinking-model-disabled-agent-prefill Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Always parse the enable_thinking kwarg to overwrite the default value From what I can tell, this started as a Qwen3-specific keyword, but from the use in `chat.cpp` translates this inputs.enable_thinking to the right thinking kwarg for the given model, this is now more of a standardized kwarg, so it should always override the default value when sent as part of the chat_template_kwargs field in the API. Branch: gabe-l-hart/thinking-model-disabled-agent-prefill Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Don't limit tempalte expansion check to jinja With the use_jinja check, non-jinja models would enable thinking and always fail assistant prefill Branch: gabe-l-hart/thinking-model-disabled-agent-prefill Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Add the error text to json type errors in json_value Branch: gabe-l-hart/thinking-model-disabled-agent-prefill Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Explicitly reject string values for "enable_thinking" There are too many possible "truthy" / "falsy" strings and too many ambiguous strings that don't have a clear truthy/falsy value, so the simplest thing to do here is to reject the request. Ideally, this would be a 422 (Unprocessable Entity), but right now it's coming back as a 500. Branch: gabe-l-hart/thinking-model-disabled-agent-prefill Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * refactor: Move logic for detecting template enable_thinking support to common Branch: gabe-l-hart/thinking-model-disabled-agent-prefill Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Use raw pointer for common chat template function Branch: gabe-l-hart/thinking-model-disabled-agent-prefill Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> --------- Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2025-09-05 14:31:24 -06:00
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
// Parses a JSON array of messages in OpenAI's chat completion API format.
// T can be std::string containing JSON or nlohmann::ordered_json
template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
// Parses a JSON array of tools in OpenAI's chat completion tool call API format.
// T can be std::string containing JSON or nlohmann::ordered_json
template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);