sync from b7516

This commit is contained in:
2026-01-16 11:16:14 +08:00
parent f4ae4cc7da
commit 6ee41dd9e3
380 changed files with 18435 additions and 38806 deletions

View File

@@ -96,10 +96,6 @@ struct task_result_state {
std::string generated_text; // append new chunks of generated text here
std::vector<std::string> generated_tool_call_ids;
// for Anthropic API streaming: track content block state across chunks
bool anthropic_thinking_block_started = false;
bool anthropic_text_block_started = false;
task_result_state(const common_chat_syntax & oaicompat_chat_syntax)
: oaicompat_chat_syntax(oaicompat_chat_syntax) {}
@@ -121,10 +117,8 @@ struct server_task {
int id_slot = -1;
// used by parallel sampling (multiple completions from same prompt)
int id_parent = -1;
// temporary store of child tasks for scheduling
// note: accessing elements is invalid after the task is moved to server_slot
std::vector<server_task> child_tasks;
size_t n_children = 0; // number of tasks reusing this prompt
int id_parent = -1;
// used by SERVER_TASK_TYPE_INFERENCE
task_params params;
@@ -158,36 +152,6 @@ struct server_task {
return tokens.size();
}
bool need_embd() const {
switch (type) {
case SERVER_TASK_TYPE_EMBEDDING:
case SERVER_TASK_TYPE_RERANK:
return true;
default:
return false;
}
}
bool need_logits() const {
switch (type) {
case SERVER_TASK_TYPE_COMPLETION:
case SERVER_TASK_TYPE_INFILL:
return true;
default:
return false;
}
}
bool need_sampling() const {
switch (type) {
case SERVER_TASK_TYPE_COMPLETION:
case SERVER_TASK_TYPE_INFILL:
return true;
default:
return false;
}
}
static task_params params_from_json_cmpl(
const llama_vocab * vocab,
const common_params & params_base,
@@ -199,30 +163,18 @@ struct server_task {
std::unordered_set<int> ids(tasks.size());
for (size_t i = 0; i < tasks.size(); i++) {
ids.insert(tasks[i].id);
for (auto & child : tasks[i].child_tasks) {
ids.insert(child.id);
}
}
return ids;
}
void add_child(int id_parent, int id_child) {
server_task create_child(int id_parent, int id_child) const {
server_task copy;
copy.id = id_child;
copy.id_parent = id_parent;
copy.params = params;
copy.type = type;
copy.tokens = tokens.clone();
copy.id_slot = -1; // child tasks cannot specify slot
// use different sampling seed for each child
// note: https://github.com/ggml-org/llama.cpp/pull/18700#discussion_r2675115723
if (copy.params.sampling.seed != LLAMA_DEFAULT_SEED) {
copy.params.sampling.seed += (uint32_t)child_tasks.size() + 1;
}
child_tasks.push_back(std::move(copy));
return copy;
}
// the task will be moved into queue, then onto slots
@@ -230,14 +182,6 @@ struct server_task {
task_result_state create_state() const {
return task_result_state(params.oaicompat_chat_syntax);
}
bool is_parent() const {
return child_tasks.size() > 0;
}
bool is_child() const {
return id_parent != -1;
}
};
struct result_timings {
@@ -393,12 +337,6 @@ struct server_task_result_cmpl_partial : server_task_result {
std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
bool is_updated = false;
// for Anthropic API: track if any reasoning content has been generated
bool anthropic_has_reasoning = false;
// Streaming state copied from task_result_state for this chunk
bool anthropic_thinking_block_started = false;
bool anthropic_text_block_started = false;
virtual bool is_stop() override {
return false; // in stream mode, partial responses are not considered stop
}
@@ -408,22 +346,6 @@ struct server_task_result_cmpl_partial : server_task_result {
virtual void update(task_result_state & state) override {
is_updated = true;
state.update_chat_msg(content, true, oaicompat_msg_diffs);
// track if the accumulated message has any reasoning content
anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();
// Copy current state for use in to_json_anthropic() (reflects state BEFORE this chunk)
anthropic_thinking_block_started = state.anthropic_thinking_block_started;
anthropic_text_block_started = state.anthropic_text_block_started;
// Pre-compute state updates based on diffs (for next chunk)
for (const auto & diff : oaicompat_msg_diffs) {
if (!diff.reasoning_content_delta.empty() && !state.anthropic_thinking_block_started) {
state.anthropic_thinking_block_started = true;
}
if (!diff.content_delta.empty() && !state.anthropic_text_block_started) {
state.anthropic_text_block_started = true;
}
}
}
json to_json_non_oaicompat();