sync from b7516
@@ -96,10 +96,6 @@ struct task_result_state {
    std::string generated_text; // append new chunks of generated text here
    std::vector<std::string> generated_tool_call_ids;

    // for Anthropic API streaming: track content block state across chunks
    bool anthropic_thinking_block_started = false;
    bool anthropic_text_block_started = false;

    task_result_state(const common_chat_syntax & oaicompat_chat_syntax)
        : oaicompat_chat_syntax(oaicompat_chat_syntax) {}
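For background: Anthropic's Messages streaming format wraps every content block in content_block_start / content_block_delta / content_block_stop events, and thinking and text are separate blocks, so the emitter must remember across chunks whether a given block is already open. A minimal sketch of that invariant, with a hypothetical emit callback standing in for the server's SSE writer:

    #include <functional>
    #include <string>

    // emit one "thinking" delta, opening the block exactly once per stream;
    // `started` plays the role of anthropic_thinking_block_started above
    static void stream_thinking(bool & started,
                                const std::string & delta, // assumed already JSON-escaped
                                const std::function<void(const std::string &)> & emit) {
        if (!started) {
            emit(R"({"type":"content_block_start","index":0,"content_block":{"type":"thinking","thinking":""}})");
            started = true;
        }
        emit(R"({"type":"content_block_delta","index":0,"delta":{"type":"thinking_delta","thinking":")" + delta + R"("}})");
    }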
@@ -121,10 +117,8 @@ struct server_task {
    int id_slot = -1;

    // used by parallel sampling (multiple completions from same prompt)
    int id_parent = -1;
    // temporary store of child tasks for scheduling
    // note: accessing elements is invalid after the task is moved to server_slot
    std::vector<server_task> child_tasks;
    size_t n_children = 0; // number of tasks reusing this prompt
    int id_parent = -1;

    // used by SERVER_TASK_TYPE_INFERENCE
    task_params params;
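These fields implement the fan-out for parallel sampling (the OpenAI-style `n` parameter): one request becomes a parent task plus n - 1 children that reuse the same prompt. A rough sketch of the wiring, assuming the surrounding server types; alloc_task_id is a hypothetical id allocator:

    // hypothetical fan-out for n completions of a single prompt
    server_task parent;
    parent.id = alloc_task_id();

    for (int i = 0; i < n - 1; i++) {
        server_task child;
        child.id        = alloc_task_id();
        child.id_parent = parent.id;             // marks the task as a child
        child.tokens    = parent.tokens.clone(); // same prompt tokens for every child
        parent.child_tasks.push_back(std::move(child));
    }
    parent.n_children = parent.child_tasks.size(); // number of tasks reusing this prompt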
@@ -158,36 +152,6 @@ struct server_task {
        return tokens.size();
    }

    bool need_embd() const {
        switch (type) {
            case SERVER_TASK_TYPE_EMBEDDING:
            case SERVER_TASK_TYPE_RERANK:
                return true;
            default:
                return false;
        }
    }

    bool need_logits() const {
        switch (type) {
            case SERVER_TASK_TYPE_COMPLETION:
            case SERVER_TASK_TYPE_INFILL:
                return true;
            default:
                return false;
        }
    }

    bool need_sampling() const {
        switch (type) {
            case SERVER_TASK_TYPE_COMPLETION:
            case SERVER_TASK_TYPE_INFILL:
                return true;
            default:
                return false;
        }
    }

    static task_params params_from_json_cmpl(
            const llama_vocab * vocab,
            const common_params & params_base,
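These predicates presumably steer what the backend is asked to produce: embedding and rerank tasks need pooled embeddings, while completion and infill need logits so the sampler can pick the next token. A simplified, hypothetical sketch of how such a flag can drive batch construction, using the common_batch_add helper from llama.cpp's common library:

    #include "common.h"

    // request logits only where the output will actually be used;
    // hypothetical wiring, not the server's real batching loop
    static void add_prompt_to_batch(llama_batch & batch, const std::vector<llama_token> & toks,
                                    llama_seq_id seq, bool need_logits) {
        for (size_t i = 0; i < toks.size(); i++) {
            const bool last = i + 1 == toks.size();
            // when sampling, only the final prompt token needs logits
            common_batch_add(batch, toks[i], (llama_pos) i, { seq }, need_logits && last);
        }
    }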
@@ -199,30 +163,18 @@ struct server_task {
        std::unordered_set<int> ids(tasks.size());
        for (size_t i = 0; i < tasks.size(); i++) {
            ids.insert(tasks[i].id);
            for (auto & child : tasks[i].child_tasks) {
                ids.insert(child.id);
            }
        }
        return ids;
    }

    void add_child(int id_parent, int id_child) {
    server_task create_child(int id_parent, int id_child) const {
        server_task copy;

        copy.id        = id_child;
        copy.id_parent = id_parent;
        copy.params    = params;
        copy.type      = type;
        copy.tokens    = tokens.clone();
        copy.id_slot   = -1; // child tasks cannot specify slot

        // use a different sampling seed for each child
        // note: https://github.com/ggml-org/llama.cpp/pull/18700#discussion_r2675115723
        if (copy.params.sampling.seed != LLAMA_DEFAULT_SEED) {
            copy.params.sampling.seed += (uint32_t)child_tasks.size() + 1;
        }

        child_tasks.push_back(std::move(copy));
        return copy;
    }

    // the task will be moved into queue, then onto slots
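One detail worth spelling out: if every child kept the user's fixed seed, all n samplers would produce identical completions, so each child offsets the seed by its position; LLAMA_DEFAULT_SEED (llama.cpp's "pick a random seed" sentinel) is skipped so those samplers still self-seed independently. A tiny self-contained illustration of the same arithmetic (hypothetical helper, mirroring the bump above):

    #include <cstdint>
    #include <cstdio>

    // child k gets base_seed + k + 1, matching the bump in create_child()
    static uint32_t child_seed(uint32_t base_seed, size_t k) {
        return base_seed + (uint32_t) k + 1;
    }

    int main() {
        const uint32_t base = 42; // user-supplied seed
        for (size_t k = 0; k < 3; k++) {
            // parent keeps 42; children get 43, 44, 45 -> reproducible but distinct
            printf("child %zu -> seed %u\n", k, child_seed(base, k));
        }
    }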
@@ -230,14 +182,6 @@ struct server_task {
    task_result_state create_state() const {
        return task_result_state(params.oaicompat_chat_syntax);
    }

    bool is_parent() const {
        return child_tasks.size() > 0;
    }

    bool is_child() const {
        return id_parent != -1;
    }
};

struct result_timings {
@@ -393,12 +337,6 @@ struct server_task_result_cmpl_partial : server_task_result {
    std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
    bool is_updated = false;

    // for Anthropic API: track if any reasoning content has been generated
    bool anthropic_has_reasoning = false;
    // streaming state copied from task_result_state for this chunk
    bool anthropic_thinking_block_started = false;
    bool anthropic_text_block_started = false;

    virtual bool is_stop() override {
        return false; // in stream mode, partial responses are not considered stop
    }
@@ -408,22 +346,6 @@ struct server_task_result_cmpl_partial : server_task_result {
    virtual void update(task_result_state & state) override {
        is_updated = true;
        state.update_chat_msg(content, true, oaicompat_msg_diffs);
        // track if the accumulated message has any reasoning content
        anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();

        // copy current state for use in to_json_anthropic() (reflects state BEFORE this chunk)
        anthropic_thinking_block_started = state.anthropic_thinking_block_started;
        anthropic_text_block_started = state.anthropic_text_block_started;

        // pre-compute state updates based on diffs (for next chunk)
        for (const auto & diff : oaicompat_msg_diffs) {
            if (!diff.reasoning_content_delta.empty() && !state.anthropic_thinking_block_started) {
                state.anthropic_thinking_block_started = true;
            }
            if (!diff.content_delta.empty() && !state.anthropic_text_block_started) {
                state.anthropic_text_block_started = true;
            }
        }
    }

    json to_json_non_oaicompat();
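The two flags copied here are the state before this chunk, which is exactly what to_json_anthropic() needs: the chunk that first carries reasoning (or text) content is also the one that must open the corresponding block with a content_block_start event. A condensed, hypothetical sketch of that consumer side (json is nlohmann::json, as elsewhere in the server; payloads abbreviated):

    // inside to_json_anthropic(), roughly:
    bool thinking_started = anthropic_thinking_block_started; // state BEFORE this chunk
    json events = json::array();
    for (const auto & diff : oaicompat_msg_diffs) {
        if (!diff.reasoning_content_delta.empty()) {
            if (!thinking_started) {
                // first reasoning delta of the stream: open the "thinking" block
                events.push_back({{"type", "content_block_start"}, {"index", 0},
                                  {"content_block", {{"type", "thinking"}, {"thinking", ""}}}});
                thinking_started = true;
            }
            events.push_back({{"type", "content_block_delta"}, {"index", 0},
                              {"delta", {{"type", "thinking_delta"}, {"thinking", diff.reasoning_content_delta}}}});
        }
        // text deltas follow the same pattern with anthropic_text_block_started
    }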