sync from b7516

2026-01-16 11:16:14 +08:00
parent f4ae4cc7da
commit 6ee41dd9e3
380 changed files with 18435 additions and 38806 deletions
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -4,6 +4,7 @@
 #include "server-task.h"
 #include "server-queue.h"

+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 #include "log.h"
@@ -15,6 +16,7 @@
 #include <cstddef>
 #include <cinttypes>
 #include <memory>
+#include <unordered_set>
 #include <filesystem>

 // fix problem with std::min and std::max
@@ -45,6 +47,26 @@ enum server_state {
    SERVER_STATE_READY,          // Server is ready and model is loaded
 };

+static bool server_task_type_need_embd(server_task_type task_type) { // tells whether a task of this type needs embeddings
+    switch (task_type) {
+        case SERVER_TASK_TYPE_EMBEDDING:
+        case SERVER_TASK_TYPE_RERANK:
+            return true; // embedding and rerank are the only types that qualify
+        default:
+            return false; // all remaining task types
+    }
+}
+
+static bool server_task_type_need_logits(server_task_type task_type) { // tells whether a task of this type needs logits
+    switch (task_type) {
+        case SERVER_TASK_TYPE_COMPLETION:
+        case SERVER_TASK_TYPE_INFILL:
+            return true; // completion and infill are the only types that qualify
+        default:
+            return false; // all remaining task types
+    }
+}
+
 struct server_slot {
    int id;

@@ -59,8 +81,6 @@ struct server_slot {

    common_speculative * spec = nullptr;

-    // TODO: move members that belong to the task (such as `generated_text`, `has_new_line`) to task_results_state
-    //       see https://github.com/ggml-org/llama.cpp/pull/18283#issuecomment-3710175837
    std::unique_ptr<const server_task> task;
    std::unique_ptr<const server_task> task_prev; // used for debugging

@@ -127,17 +147,6 @@ struct server_slot {
        return res;
    }

-    void prompt_clear(bool allow_processing) {
-        if (!allow_processing) {
-            GGML_ASSERT(!is_processing());
-        }
-
-        SLT_INF(*this, "clearing prompt with %zu tokens\n", prompt.tokens.size());
-
-        llama_memory_seq_rm(llama_get_memory(ctx), id, -1, -1);
-        prompt.tokens.clear();
-    }
-
    std::vector<common_adapter_lora_info> lora;
    int32_t alora_invocation_start = -1;

@@ -146,7 +155,7 @@ struct server_slot {

    common_sampler_ptr smpl;

-    llama_token  sampled; // in speculative mode, this is the last accepted token
+    llama_token sampled; // in speculative mode, this is the last accepted token
    llama_tokens drafted;

    // stats
@@ -158,7 +167,7 @@ struct server_slot {
    double t_prompt_processing; // ms
    double t_token_generation;  // ms

-    std::function<void(int /* slot_id */)> callback_on_release;
+    std::function<void(int)> callback_on_release;

    // Speculative decoding stats
    int32_t n_draft_total = 0;      // Total draft tokens generated
@@ -187,46 +196,30 @@ struct server_slot {
        n_draft_total = 0;
        n_draft_accepted = 0;

-        task_prev = std::move(task);
        task.reset();
-
-        llama_set_sampler(ctx, id, nullptr);
+        task_prev.reset();

        // clear alora start
        alora_invocation_start = -1;
    }

-    void init_sampler() const {
-        common_sampler_reset(smpl.get());
+    bool need_embd() const { // whether the task currently attached to this slot needs embeddings
+        GGML_ASSERT(task); // a task must be attached; task is dereferenced below

-        if (!task->need_sampling()) {
-            return;
-        }
+        return server_task_type_need_embd(task->type); // the decision depends only on the task type
+    }

-        const int64_t t_start = ggml_time_us();
+    bool need_logits() const { // whether the task currently attached to this slot needs logits
+        GGML_ASSERT(task); // a task must be attached; task is dereferenced below

-        int n_text = 0;
-
-        for (int i = 0; i < (int) prompt.tokens.size(); i++) {
-            const llama_token id = prompt.tokens[i];
-
-            if (id != LLAMA_TOKEN_NULL) {
-                common_sampler_accept(smpl.get(), id, false);
-                n_text++;
-            }
-        }
-
-        SLT_INF(*this, "init sampler, took %0.2f ms, tokens: text = %d, total = %d\n",
-                (ggml_time_us() - t_start) / 1000.0, n_text, (int) prompt.tokens.size());
+        return server_task_type_need_logits(task->type); // the decision depends only on the task type
    }

    // if the context does not have a memory module then all embeddings have to be computed within a single ubatch
    // also we cannot split if the pooling would require any past tokens
    bool can_split() const {
-        GGML_ASSERT(task);
-
        return
-            !task->need_embd() ||
+            !need_embd() || // need_embd() itself asserts that a task is attached
            (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
    }

@@ -267,13 +260,10 @@ struct server_slot {
            SLT_WRN(*this, "%s", "slot is not processing\n");
            return;
        }
-
        generated_token_probs.push_back(token);
    }

    int get_n_draft_max() const {
-        GGML_ASSERT(task);
-
        if (!can_speculate()) {
            return 0;
        }
@@ -298,23 +288,27 @@ struct server_slot {
        return n_draft_max;
    }

+    // note: a slot can also be either a parent or a child
+    bool is_parent() const { // true while the slot is processing a task that has child tasks
+        return is_processing() && task->n_children > 0; // is_processing() is evaluated first, so task is only dereferenced for a processing slot
+    }
+
+    bool is_child() const { // true while the slot is processing a task whose id_parent is non-negative
+        return is_processing() && task->id_parent >= 0; // is_processing() is evaluated first, so task is only dereferenced for a processing slot
+    }
+
    void release() {
        if (is_processing()) {
            GGML_ASSERT(task);

            SLT_INF(*this, "stop processing: n_tokens = %d, truncated = %d\n", prompt.n_tokens(), truncated);

-            t_last_used        =  ggml_time_us();
+            t_last_used = ggml_time_us();
            t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
-
            state = SLOT_STATE_IDLE;

-            // do not keep context of the child slots - the parent's context is enough
-            if (task->is_child()) {
-                prompt_clear(false);
-            }
-
-            reset();
+            task_prev = std::move(task);
+            task.reset();

            callback_on_release(id);
        }
@@ -433,22 +427,14 @@ struct server_slot {
    }

    void copy_state_to(server_slot & other) const {
-        GGML_ASSERT(state == SLOT_STATE_DONE_PROMPT);
-
-        llama_memory_seq_rm(llama_get_memory(ctx), other.id,     -1, -1);
-        llama_memory_seq_cp(llama_get_memory(ctx), id, other.id, -1, -1);
-
+        llama_memory_seq_rm(llama_get_memory(ctx), other.id, 0, -1); // clear the destination sequence first; presumably 0, -1 covers the whole range - confirm against llama.h
+        llama_memory_seq_cp(llama_get_memory(ctx), id, other.id, 0, -1); // then copy this slot's sequence memory into other.id over the same range
        other.n_decoded   = n_decoded;
        other.n_remaining = n_remaining;
        other.i_batch     = i_batch;
-
-        other.t_start_process_prompt    = t_start_process_prompt;
-        other.t_prompt_processing       = t_prompt_processing;
        other.n_prompt_tokens_cache     = n_prompt_tokens_cache;
        other.n_prompt_tokens_processed = n_prompt_tokens_processed;
-
        other.prompt = prompt.clone();
-        other.init_sampler();
    }
 };

@@ -761,8 +747,6 @@ private:
        }

        slots.clear();
-
-        // initialize slots
        for (int i = 0; i < params_base.n_parallel; i++) {
            server_slot slot;

@@ -794,8 +778,8 @@ private:

            SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx);

-            slot.callback_on_release = [this](int slot_id) {
-                queue_tasks.pop_deferred_task(slot_id);
+            slot.callback_on_release = [this](int) {
+                queue_tasks.pop_deferred_task();
            };

            slot.reset();
@@ -909,9 +893,9 @@ private:
        return true;
    }

-    server_slot * get_slot_by_id(int id_slot) {
+    server_slot * get_slot_by_id(int id) {
        for (server_slot & slot : slots) {
-            if (slot.id == id_slot) {
+            if (slot.id == id) {
                return &slot;
            }
        }
@@ -1011,7 +995,7 @@ private:
                ret->prompt_save(*prompt_cache);

                if (!ret->prompt_load(*prompt_cache, task.tokens)) {
-                    ret->prompt_clear(false);
+                    clear_slot(*ret);
                }

                prompt_cache->update();
@@ -1023,6 +1007,15 @@ private:
        return ret;
    }

+    void clear_slot(server_slot & slot) const { // discards what is cached for the slot: its sequence memory in ctx and its prompt token list
+        GGML_ASSERT(!slot.is_processing()); // NOTE(review): the call that replaces prompt_clear(true) after a failed llama_memory_seq_rm truncation used to skip this check - confirm the slot is never processing there
+
+        SLT_WRN(slot, "clearing slot with %zu tokens\n", slot.prompt.tokens.size());
+
+        llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1); // remove the memory held for sequence slot.id; presumably -1, -1 selects the whole range - confirm against llama.h
+        slot.prompt.tokens.clear(); // forget the tokens tracked for this slot
+    }
+
    // return true if at least one slot has been cleared
    // TODO: improve logic
    //       - smarter decision which slot to clear (LRU or longest prompt?)
@@ -1043,7 +1036,7 @@ private:
            if (slot.prompt.n_tokens() > 0) {
                SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size());

-                slot.prompt_clear(false);
+                clear_slot(slot);

                res = true;

@@ -1069,6 +1062,8 @@ private:
    }

    bool launch_slot_with_task(server_slot & slot, server_task && task) {
+        slot.reset();
+
        // process per-request lora adapters
        if (!task.params.lora.empty()) {
            auto task_loras = construct_lora_list(task.params.lora);
@@ -1142,7 +1137,7 @@ private:
        SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());

        // initialize samplers
-        if (task.need_sampling()) {
+        {
            slot.smpl.reset(common_sampler_init(model, task.params.sampling));

            if (slot.smpl == nullptr) {
@@ -1151,28 +1146,7 @@ private:
                return false;
            }

-            const bool need_logits = task.params.sampling.n_probs > 0;
-
-            bool backend_sampling = true;
-
-            backend_sampling &= task.params.sampling.backend_sampling;
-
-            // TODO: speculative decoding requires multiple samples per batch - not supported yet
-            backend_sampling &= !(slot.ctx_dft && task.params.speculative.n_max > 0);
-
-            // TODO: getting post/pre sampling logits is not yet supported with backend sampling
-            backend_sampling &= !need_logits;
-
-            // TODO: tmp until backend sampling is fully implemented
-            if (backend_sampling) {
-                llama_set_sampler(ctx, slot.id, common_sampler_get(slot.smpl.get()));
-            } else {
-                llama_set_sampler(ctx, slot.id, nullptr);
-            }
-
            SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str());
-        } else {
-            slot.smpl.reset();
        }

        // initialize draft batch
@@ -1185,11 +1159,12 @@ private:

        slot.task = std::make_unique<const server_task>(std::move(task));

-        slot.state = slot.task->is_child()
+        slot.state = slot.is_child()
            ? SLOT_STATE_WAIT_OTHER // wait for the parent to process prompt
            : SLOT_STATE_STARTED;

-        SLT_INF(slot, "processing task, is_child = %d\n", slot.task->is_child());
+        SLT_INF(slot, "%s", "processing task\n");
+
        return true;
    }

@@ -1509,9 +1484,9 @@ private:
        res->n_tokens  = slot.task->n_tokens();
        res->res_type  = slot.task->params.res_type;

-        const int n_embd_out = llama_model_n_embd_out(model);
+        const int n_embd = llama_model_n_embd(model);

-        std::vector<float> embd_res(n_embd_out, 0.0f);
+        std::vector<float> embd_res(n_embd, 0.0f);

        for (int i = 0; i < batch.n_tokens; ++i) {
            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
@@ -1528,18 +1503,18 @@ private:
            if (embd == nullptr) {
                SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);

-                res->embedding.push_back(std::vector<float>(n_embd_out, 0.0f));
+                res->embedding.push_back(std::vector<float>(n_embd, 0.0f));
                continue;
            }

            // normalize only when there is pooling
            if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) {
-                common_embd_normalize(embd, embd_res.data(), n_embd_out, slot.task->params.embd_normalize);
+                common_embd_normalize(embd, embd_res.data(), n_embd, slot.task->params.embd_normalize);
                res->embedding.push_back(embd_res);
                break;
            }

-            res->embedding.emplace_back(embd, embd + n_embd_out);
+            res->embedding.emplace_back(embd, embd + n_embd);
        }

        SLT_DBG(slot, "%s", "sending embeddings\n");
@@ -1584,7 +1559,9 @@ private:

    // tokenize the input if it's set by CLI, return false on error
    bool tokenize_cli_input(server_task & task) {
-        GGML_ASSERT(task.cli_input != nullptr);
+        if (task.cli_input == nullptr) {
+            return true; // nothing to do
+        }
        try {
            auto & opt = oai_parser_opt;
            common_chat_templates_inputs inputs;
@@ -1618,64 +1595,6 @@ private:
        return true;
    }

-    std::vector<server_slot *> get_free_slots(size_t n_slots_needed, int exclude_id_slot) {
-        std::vector<server_slot *> free_slots;
-        for (auto & slot : slots) {
-            if (!slot.is_processing() && slot.id != exclude_id_slot) {
-                free_slots.push_back(&slot);
-            }
-            if (free_slots.size() >= n_slots_needed) {
-                break;
-            }
-        }
-        return free_slots;
-    }
-
-    // launch multiple slots for parent + child tasks
-    bool launch_slots_with_parent_task(server_slot & parent_slot, std::vector<server_slot *> & child_slots, server_task && parent_task) {
-        GGML_ASSERT(!parent_slot.is_processing());
-        GGML_ASSERT(parent_task.is_parent());
-        GGML_ASSERT(child_slots.size() == parent_task.child_tasks.size());
-
-        int id_parent = parent_task.id;
-
-        SRV_INF("launching slots for parent task id_task = %d with %zu child tasks\n", id_parent, parent_task.child_tasks.size());
-
-        // to be called in case of failure to release all launched slots
-        auto release_slots = [this, id_parent]() {
-            for (auto & slot : slots) {
-                if (slot.is_processing() && (
-                        slot.task->id == id_parent ||
-                        slot.task->id_parent == id_parent
-                )) {
-                    slot.release();
-                }
-            }
-        };
-
-        // launch all child tasks first
-        size_t idx = 0;
-        GGML_ASSERT(child_slots.size() == parent_task.child_tasks.size());
-        for (auto * slot : child_slots) {
-            int id_child = parent_task.child_tasks[idx].id;
-            if (!launch_slot_with_task(*slot, std::move(parent_task.child_tasks[idx]))) {
-                SRV_ERR("failed to launch slot with child task, id_task = %d\n", id_child);
-                release_slots();
-                return false;
-            }
-            idx++;
-        }
-
-        // finally, launch the parent task
-        if (!launch_slot_with_task(parent_slot, std::move(parent_task))) {
-            SRV_ERR("failed to launch slot with task, id_task = %d\n", id_parent);
-            release_slots();
-            return false;
-        }
-
-        return true;
-    }
-
    void process_single_task(server_task && task) {
        switch (task.type) {
            case SERVER_TASK_TYPE_COMPLETION:
@@ -1683,55 +1602,31 @@ private:
            case SERVER_TASK_TYPE_EMBEDDING:
            case SERVER_TASK_TYPE_RERANK:
                {
-                    // special case: if input is provided via CLI, tokenize it first
-                    // otherwise, no need to tokenize as it's already done inside the HTTP thread
-                    if (task.cli_input != nullptr) {
-                        if (!tokenize_cli_input(task)) {
-                            break;
-                        }
+                    if (!tokenize_cli_input(task)) {
+                        break;
                    }

                    const int id_slot = task.id_slot;
-                    const int id_task = task.id;

-                    server_slot * slot = id_slot != -1
-                                            ? get_slot_by_id(id_slot)
-                                            : get_available_slot(task);
-
-                    //
-                    // slot scheduling logic
-                    //
+                    server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);

                    if (slot == nullptr) {
                        // if no slot is available, we defer this task for processing later
-                        SRV_DBG("no slot is available, defer task, id_task = %d\n", id_task);
+                        SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
                        queue_tasks.defer(std::move(task));
                        break;
                    }

                    if (slot->is_processing()) {
                        // if requested slot is unavailable, we defer this task for processing later
-                        SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", id_task);
+                        SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
                        queue_tasks.defer(std::move(task));
                        break;
                    }

-                    if (task.is_parent()) {
-                        // try getting free slots for all child tasks
-                        size_t n_child_tasks = task.child_tasks.size();
-                        std::vector<server_slot *> child_slots = get_free_slots(n_child_tasks, slot->id);
-                        if (child_slots.size() < n_child_tasks) {
-                            SRV_DBG("not enough free slots for child tasks, n_free = %zu, n_children = %zu, defer task, id_task = %d\n", child_slots.size(), n_child_tasks, id_task);
-                            queue_tasks.defer(std::move(task));
-                            break;
-                        }
-                        if (!launch_slots_with_parent_task(*slot, child_slots, std::move(task))) {
-                            SRV_ERR("failed to launch slot with parent task, id_task = %d\n", id_task);
-                            break; // drop the task
-                        }
-                    } else if (!launch_slot_with_task(*slot, std::move(task))) {
-                        SRV_ERR("failed to launch slot with task, id_task = %d\n", id_task);
-                        break; // drop the task
+                    if (!launch_slot_with_task(*slot, std::move(task))) {
+                        SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
+                        break;
                    }
                } break;
            case SERVER_TASK_TYPE_CANCEL:
@@ -1905,7 +1800,7 @@ private:
                    // Erase token cache
                    const size_t n_erased = slot->prompt.tokens.size();

-                    slot->prompt_clear(false);
+                    clear_slot(*slot);

                    auto res = std::make_unique<server_task_result_slot_erase>();
                    res->id       = task.id;
@@ -2000,7 +1895,7 @@ private:
                    GGML_ABORT("not supported by multimodal");
                }

-                if (slot.task->is_parent() || slot.task->is_child()) {
+                if (slot.is_parent() || slot.is_child()) {
                    send_error(slot, "context shift cannot be used for shared prompt", ERROR_TYPE_SERVER);
                    slot.release();
                    continue;
@@ -2139,12 +2034,6 @@ private:
                    continue;
                }

-                // check if this is a child slot
-                if (slot.state == SLOT_STATE_WAIT_OTHER) {
-                    SLT_DBG(slot, "%s", "waiting for parent slot to complete\n");
-                    continue;
-                }
-
                // this slot still has a prompt to be processed
                if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
                    const auto & input_tokens = slot.task->tokens;
@@ -2187,7 +2076,7 @@ private:
                        }

                        // TODO: support memory-less logits computation
-                        if (slot.task->need_logits() && !llama_get_memory(ctx)) {
+                        if (slot.need_logits() && !llama_get_memory(ctx)) {
                            send_error(slot, "the current context does not logits computation. skipping", ERROR_TYPE_SERVER);
                            slot.release();
                            continue;
@@ -2424,12 +2313,6 @@ private:
                        slot.n_prompt_tokens_processed = 0;

                        slot.prompt.tokens.keep_first(n_past);
-
-                        // send initial 0% progress update if needed
-                        // this is to signal the client that the request has started processing
-                        if (slot.task->params.stream && slot.task->params.return_progress) {
-                            send_partial_response(slot, {}, true);
-                        }
                    }

                    if (!slot.can_split()) {
@@ -2447,7 +2330,7 @@ private:
                    if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
                        SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);

-                        slot.prompt_clear(true);
+                        clear_slot(slot);

                        // there is no common part left
                        slot.n_prompt_tokens_cache = 0;
@@ -2526,7 +2409,7 @@ private:
                            cur_tok,
                            slot.prompt.tokens.pos_next(),
                            { slot.id },
-                            slot.task->need_embd());
+                            slot.need_embd());
                        slot.prompt.tokens.push_back(cur_tok);

                        slot.n_prompt_tokens_processed++;
@@ -2547,6 +2430,16 @@ private:

                        GGML_ASSERT(batch.n_tokens > 0);

+                        common_sampler_reset(slot.smpl.get());
+
+                        // Process all prompt tokens through sampler system
+                        for (int i = 0; i < slot.task->n_tokens(); ++i) {
+                            llama_token id = input_tokens[i];
+                            if (id != LLAMA_TOKEN_NULL) {
+                                common_sampler_accept(slot.smpl.get(), id, false);
+                            }
+                        }
+
                        // extract the logits only for the last token
                        batch.logits[batch.n_tokens - 1] = true;

@@ -2555,8 +2448,6 @@ private:

                        SLT_INF(slot, "prompt done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens);

-                        slot.init_sampler();
-
                        const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
                        const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id);

@@ -2603,6 +2494,11 @@ private:
            }
        }

+        if (batch.n_tokens == 0) {
+            SRV_WRN("%s", "no tokens to decode\n");
+            return;
+        }
+
        SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);

        if (slot_batched) {
@@ -2616,11 +2512,7 @@ private:
                slot_batched->lora[alora_disabled_id].scale = alora_scale;
            }

-            llama_set_embeddings(ctx, slot_batched->task->need_embd());
-        }
-
-        if (batch.n_tokens == 0) {
-            SRV_WRN("%s", "no tokens to decode\n");
+            llama_set_embeddings(ctx, slot_batched->need_embd());
        }

        int32_t i_next = 0;
@@ -2674,7 +2566,7 @@ private:

                                // note: it's complicated to keep track of how much of the current batch has been
                                //       processed before the error occurred, so we simply clear the entire context
-                                slot.prompt_clear(false);
+                                clear_slot(slot);
                            }
                        }

@@ -2698,30 +2590,31 @@ private:
            // on successful decode, restore the original batch size
            n_batch = llama_n_batch(ctx);

-            // handle `n_cmpl > 1` tasks - when the main prompt is processed, activate all child tasks too
+            // technically, measuring the time here excludes the sampling time for the last batch
+            // but on the other hand, we don't want to do too many system calls to measure the time, so it's ok
+            const int64_t t_current = ggml_time_us();
+
            for (auto & slot : slots) {
-                if (slot.state == SLOT_STATE_DONE_PROMPT && slot.task->is_parent()) {
-                    std::vector<server_slot *> children;
+                // may need to copy state to other slots
+                if (slot.state == SLOT_STATE_DONE_PROMPT && slot.is_parent()) {
+                    std::vector<server_slot *> child_slots;
                    for (auto & other : slots) {
                        if (other.state == SLOT_STATE_WAIT_OTHER && slot.task->id == other.task->id_parent) {
-                            children.push_back(&other);
+                            child_slots.push_back(&other);
                        }
                    }

-                    // all children slots should already launched by launch_slots_with_parent_task()
-                    // copy state to the child slots
-                    for (auto & child : children) {
-                        SLT_INF(slot, " - copying state to child %d\n", child->id);
-
-                        GGML_ASSERT(child->state == SLOT_STATE_WAIT_OTHER);
-
-                        slot.copy_state_to(*child);
-                        child->state = SLOT_STATE_DONE_PROMPT;
+                    // we can only proceed if all child slots are having the correct tasks
+                    if (child_slots.size() == slot.task->n_children) {
+                        // copy state to the child slots
+                        for (auto & child : child_slots) {
+                            SLT_INF(slot, "copying state to child %d\n", child->id);
+                            slot.copy_state_to(*child);
+                            child->state = SLOT_STATE_DONE_PROMPT;
+                        }
                    }
                }
-            }

-            for (auto & slot : slots) {
                // optionally send prompt processing progress
                if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_DONE_PROMPT) {
                    if (slot.task->params.stream && slot.task->params.return_progress) {
@@ -2749,8 +2642,6 @@ private:
                        continue; // continue loop of slots
                    }

-                    GGML_ASSERT(slot.task->need_sampling());
-
                    // prompt evaluated for next-token prediction
                    slot.state = SLOT_STATE_GENERATING;
                } else if (slot.state != SLOT_STATE_GENERATING) {
@@ -2769,9 +2660,6 @@ private:

                common_sampler_accept(slot.smpl.get(), id, true);

-                // here we have synchronized the llama_context (due to the sampling above), so we can do time measurement
-                const int64_t t_current = ggml_time_us();
-
                slot.n_decoded += 1;

                if (slot.n_decoded == 1) {
@@ -2808,15 +2696,13 @@ private:
                    continue;
                }

-                const size_t n_draft = slot.drafted.size();
+                size_t n_draft = slot.drafted.size();

                // the accepted tokens from the speculation
                const auto ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted);
                slot.i_batch_dft.clear();
                slot.drafted.clear();

-                const int64_t t_current = ggml_time_us();
-
                slot.n_decoded += ids.size();

                slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
@@ -2898,12 +2784,6 @@ server_response_reader server_context::get_response_reader() {

 server_context_meta server_context::get_meta() const {
    auto tool_use_src = common_chat_templates_source(impl->chat_templates.get(), "tool_use");
-
-    auto bos_id = llama_vocab_bos(impl->vocab);
-    auto eos_id = llama_vocab_eos(impl->vocab);
-    auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : "";
-    auto eos_token_str = eos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, eos_id, true) : "";
-
    return server_context_meta {
        /* build_info             */ build_info,
        /* model_name             */ impl->model_name,
@@ -2918,8 +2798,8 @@ server_context_meta server_context::get_meta() const {
        /* chat_template          */ common_chat_templates_source(impl->chat_templates.get()),
        /* chat_template_tool_use */ tool_use_src ? tool_use_src : "",

-        /* bos_token_str          */ bos_token_str,
-        /* eos_token_str          */ eos_token_str,
+        /* bos_token_str          */ common_token_to_piece(impl->ctx, llama_vocab_bos(impl->vocab), true),
+        /* eos_token_str          */ common_token_to_piece(impl->ctx, llama_vocab_eos(impl->vocab), true),
        /* fim_pre_token          */ llama_vocab_fim_pre(impl->vocab),
        /* fim_sub_token          */ llama_vocab_fim_suf(impl->vocab),
        /* fim_mid_token          */ llama_vocab_fim_mid(impl->vocab),
@@ -2992,9 +2872,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
            // Everything else, including multimodal completions.
            inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
        }
-
-        // tasks.reserve(inputs.size()); // TODO: this is inaccurate due to child tasks
-
+        tasks.reserve(inputs.size());
        for (size_t i = 0; i < inputs.size(); i++) {
            server_task task = server_task(type);

@@ -3013,11 +2891,13 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
            task.params.oaicompat_cmpl_id = completion_id;
            task.params.oaicompat_model   = meta->model_name;

-            // prepare child tasks
            if (task.params.n_cmpl > 1) {
-                int n_children = task.params.n_cmpl - 1;
-                for (int j = 0; j < n_children; j++) {
-                    task.add_child(task.id, rd.get_new_id());
+                task.n_children = task.params.n_cmpl - 1;
+                for (size_t j = 0; j < task.n_children; j++) {
+                    server_task child = task.create_child(
+                        task.id,
+                        rd.get_new_id());
+                    tasks.push_back(std::move(child));
                }
            }

@@ -3066,22 +2946,19 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
        // in streaming mode, the first error must be treated as non-stream response
        // this is to match the OAI API behavior
        // ref: https://github.com/ggml-org/llama.cpp/pull/16486#discussion_r2419657309
-        auto first_result = rd.next(req.should_stop);
+        server_task_result_ptr first_result = rd.next(req.should_stop);
        if (first_result == nullptr) {
-            GGML_ASSERT(req.should_stop());
            return res; // connection is closed
-        }
-
-        if (first_result->is_error()) {
+        } else if (first_result->is_error()) {
            res->error(first_result->to_json());
            return res;
+        } else {
+            GGML_ASSERT(
+                dynamic_cast<server_task_result_cmpl_partial*>(first_result.get()) != nullptr
+                || dynamic_cast<server_task_result_cmpl_final*>(first_result.get()) != nullptr
+            );
        }

-        GGML_ASSERT(
-            dynamic_cast<server_task_result_cmpl_partial*>(first_result.get()) != nullptr ||
-            dynamic_cast<server_task_result_cmpl_final*>  (first_result.get()) != nullptr
-        );
-
        // next responses are streamed
        // to be sent immediately
        json first_result_json = first_result->to_json();
@@ -3137,7 +3014,6 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
                auto result = rd.next(req.should_stop);
                if (result == nullptr) {
                    SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
-                    GGML_ASSERT(req.should_stop());
                    return false; // should_stop condition met
                }

@@ -3221,11 +3097,6 @@ void server_routes::init_routes() {

        // get the result
        auto result = res->rd.next(req.should_stop);
-        if (!result) {
-            // connection was closed
-            GGML_ASSERT(req.should_stop());
-            return res;
-        }

        if (result->is_error()) {
            res->error(result->to_json());
@@ -3326,11 +3197,6 @@ void server_routes::init_routes() {

        // get the result
        auto result = res->rd.next(req.should_stop);
-        if (!result) {
-            // connection was closed
-            GGML_ASSERT(req.should_stop());
-            return res;
-        }

        if (result->is_error()) {
            res->error(result->to_json());
@@ -3837,12 +3703,7 @@ void server_routes::init_routes() {
        }

        // get the result
-        auto result = rd.next(req.should_stop);
-        if (!result) {
-            // connection was closed
-            GGML_ASSERT(req.should_stop());
-            return res;
-        }
+        server_task_result_ptr result = rd.next(req.should_stop);

        if (result->is_error()) {
            res->error(result->to_json());
@@ -3871,12 +3732,7 @@ void server_routes::init_routes() {
        }

        // get the result
-        auto result = rd.next(req.should_stop);
-        if (!result) {
-            // connection was closed
-            GGML_ASSERT(req.should_stop());
-            return res;
-        }
+        server_task_result_ptr result = rd.next(req.should_stop);

        if (result->is_error()) {
            res->error(result->to_json());
@@ -3909,12 +3765,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_save(const ser
        rd.post_task(std::move(task));
    }

-    auto result = rd.next(req.should_stop);
-    if (!result) {
-        // connection was closed
-        GGML_ASSERT(req.should_stop());
-        return res;
-    }
+    server_task_result_ptr result = rd.next(req.should_stop);

    if (result->is_error()) {
        res->error(result->to_json());
@@ -3945,12 +3796,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_restore(const
        rd.post_task(std::move(task));
    }

-    auto result = rd.next(req.should_stop);
-    if (!result) {
-        // connection was closed
-        GGML_ASSERT(req.should_stop());
-        return res;
-    }
+    server_task_result_ptr result = rd.next(req.should_stop);

    if (result->is_error()) {
        res->error(result->to_json());
@@ -3972,12 +3818,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_erase(const se
        rd.post_task(std::move(task));
    }

-    auto result = rd.next(req.should_stop);
-    if (!result) {
-        // connection was closed
-        GGML_ASSERT(req.should_stop());
-        return res;
-    }
+    server_task_result_ptr result = rd.next(req.should_stop);

    if (result->is_error()) {
        res->error(result->to_json());