sync from b7516

This commit is contained in:
2026-01-16 11:16:14 +08:00
parent f4ae4cc7da
commit 6ee41dd9e3
380 changed files with 18435 additions and 38806 deletions

View File

@@ -4,6 +4,7 @@
#include "server-task.h"
#include "server-queue.h"
#include "arg.h"
#include "common.h"
#include "llama.h"
#include "log.h"
@@ -15,6 +16,7 @@
#include <cstddef>
#include <cinttypes>
#include <memory>
#include <unordered_set>
#include <filesystem>
// fix problem with std::min and std::max
@@ -45,6 +47,26 @@ enum server_state {
SERVER_STATE_READY, // Server is ready and model is loaded
};
// Tells whether a task of the given type is one of the embedding-style tasks.
// Returns true for SERVER_TASK_TYPE_EMBEDDING and SERVER_TASK_TYPE_RERANK,
// false for every other task type.
static bool server_task_type_need_embd(server_task_type task_type) {
    const bool is_embedding = task_type == SERVER_TASK_TYPE_EMBEDDING;
    const bool is_rerank    = task_type == SERVER_TASK_TYPE_RERANK;

    return is_embedding || is_rerank;
}
// Tells whether a task of the given type is one of the text-generation tasks.
// Returns true for SERVER_TASK_TYPE_COMPLETION and SERVER_TASK_TYPE_INFILL,
// false for every other task type.
static bool server_task_type_need_logits(server_task_type task_type) {
    const bool is_completion = task_type == SERVER_TASK_TYPE_COMPLETION;
    const bool is_infill     = task_type == SERVER_TASK_TYPE_INFILL;

    return is_completion || is_infill;
}
struct server_slot {
int id;
@@ -59,8 +81,6 @@ struct server_slot {
common_speculative * spec = nullptr;
// TODO: move members that belong to the task (such as `generated_text`, `has_new_line`) to task_results_state
// see https://github.com/ggml-org/llama.cpp/pull/18283#issuecomment-3710175837
std::unique_ptr<const server_task> task;
std::unique_ptr<const server_task> task_prev; // used for debugging
@@ -127,17 +147,6 @@ struct server_slot {
return res;
}
// Drops the prompt state held by this slot: removes the slot's sequence from the
// context memory and empties the tracked prompt tokens.
//
// allow_processing - when false, the slot must not be processing (enforced by an assert);
//                    when true, the check is skipped.
void prompt_clear(bool allow_processing) {
    if (!allow_processing) {
        GGML_ASSERT(!is_processing());
    }

    // log the size before anything is removed
    const auto n_cached = prompt.tokens.size();
    SLT_INF(*this, "clearing prompt with %zu tokens\n", n_cached);

    // NOTE(review): the (-1, -1) position range presumably selects the whole sequence - confirm against llama.h
    const auto mem = llama_get_memory(ctx);
    llama_memory_seq_rm(mem, id, -1, -1);

    prompt.tokens.clear();
}
std::vector<common_adapter_lora_info> lora;
int32_t alora_invocation_start = -1;
@@ -146,7 +155,7 @@ struct server_slot {
common_sampler_ptr smpl;
llama_token sampled; // in speculative mode, this is the last accepted token
llama_token sampled; // in speculative mode, this is the last accepted token
llama_tokens drafted;
// stats
@@ -158,7 +167,7 @@ struct server_slot {
double t_prompt_processing; // ms
double t_token_generation; // ms
std::function<void(int /* slot_id */)> callback_on_release;
std::function<void(int)> callback_on_release;
// Speculative decoding stats
int32_t n_draft_total = 0; // Total draft tokens generated
@@ -187,46 +196,30 @@ struct server_slot {
n_draft_total = 0;
n_draft_accepted = 0;
task_prev = std::move(task);
task.reset();
llama_set_sampler(ctx, id, nullptr);
task_prev.reset();
// clear alora start
alora_invocation_start = -1;
}
void init_sampler() const {
common_sampler_reset(smpl.get());
bool need_embd() const {
GGML_ASSERT(task);
if (!task->need_sampling()) {
return;
}
return server_task_type_need_embd(task->type);
}
const int64_t t_start = ggml_time_us();
bool need_logits() const {
GGML_ASSERT(task);
int n_text = 0;
for (int i = 0; i < (int) prompt.tokens.size(); i++) {
const llama_token id = prompt.tokens[i];
if (id != LLAMA_TOKEN_NULL) {
common_sampler_accept(smpl.get(), id, false);
n_text++;
}
}
SLT_INF(*this, "init sampler, took %0.2f ms, tokens: text = %d, total = %d\n",
(ggml_time_us() - t_start) / 1000.0, n_text, (int) prompt.tokens.size());
return server_task_type_need_logits(task->type);
}
// if the context does not have a memory module then all embeddings have to be computed within a single ubatch
// also we cannot split if the pooling would require any past tokens
bool can_split() const {
GGML_ASSERT(task);
return
!task->need_embd() ||
!need_embd() ||
(llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
}
@@ -267,13 +260,10 @@ struct server_slot {
SLT_WRN(*this, "%s", "slot is not processing\n");
return;
}
generated_token_probs.push_back(token);
}
int get_n_draft_max() const {
GGML_ASSERT(task);
if (!can_speculate()) {
return 0;
}
@@ -298,23 +288,27 @@ struct server_slot {
return n_draft_max;
}
// note: a slot can also be either a parent or a child
// A slot is reported as a parent only while it is processing a task
// whose `n_children` is greater than zero.
bool is_parent() const {
    if (!is_processing()) {
        return false;
    }

    return task->n_children > 0;
}
// A slot is reported as a child only while it is processing a task
// whose `id_parent` is non-negative.
bool is_child() const {
    if (!is_processing()) {
        return false;
    }

    return task->id_parent >= 0;
}
void release() {
if (is_processing()) {
GGML_ASSERT(task);
SLT_INF(*this, "stop processing: n_tokens = %d, truncated = %d\n", prompt.n_tokens(), truncated);
t_last_used = ggml_time_us();
t_last_used = ggml_time_us();
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
state = SLOT_STATE_IDLE;
// do not keep context of the child slots - the parent's context is enough
if (task->is_child()) {
prompt_clear(false);
}
reset();
task_prev = std::move(task);
task.reset();
callback_on_release(id);
}
@@ -433,22 +427,14 @@ struct server_slot {
}
void copy_state_to(server_slot & other) const {
GGML_ASSERT(state == SLOT_STATE_DONE_PROMPT);
llama_memory_seq_rm(llama_get_memory(ctx), other.id, -1, -1);
llama_memory_seq_cp(llama_get_memory(ctx), id, other.id, -1, -1);
llama_memory_seq_rm(llama_get_memory(ctx), other.id, 0, -1);
llama_memory_seq_cp(llama_get_memory(ctx), id, other.id, 0, -1);
other.n_decoded = n_decoded;
other.n_remaining = n_remaining;
other.i_batch = i_batch;
other.t_start_process_prompt = t_start_process_prompt;
other.t_prompt_processing = t_prompt_processing;
other.n_prompt_tokens_cache = n_prompt_tokens_cache;
other.n_prompt_tokens_processed = n_prompt_tokens_processed;
other.prompt = prompt.clone();
other.init_sampler();
}
};
@@ -761,8 +747,6 @@ private:
}
slots.clear();
// initialize slots
for (int i = 0; i < params_base.n_parallel; i++) {
server_slot slot;
@@ -794,8 +778,8 @@ private:
SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx);
slot.callback_on_release = [this](int slot_id) {
queue_tasks.pop_deferred_task(slot_id);
slot.callback_on_release = [this](int) {
queue_tasks.pop_deferred_task();
};
slot.reset();
@@ -909,9 +893,9 @@ private:
return true;
}
server_slot * get_slot_by_id(int id_slot) {
server_slot * get_slot_by_id(int id) {
for (server_slot & slot : slots) {
if (slot.id == id_slot) {
if (slot.id == id) {
return &slot;
}
}
@@ -1011,7 +995,7 @@ private:
ret->prompt_save(*prompt_cache);
if (!ret->prompt_load(*prompt_cache, task.tokens)) {
ret->prompt_clear(false);
clear_slot(*ret);
}
prompt_cache->update();
@@ -1023,6 +1007,15 @@ private:
return ret;
}
// Clears the given slot: removes the slot's sequence from the context memory
// and empties the slot's tracked prompt tokens.
//
// The slot must not be processing - this is enforced by an assert, so calling
// this on a slot that is still processing aborts.
void clear_slot(server_slot & slot) const {
    GGML_ASSERT(!slot.is_processing());

    // log the size before anything is removed
    const auto n_cached = slot.prompt.tokens.size();
    SLT_WRN(slot, "clearing slot with %zu tokens\n", n_cached);

    // NOTE(review): the (-1, -1) position range presumably selects the whole sequence - confirm against llama.h
    const auto mem = llama_get_memory(ctx);
    llama_memory_seq_rm(mem, slot.id, -1, -1);

    slot.prompt.tokens.clear();
}
// return true if at least one slot has been cleared
// TODO: improve logic
// - smarter decision which slot to clear (LRU or longest prompt?)
@@ -1043,7 +1036,7 @@ private:
if (slot.prompt.n_tokens() > 0) {
SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size());
slot.prompt_clear(false);
clear_slot(slot);
res = true;
@@ -1069,6 +1062,8 @@ private:
}
bool launch_slot_with_task(server_slot & slot, server_task && task) {
slot.reset();
// process per-request lora adapters
if (!task.params.lora.empty()) {
auto task_loras = construct_lora_list(task.params.lora);
@@ -1142,7 +1137,7 @@ private:
SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());
// initialize samplers
if (task.need_sampling()) {
{
slot.smpl.reset(common_sampler_init(model, task.params.sampling));
if (slot.smpl == nullptr) {
@@ -1151,28 +1146,7 @@ private:
return false;
}
const bool need_logits = task.params.sampling.n_probs > 0;
bool backend_sampling = true;
backend_sampling &= task.params.sampling.backend_sampling;
// TODO: speculative decoding requires multiple samples per batch - not supported yet
backend_sampling &= !(slot.ctx_dft && task.params.speculative.n_max > 0);
// TODO: getting post/pre sampling logits is not yet supported with backend sampling
backend_sampling &= !need_logits;
// TODO: tmp until backend sampling is fully implemented
if (backend_sampling) {
llama_set_sampler(ctx, slot.id, common_sampler_get(slot.smpl.get()));
} else {
llama_set_sampler(ctx, slot.id, nullptr);
}
SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str());
} else {
slot.smpl.reset();
}
// initialize draft batch
@@ -1185,11 +1159,12 @@ private:
slot.task = std::make_unique<const server_task>(std::move(task));
slot.state = slot.task->is_child()
slot.state = slot.is_child()
? SLOT_STATE_WAIT_OTHER // wait for the parent to process prompt
: SLOT_STATE_STARTED;
SLT_INF(slot, "processing task, is_child = %d\n", slot.task->is_child());
SLT_INF(slot, "%s", "processing task\n");
return true;
}
@@ -1509,9 +1484,9 @@ private:
res->n_tokens = slot.task->n_tokens();
res->res_type = slot.task->params.res_type;
const int n_embd_out = llama_model_n_embd_out(model);
const int n_embd = llama_model_n_embd(model);
std::vector<float> embd_res(n_embd_out, 0.0f);
std::vector<float> embd_res(n_embd, 0.0f);
for (int i = 0; i < batch.n_tokens; ++i) {
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
@@ -1528,18 +1503,18 @@ private:
if (embd == nullptr) {
SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
res->embedding.push_back(std::vector<float>(n_embd_out, 0.0f));
res->embedding.push_back(std::vector<float>(n_embd, 0.0f));
continue;
}
// normalize only when there is pooling
if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) {
common_embd_normalize(embd, embd_res.data(), n_embd_out, slot.task->params.embd_normalize);
common_embd_normalize(embd, embd_res.data(), n_embd, slot.task->params.embd_normalize);
res->embedding.push_back(embd_res);
break;
}
res->embedding.emplace_back(embd, embd + n_embd_out);
res->embedding.emplace_back(embd, embd + n_embd);
}
SLT_DBG(slot, "%s", "sending embeddings\n");
@@ -1584,7 +1559,9 @@ private:
// tokenize the input if it's set by CLI, return false on error
bool tokenize_cli_input(server_task & task) {
GGML_ASSERT(task.cli_input != nullptr);
if (task.cli_input == nullptr) {
return true; // nothing to do
}
try {
auto & opt = oai_parser_opt;
common_chat_templates_inputs inputs;
@@ -1618,64 +1595,6 @@ private:
return true;
}
// Collects pointers to slots that are currently not processing.
//
// n_slots_needed  - maximum number of slots to return; the scan stops as soon as
//                   this many have been collected (0 yields an empty result)
// exclude_id_slot - id of a slot that must never be part of the result
//
// Returns at most `n_slots_needed` slots, in the order they appear in `slots`.
// The result can be shorter when fewer free slots exist, so callers have to
// compare its size against what they asked for.
std::vector<server_slot *> get_free_slots(size_t n_slots_needed, int exclude_id_slot) {
    std::vector<server_slot *> free_slots;

    for (auto & slot : slots) {
        // check the quota before adding, so that a request for zero slots returns nothing
        if (free_slots.size() >= n_slots_needed) {
            break;
        }
        if (!slot.is_processing() && slot.id != exclude_id_slot) {
            free_slots.push_back(&slot);
        }
    }

    return free_slots;
}
// launch multiple slots for parent + child tasks
//
// The child tasks are launched first: child_slots[i] receives parent_task.child_tasks[i].
// The parent task is launched last, on parent_slot.
//
// Preconditions (asserted): parent_slot is not processing, parent_task is a parent task,
// and there is exactly one child slot per child task.
//
// Returns true when every launch succeeded. On the first failure, every processing slot
// whose task is the parent or one of its children is released and false is returned.
bool launch_slots_with_parent_task(server_slot & parent_slot, std::vector<server_slot *> & child_slots, server_task && parent_task) {
    GGML_ASSERT(!parent_slot.is_processing());
    GGML_ASSERT(parent_task.is_parent());
    GGML_ASSERT(child_slots.size() == parent_task.child_tasks.size());

    // remember the id now - parent_task is moved from at the end
    const int id_parent = parent_task.id;

    SRV_INF("launching slots for parent task id_task = %d with %zu child tasks\n", id_parent, parent_task.child_tasks.size());

    // to be called in case of failure to release all launched slots
    const auto release_slots = [this, id_parent]() {
        for (auto & slot : slots) {
            if (!slot.is_processing()) {
                continue;
            }
            const bool in_family = slot.task->id == id_parent || slot.task->id_parent == id_parent;
            if (in_family) {
                slot.release();
            }
        }
    };

    // launch all child tasks first
    const size_t n_child_slots = child_slots.size();
    for (size_t i = 0; i < n_child_slots; ++i) {
        auto & child_task = parent_task.child_tasks[i];

        // read the id before the task is moved into the slot
        const int id_child = child_task.id;

        const bool launched = launch_slot_with_task(*child_slots[i], std::move(child_task));
        if (!launched) {
            SRV_ERR("failed to launch slot with child task, id_task = %d\n", id_child);
            release_slots();
            return false;
        }
    }

    // finally, launch the parent task
    const bool parent_launched = launch_slot_with_task(parent_slot, std::move(parent_task));
    if (!parent_launched) {
        SRV_ERR("failed to launch slot with task, id_task = %d\n", id_parent);
        release_slots();
        return false;
    }

    return true;
}
void process_single_task(server_task && task) {
switch (task.type) {
case SERVER_TASK_TYPE_COMPLETION:
@@ -1683,55 +1602,31 @@ private:
case SERVER_TASK_TYPE_EMBEDDING:
case SERVER_TASK_TYPE_RERANK:
{
// special case: if input is provided via CLI, tokenize it first
// otherwise, no need to tokenize as it's already done inside the HTTP thread
if (task.cli_input != nullptr) {
if (!tokenize_cli_input(task)) {
break;
}
if (!tokenize_cli_input(task)) {
break;
}
const int id_slot = task.id_slot;
const int id_task = task.id;
server_slot * slot = id_slot != -1
? get_slot_by_id(id_slot)
: get_available_slot(task);
//
// slot scheduling logic
//
server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
if (slot == nullptr) {
// if no slot is available, we defer this task for processing later
SRV_DBG("no slot is available, defer task, id_task = %d\n", id_task);
SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
queue_tasks.defer(std::move(task));
break;
}
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", id_task);
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
queue_tasks.defer(std::move(task));
break;
}
if (task.is_parent()) {
// try getting free slots for all child tasks
size_t n_child_tasks = task.child_tasks.size();
std::vector<server_slot *> child_slots = get_free_slots(n_child_tasks, slot->id);
if (child_slots.size() < n_child_tasks) {
SRV_DBG("not enough free slots for child tasks, n_free = %zu, n_children = %zu, defer task, id_task = %d\n", child_slots.size(), n_child_tasks, id_task);
queue_tasks.defer(std::move(task));
break;
}
if (!launch_slots_with_parent_task(*slot, child_slots, std::move(task))) {
SRV_ERR("failed to launch slot with parent task, id_task = %d\n", id_task);
break; // drop the task
}
} else if (!launch_slot_with_task(*slot, std::move(task))) {
SRV_ERR("failed to launch slot with task, id_task = %d\n", id_task);
break; // drop the task
if (!launch_slot_with_task(*slot, std::move(task))) {
SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
break;
}
} break;
case SERVER_TASK_TYPE_CANCEL:
@@ -1905,7 +1800,7 @@ private:
// Erase token cache
const size_t n_erased = slot->prompt.tokens.size();
slot->prompt_clear(false);
clear_slot(*slot);
auto res = std::make_unique<server_task_result_slot_erase>();
res->id = task.id;
@@ -2000,7 +1895,7 @@ private:
GGML_ABORT("not supported by multimodal");
}
if (slot.task->is_parent() || slot.task->is_child()) {
if (slot.is_parent() || slot.is_child()) {
send_error(slot, "context shift cannot be used for shared prompt", ERROR_TYPE_SERVER);
slot.release();
continue;
@@ -2139,12 +2034,6 @@ private:
continue;
}
// check if this is a child slot
if (slot.state == SLOT_STATE_WAIT_OTHER) {
SLT_DBG(slot, "%s", "waiting for parent slot to complete\n");
continue;
}
// this slot still has a prompt to be processed
if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
const auto & input_tokens = slot.task->tokens;
@@ -2187,7 +2076,7 @@ private:
}
// TODO: support memory-less logits computation
if (slot.task->need_logits() && !llama_get_memory(ctx)) {
if (slot.need_logits() && !llama_get_memory(ctx)) {
send_error(slot, "the current context does not logits computation. skipping", ERROR_TYPE_SERVER);
slot.release();
continue;
@@ -2424,12 +2313,6 @@ private:
slot.n_prompt_tokens_processed = 0;
slot.prompt.tokens.keep_first(n_past);
// send initial 0% progress update if needed
// this is to signal the client that the request has started processing
if (slot.task->params.stream && slot.task->params.return_progress) {
send_partial_response(slot, {}, true);
}
}
if (!slot.can_split()) {
@@ -2447,7 +2330,7 @@ private:
if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
slot.prompt_clear(true);
clear_slot(slot);
// there is no common part left
slot.n_prompt_tokens_cache = 0;
@@ -2526,7 +2409,7 @@ private:
cur_tok,
slot.prompt.tokens.pos_next(),
{ slot.id },
slot.task->need_embd());
slot.need_embd());
slot.prompt.tokens.push_back(cur_tok);
slot.n_prompt_tokens_processed++;
@@ -2547,6 +2430,16 @@ private:
GGML_ASSERT(batch.n_tokens > 0);
common_sampler_reset(slot.smpl.get());
// Process all prompt tokens through sampler system
for (int i = 0; i < slot.task->n_tokens(); ++i) {
llama_token id = input_tokens[i];
if (id != LLAMA_TOKEN_NULL) {
common_sampler_accept(slot.smpl.get(), id, false);
}
}
// extract the logits only for the last token
batch.logits[batch.n_tokens - 1] = true;
@@ -2555,8 +2448,6 @@ private:
SLT_INF(slot, "prompt done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens);
slot.init_sampler();
const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id);
@@ -2603,6 +2494,11 @@ private:
}
}
if (batch.n_tokens == 0) {
SRV_WRN("%s", "no tokens to decode\n");
return;
}
SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);
if (slot_batched) {
@@ -2616,11 +2512,7 @@ private:
slot_batched->lora[alora_disabled_id].scale = alora_scale;
}
llama_set_embeddings(ctx, slot_batched->task->need_embd());
}
if (batch.n_tokens == 0) {
SRV_WRN("%s", "no tokens to decode\n");
llama_set_embeddings(ctx, slot_batched->need_embd());
}
int32_t i_next = 0;
@@ -2674,7 +2566,7 @@ private:
// note: it's complicated to keep track of how much of the current batch has been
// processed before the error occurred, so we simply clear the entire context
slot.prompt_clear(false);
clear_slot(slot);
}
}
@@ -2698,30 +2590,31 @@ private:
// on successful decode, restore the original batch size
n_batch = llama_n_batch(ctx);
// handle `n_cmpl > 1` tasks - when the main prompt is processed, activate all child tasks too
// technically, measuring the time here excludes the sampling time for the last batch
// but on the other hand, we don't want to do too many system calls to measure the time, so it's ok
const int64_t t_current = ggml_time_us();
for (auto & slot : slots) {
if (slot.state == SLOT_STATE_DONE_PROMPT && slot.task->is_parent()) {
std::vector<server_slot *> children;
// may need to copy state to other slots
if (slot.state == SLOT_STATE_DONE_PROMPT && slot.is_parent()) {
std::vector<server_slot *> child_slots;
for (auto & other : slots) {
if (other.state == SLOT_STATE_WAIT_OTHER && slot.task->id == other.task->id_parent) {
children.push_back(&other);
child_slots.push_back(&other);
}
}
// all child slots should already have been launched by launch_slots_with_parent_task()
// copy state to the child slots
for (auto & child : children) {
SLT_INF(slot, " - copying state to child %d\n", child->id);
GGML_ASSERT(child->state == SLOT_STATE_WAIT_OTHER);
slot.copy_state_to(*child);
child->state = SLOT_STATE_DONE_PROMPT;
// we can only proceed if all child slots have the correct tasks
if (child_slots.size() == slot.task->n_children) {
// copy state to the child slots
for (auto & child : child_slots) {
SLT_INF(slot, "copying state to child %d\n", child->id);
slot.copy_state_to(*child);
child->state = SLOT_STATE_DONE_PROMPT;
}
}
}
}
for (auto & slot : slots) {
// optionally send prompt processing progress
if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_DONE_PROMPT) {
if (slot.task->params.stream && slot.task->params.return_progress) {
@@ -2749,8 +2642,6 @@ private:
continue; // continue loop of slots
}
GGML_ASSERT(slot.task->need_sampling());
// prompt evaluated for next-token prediction
slot.state = SLOT_STATE_GENERATING;
} else if (slot.state != SLOT_STATE_GENERATING) {
@@ -2769,9 +2660,6 @@ private:
common_sampler_accept(slot.smpl.get(), id, true);
// here we have synchronized the llama_context (due to the sampling above), so we can do time measurement
const int64_t t_current = ggml_time_us();
slot.n_decoded += 1;
if (slot.n_decoded == 1) {
@@ -2808,15 +2696,13 @@ private:
continue;
}
const size_t n_draft = slot.drafted.size();
size_t n_draft = slot.drafted.size();
// the accepted tokens from the speculation
const auto ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted);
slot.i_batch_dft.clear();
slot.drafted.clear();
const int64_t t_current = ggml_time_us();
slot.n_decoded += ids.size();
slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
@@ -2898,12 +2784,6 @@ server_response_reader server_context::get_response_reader() {
server_context_meta server_context::get_meta() const {
auto tool_use_src = common_chat_templates_source(impl->chat_templates.get(), "tool_use");
auto bos_id = llama_vocab_bos(impl->vocab);
auto eos_id = llama_vocab_eos(impl->vocab);
auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : "";
auto eos_token_str = eos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, eos_id, true) : "";
return server_context_meta {
/* build_info */ build_info,
/* model_name */ impl->model_name,
@@ -2918,8 +2798,8 @@ server_context_meta server_context::get_meta() const {
/* chat_template */ common_chat_templates_source(impl->chat_templates.get()),
/* chat_template_tool_use */ tool_use_src ? tool_use_src : "",
/* bos_token_str */ bos_token_str,
/* eos_token_str */ eos_token_str,
/* bos_token_str */ common_token_to_piece(impl->ctx, llama_vocab_bos(impl->vocab), true),
/* eos_token_str */ common_token_to_piece(impl->ctx, llama_vocab_eos(impl->vocab), true),
/* fim_pre_token */ llama_vocab_fim_pre(impl->vocab),
/* fim_sub_token */ llama_vocab_fim_suf(impl->vocab),
/* fim_mid_token */ llama_vocab_fim_mid(impl->vocab),
@@ -2992,9 +2872,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
// Everything else, including multimodal completions.
inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
}
// tasks.reserve(inputs.size()); // TODO: this is inaccurate due to child tasks
tasks.reserve(inputs.size());
for (size_t i = 0; i < inputs.size(); i++) {
server_task task = server_task(type);
@@ -3013,11 +2891,13 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
task.params.oaicompat_cmpl_id = completion_id;
task.params.oaicompat_model = meta->model_name;
// prepare child tasks
if (task.params.n_cmpl > 1) {
int n_children = task.params.n_cmpl - 1;
for (int j = 0; j < n_children; j++) {
task.add_child(task.id, rd.get_new_id());
task.n_children = task.params.n_cmpl - 1;
for (size_t j = 0; j < task.n_children; j++) {
server_task child = task.create_child(
task.id,
rd.get_new_id());
tasks.push_back(std::move(child));
}
}
@@ -3066,22 +2946,19 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
// in streaming mode, the first error must be treated as non-stream response
// this is to match the OAI API behavior
// ref: https://github.com/ggml-org/llama.cpp/pull/16486#discussion_r2419657309
auto first_result = rd.next(req.should_stop);
server_task_result_ptr first_result = rd.next(req.should_stop);
if (first_result == nullptr) {
GGML_ASSERT(req.should_stop());
return res; // connection is closed
}
if (first_result->is_error()) {
} else if (first_result->is_error()) {
res->error(first_result->to_json());
return res;
} else {
GGML_ASSERT(
dynamic_cast<server_task_result_cmpl_partial*>(first_result.get()) != nullptr
|| dynamic_cast<server_task_result_cmpl_final*>(first_result.get()) != nullptr
);
}
GGML_ASSERT(
dynamic_cast<server_task_result_cmpl_partial*>(first_result.get()) != nullptr ||
dynamic_cast<server_task_result_cmpl_final*> (first_result.get()) != nullptr
);
// next responses are streamed
// to be sent immediately
json first_result_json = first_result->to_json();
@@ -3137,7 +3014,6 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
auto result = rd.next(req.should_stop);
if (result == nullptr) {
SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
GGML_ASSERT(req.should_stop());
return false; // should_stop condition met
}
@@ -3221,11 +3097,6 @@ void server_routes::init_routes() {
// get the result
auto result = res->rd.next(req.should_stop);
if (!result) {
// connection was closed
GGML_ASSERT(req.should_stop());
return res;
}
if (result->is_error()) {
res->error(result->to_json());
@@ -3326,11 +3197,6 @@ void server_routes::init_routes() {
// get the result
auto result = res->rd.next(req.should_stop);
if (!result) {
// connection was closed
GGML_ASSERT(req.should_stop());
return res;
}
if (result->is_error()) {
res->error(result->to_json());
@@ -3837,12 +3703,7 @@ void server_routes::init_routes() {
}
// get the result
auto result = rd.next(req.should_stop);
if (!result) {
// connection was closed
GGML_ASSERT(req.should_stop());
return res;
}
server_task_result_ptr result = rd.next(req.should_stop);
if (result->is_error()) {
res->error(result->to_json());
@@ -3871,12 +3732,7 @@ void server_routes::init_routes() {
}
// get the result
auto result = rd.next(req.should_stop);
if (!result) {
// connection was closed
GGML_ASSERT(req.should_stop());
return res;
}
server_task_result_ptr result = rd.next(req.should_stop);
if (result->is_error()) {
res->error(result->to_json());
@@ -3909,12 +3765,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_save(const ser
rd.post_task(std::move(task));
}
auto result = rd.next(req.should_stop);
if (!result) {
// connection was closed
GGML_ASSERT(req.should_stop());
return res;
}
server_task_result_ptr result = rd.next(req.should_stop);
if (result->is_error()) {
res->error(result->to_json());
@@ -3945,12 +3796,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_restore(const
rd.post_task(std::move(task));
}
auto result = rd.next(req.should_stop);
if (!result) {
// connection was closed
GGML_ASSERT(req.should_stop());
return res;
}
server_task_result_ptr result = rd.next(req.should_stop);
if (result->is_error()) {
res->error(result->to_json());
@@ -3972,12 +3818,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_erase(const se
rd.post_task(std::move(task));
}
auto result = rd.next(req.should_stop);
if (!result) {
// connection was closed
GGML_ASSERT(req.should_stop());
return res;
}
server_task_result_ptr result = rd.next(req.should_stop);
if (result->is_error()) {
res->error(result->to_json());