sync from b7516
This commit is contained in:
@@ -4,6 +4,7 @@
|
||||
#include "server-task.h"
|
||||
#include "server-queue.h"
|
||||
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "log.h"
|
||||
@@ -15,6 +16,7 @@
|
||||
#include <cstddef>
|
||||
#include <cinttypes>
|
||||
#include <memory>
|
||||
#include <unordered_set>
|
||||
#include <filesystem>
|
||||
|
||||
// fix problem with std::min and std::max
|
||||
@@ -45,6 +47,26 @@ enum server_state {
|
||||
SERVER_STATE_READY, // Server is ready and model is loaded
|
||||
};
|
||||
|
||||
// Tells whether a task of the given type needs embeddings:
// true for EMBEDDING and RERANK tasks, false for every other task type.
static bool server_task_type_need_embd(server_task_type task_type) {
    const bool is_embedding = task_type == SERVER_TASK_TYPE_EMBEDDING;
    const bool is_rerank    = task_type == SERVER_TASK_TYPE_RERANK;

    return is_embedding || is_rerank;
}
|
||||
|
||||
// Tells whether a task of the given type needs logits:
// true for COMPLETION and INFILL tasks, false for every other task type.
static bool server_task_type_need_logits(server_task_type task_type) {
    const bool is_completion = task_type == SERVER_TASK_TYPE_COMPLETION;
    const bool is_infill     = task_type == SERVER_TASK_TYPE_INFILL;

    return is_completion || is_infill;
}
|
||||
|
||||
struct server_slot {
|
||||
int id;
|
||||
|
||||
@@ -59,8 +81,6 @@ struct server_slot {
|
||||
|
||||
common_speculative * spec = nullptr;
|
||||
|
||||
// TODO: move members that belong to the task (such as `generated_text`, `has_new_line`) to task_results_state
|
||||
// see https://github.com/ggml-org/llama.cpp/pull/18283#issuecomment-3710175837
|
||||
std::unique_ptr<const server_task> task;
|
||||
std::unique_ptr<const server_task> task_prev; // used for debugging
|
||||
|
||||
@@ -127,17 +147,6 @@ struct server_slot {
|
||||
return res;
|
||||
}
|
||||
|
||||
void prompt_clear(bool allow_processing) {
|
||||
if (!allow_processing) {
|
||||
GGML_ASSERT(!is_processing());
|
||||
}
|
||||
|
||||
SLT_INF(*this, "clearing prompt with %zu tokens\n", prompt.tokens.size());
|
||||
|
||||
llama_memory_seq_rm(llama_get_memory(ctx), id, -1, -1);
|
||||
prompt.tokens.clear();
|
||||
}
|
||||
|
||||
std::vector<common_adapter_lora_info> lora;
|
||||
int32_t alora_invocation_start = -1;
|
||||
|
||||
@@ -146,7 +155,7 @@ struct server_slot {
|
||||
|
||||
common_sampler_ptr smpl;
|
||||
|
||||
llama_token sampled; // in speculative mode, this is the last accepted token
|
||||
llama_token sampled; // in speculative mode, this is the last accepted token
|
||||
llama_tokens drafted;
|
||||
|
||||
// stats
|
||||
@@ -158,7 +167,7 @@ struct server_slot {
|
||||
double t_prompt_processing; // ms
|
||||
double t_token_generation; // ms
|
||||
|
||||
std::function<void(int /* slot_id */)> callback_on_release;
|
||||
std::function<void(int)> callback_on_release;
|
||||
|
||||
// Speculative decoding stats
|
||||
int32_t n_draft_total = 0; // Total draft tokens generated
|
||||
@@ -187,46 +196,30 @@ struct server_slot {
|
||||
n_draft_total = 0;
|
||||
n_draft_accepted = 0;
|
||||
|
||||
task_prev = std::move(task);
|
||||
task.reset();
|
||||
|
||||
llama_set_sampler(ctx, id, nullptr);
|
||||
task_prev.reset();
|
||||
|
||||
// clear alora start
|
||||
alora_invocation_start = -1;
|
||||
}
|
||||
|
||||
void init_sampler() const {
|
||||
common_sampler_reset(smpl.get());
|
||||
bool need_embd() const {
|
||||
GGML_ASSERT(task);
|
||||
|
||||
if (!task->need_sampling()) {
|
||||
return;
|
||||
}
|
||||
return server_task_type_need_embd(task->type);
|
||||
}
|
||||
|
||||
const int64_t t_start = ggml_time_us();
|
||||
bool need_logits() const {
|
||||
GGML_ASSERT(task);
|
||||
|
||||
int n_text = 0;
|
||||
|
||||
for (int i = 0; i < (int) prompt.tokens.size(); i++) {
|
||||
const llama_token id = prompt.tokens[i];
|
||||
|
||||
if (id != LLAMA_TOKEN_NULL) {
|
||||
common_sampler_accept(smpl.get(), id, false);
|
||||
n_text++;
|
||||
}
|
||||
}
|
||||
|
||||
SLT_INF(*this, "init sampler, took %0.2f ms, tokens: text = %d, total = %d\n",
|
||||
(ggml_time_us() - t_start) / 1000.0, n_text, (int) prompt.tokens.size());
|
||||
return server_task_type_need_logits(task->type);
|
||||
}
|
||||
|
||||
// if the context does not have a memory module then all embeddings have to be computed within a single ubatch
|
||||
// also we cannot split if the pooling would require any past tokens
|
||||
bool can_split() const {
|
||||
GGML_ASSERT(task);
|
||||
|
||||
return
|
||||
!task->need_embd() ||
|
||||
!need_embd() ||
|
||||
(llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
|
||||
}
|
||||
|
||||
@@ -267,13 +260,10 @@ struct server_slot {
|
||||
SLT_WRN(*this, "%s", "slot is not processing\n");
|
||||
return;
|
||||
}
|
||||
|
||||
generated_token_probs.push_back(token);
|
||||
}
|
||||
|
||||
int get_n_draft_max() const {
|
||||
GGML_ASSERT(task);
|
||||
|
||||
if (!can_speculate()) {
|
||||
return 0;
|
||||
}
|
||||
@@ -298,23 +288,27 @@ struct server_slot {
|
||||
return n_draft_max;
|
||||
}
|
||||
|
||||
// note: a slot can also be either a parent or a child
|
||||
bool is_parent() const {
|
||||
return is_processing() && task->n_children > 0;
|
||||
}
|
||||
|
||||
bool is_child() const {
|
||||
return is_processing() && task->id_parent >= 0;
|
||||
}
|
||||
|
||||
void release() {
|
||||
if (is_processing()) {
|
||||
GGML_ASSERT(task);
|
||||
|
||||
SLT_INF(*this, "stop processing: n_tokens = %d, truncated = %d\n", prompt.n_tokens(), truncated);
|
||||
|
||||
t_last_used = ggml_time_us();
|
||||
t_last_used = ggml_time_us();
|
||||
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
|
||||
|
||||
state = SLOT_STATE_IDLE;
|
||||
|
||||
// do not keep context of the child slots - the parent's context is enough
|
||||
if (task->is_child()) {
|
||||
prompt_clear(false);
|
||||
}
|
||||
|
||||
reset();
|
||||
task_prev = std::move(task);
|
||||
task.reset();
|
||||
|
||||
callback_on_release(id);
|
||||
}
|
||||
@@ -433,22 +427,14 @@ struct server_slot {
|
||||
}
|
||||
|
||||
void copy_state_to(server_slot & other) const {
|
||||
GGML_ASSERT(state == SLOT_STATE_DONE_PROMPT);
|
||||
|
||||
llama_memory_seq_rm(llama_get_memory(ctx), other.id, -1, -1);
|
||||
llama_memory_seq_cp(llama_get_memory(ctx), id, other.id, -1, -1);
|
||||
|
||||
llama_memory_seq_rm(llama_get_memory(ctx), other.id, 0, -1);
|
||||
llama_memory_seq_cp(llama_get_memory(ctx), id, other.id, 0, -1);
|
||||
other.n_decoded = n_decoded;
|
||||
other.n_remaining = n_remaining;
|
||||
other.i_batch = i_batch;
|
||||
|
||||
other.t_start_process_prompt = t_start_process_prompt;
|
||||
other.t_prompt_processing = t_prompt_processing;
|
||||
other.n_prompt_tokens_cache = n_prompt_tokens_cache;
|
||||
other.n_prompt_tokens_processed = n_prompt_tokens_processed;
|
||||
|
||||
other.prompt = prompt.clone();
|
||||
other.init_sampler();
|
||||
}
|
||||
};
|
||||
|
||||
@@ -761,8 +747,6 @@ private:
|
||||
}
|
||||
|
||||
slots.clear();
|
||||
|
||||
// initialize slots
|
||||
for (int i = 0; i < params_base.n_parallel; i++) {
|
||||
server_slot slot;
|
||||
|
||||
@@ -794,8 +778,8 @@ private:
|
||||
|
||||
SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx);
|
||||
|
||||
slot.callback_on_release = [this](int slot_id) {
|
||||
queue_tasks.pop_deferred_task(slot_id);
|
||||
slot.callback_on_release = [this](int) {
|
||||
queue_tasks.pop_deferred_task();
|
||||
};
|
||||
|
||||
slot.reset();
|
||||
@@ -909,9 +893,9 @@ private:
|
||||
return true;
|
||||
}
|
||||
|
||||
server_slot * get_slot_by_id(int id_slot) {
|
||||
server_slot * get_slot_by_id(int id) {
|
||||
for (server_slot & slot : slots) {
|
||||
if (slot.id == id_slot) {
|
||||
if (slot.id == id) {
|
||||
return &slot;
|
||||
}
|
||||
}
|
||||
@@ -1011,7 +995,7 @@ private:
|
||||
ret->prompt_save(*prompt_cache);
|
||||
|
||||
if (!ret->prompt_load(*prompt_cache, task.tokens)) {
|
||||
ret->prompt_clear(false);
|
||||
clear_slot(*ret);
|
||||
}
|
||||
|
||||
prompt_cache->update();
|
||||
@@ -1023,6 +1007,15 @@ private:
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Wipes the cached state of `slot`: removes its sequence (all positions) from the
// context memory and clears its prompt tokens.
// The slot is asserted not to be processing when this is called.
void clear_slot(server_slot & slot) const {
    GGML_ASSERT(!slot.is_processing());

    // log the size before anything is removed
    const auto n_cached = slot.prompt.tokens.size();
    SLT_WRN(slot, "clearing slot with %zu tokens\n", n_cached);

    const auto mem = llama_get_memory(ctx);
    llama_memory_seq_rm(mem, slot.id, -1, -1);

    slot.prompt.tokens.clear();
}
|
||||
|
||||
// return true if at least one slot has been cleared
|
||||
// TODO: improve logic
|
||||
// - smarter decision which slot to clear (LRU or longest prompt?)
|
||||
@@ -1043,7 +1036,7 @@ private:
|
||||
if (slot.prompt.n_tokens() > 0) {
|
||||
SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size());
|
||||
|
||||
slot.prompt_clear(false);
|
||||
clear_slot(slot);
|
||||
|
||||
res = true;
|
||||
|
||||
@@ -1069,6 +1062,8 @@ private:
|
||||
}
|
||||
|
||||
bool launch_slot_with_task(server_slot & slot, server_task && task) {
|
||||
slot.reset();
|
||||
|
||||
// process per-request lora adapters
|
||||
if (!task.params.lora.empty()) {
|
||||
auto task_loras = construct_lora_list(task.params.lora);
|
||||
@@ -1142,7 +1137,7 @@ private:
|
||||
SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());
|
||||
|
||||
// initialize samplers
|
||||
if (task.need_sampling()) {
|
||||
{
|
||||
slot.smpl.reset(common_sampler_init(model, task.params.sampling));
|
||||
|
||||
if (slot.smpl == nullptr) {
|
||||
@@ -1151,28 +1146,7 @@ private:
|
||||
return false;
|
||||
}
|
||||
|
||||
const bool need_logits = task.params.sampling.n_probs > 0;
|
||||
|
||||
bool backend_sampling = true;
|
||||
|
||||
backend_sampling &= task.params.sampling.backend_sampling;
|
||||
|
||||
// TODO: speculative decoding requires multiple samples per batch - not supported yet
|
||||
backend_sampling &= !(slot.ctx_dft && task.params.speculative.n_max > 0);
|
||||
|
||||
// TODO: getting post/pre sampling logits is not yet supported with backend sampling
|
||||
backend_sampling &= !need_logits;
|
||||
|
||||
// TODO: tmp until backend sampling is fully implemented
|
||||
if (backend_sampling) {
|
||||
llama_set_sampler(ctx, slot.id, common_sampler_get(slot.smpl.get()));
|
||||
} else {
|
||||
llama_set_sampler(ctx, slot.id, nullptr);
|
||||
}
|
||||
|
||||
SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str());
|
||||
} else {
|
||||
slot.smpl.reset();
|
||||
}
|
||||
|
||||
// initialize draft batch
|
||||
@@ -1185,11 +1159,12 @@ private:
|
||||
|
||||
slot.task = std::make_unique<const server_task>(std::move(task));
|
||||
|
||||
slot.state = slot.task->is_child()
|
||||
slot.state = slot.is_child()
|
||||
? SLOT_STATE_WAIT_OTHER // wait for the parent to process prompt
|
||||
: SLOT_STATE_STARTED;
|
||||
|
||||
SLT_INF(slot, "processing task, is_child = %d\n", slot.task->is_child());
|
||||
SLT_INF(slot, "%s", "processing task\n");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1509,9 +1484,9 @@ private:
|
||||
res->n_tokens = slot.task->n_tokens();
|
||||
res->res_type = slot.task->params.res_type;
|
||||
|
||||
const int n_embd_out = llama_model_n_embd_out(model);
|
||||
const int n_embd = llama_model_n_embd(model);
|
||||
|
||||
std::vector<float> embd_res(n_embd_out, 0.0f);
|
||||
std::vector<float> embd_res(n_embd, 0.0f);
|
||||
|
||||
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
|
||||
@@ -1528,18 +1503,18 @@ private:
|
||||
if (embd == nullptr) {
|
||||
SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
|
||||
|
||||
res->embedding.push_back(std::vector<float>(n_embd_out, 0.0f));
|
||||
res->embedding.push_back(std::vector<float>(n_embd, 0.0f));
|
||||
continue;
|
||||
}
|
||||
|
||||
// normalize only when there is pooling
|
||||
if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) {
|
||||
common_embd_normalize(embd, embd_res.data(), n_embd_out, slot.task->params.embd_normalize);
|
||||
common_embd_normalize(embd, embd_res.data(), n_embd, slot.task->params.embd_normalize);
|
||||
res->embedding.push_back(embd_res);
|
||||
break;
|
||||
}
|
||||
|
||||
res->embedding.emplace_back(embd, embd + n_embd_out);
|
||||
res->embedding.emplace_back(embd, embd + n_embd);
|
||||
}
|
||||
|
||||
SLT_DBG(slot, "%s", "sending embeddings\n");
|
||||
@@ -1584,7 +1559,9 @@ private:
|
||||
|
||||
// tokenize the input if it's set by CLI, return false on error
|
||||
bool tokenize_cli_input(server_task & task) {
|
||||
GGML_ASSERT(task.cli_input != nullptr);
|
||||
if (task.cli_input == nullptr) {
|
||||
return true; // nothing to do
|
||||
}
|
||||
try {
|
||||
auto & opt = oai_parser_opt;
|
||||
common_chat_templates_inputs inputs;
|
||||
@@ -1618,64 +1595,6 @@ private:
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<server_slot *> get_free_slots(size_t n_slots_needed, int exclude_id_slot) {
|
||||
std::vector<server_slot *> free_slots;
|
||||
for (auto & slot : slots) {
|
||||
if (!slot.is_processing() && slot.id != exclude_id_slot) {
|
||||
free_slots.push_back(&slot);
|
||||
}
|
||||
if (free_slots.size() >= n_slots_needed) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return free_slots;
|
||||
}
|
||||
|
||||
// launch multiple slots for parent + child tasks
|
||||
// Launches a parent task together with all of its child tasks.
//
// parent_slot - slot that receives the parent task; asserted not to be processing
// child_slots - one slot per child task, matched by index with parent_task.child_tasks
// parent_task - the parent task; its children and then the task itself are moved
//               into the slots
//
// The child tasks are launched first and the parent task last. Returns true when
// every launch succeeded. On the first failure, every slot that is processing the
// parent task or one of its children is released and false is returned.
bool launch_slots_with_parent_task(server_slot & parent_slot, std::vector<server_slot *> & child_slots, server_task && parent_task) {
    GGML_ASSERT(!parent_slot.is_processing());
    GGML_ASSERT(parent_task.is_parent());
    GGML_ASSERT(child_slots.size() == parent_task.child_tasks.size());

    // remember the id now: parent_task is moved from further below
    const int id_parent = parent_task.id;

    SRV_INF("launching slots for parent task id_task = %d with %zu child tasks\n", id_parent, parent_task.child_tasks.size());

    // rollback helper: release every slot that is running the parent or one of its children
    auto release_slots = [this, id_parent]() {
        for (auto & slot : slots) {
            if (!slot.is_processing()) {
                continue;
            }

            const bool runs_parent = slot.task->id == id_parent;
            const bool runs_child  = slot.task->id_parent == id_parent;

            if (runs_parent || runs_child) {
                slot.release();
            }
        }
    };

    // launch all child tasks first
    GGML_ASSERT(child_slots.size() == parent_task.child_tasks.size());
    for (size_t idx = 0; idx < child_slots.size(); ++idx) {
        auto & child_task = parent_task.child_tasks[idx];

        // read the id before the task is moved into the slot
        const int id_child = child_task.id;

        if (launch_slot_with_task(*child_slots[idx], std::move(child_task))) {
            continue;
        }

        SRV_ERR("failed to launch slot with child task, id_task = %d\n", id_child);
        release_slots();
        return false;
    }

    // finally, launch the parent task
    const bool parent_ok = launch_slot_with_task(parent_slot, std::move(parent_task));
    if (!parent_ok) {
        SRV_ERR("failed to launch slot with task, id_task = %d\n", id_parent);
        release_slots();
        return false;
    }

    return true;
}
|
||||
|
||||
void process_single_task(server_task && task) {
|
||||
switch (task.type) {
|
||||
case SERVER_TASK_TYPE_COMPLETION:
|
||||
@@ -1683,55 +1602,31 @@ private:
|
||||
case SERVER_TASK_TYPE_EMBEDDING:
|
||||
case SERVER_TASK_TYPE_RERANK:
|
||||
{
|
||||
// special case: if input is provided via CLI, tokenize it first
|
||||
// otherwise, no need to tokenize as it's already done inside the HTTP thread
|
||||
if (task.cli_input != nullptr) {
|
||||
if (!tokenize_cli_input(task)) {
|
||||
break;
|
||||
}
|
||||
if (!tokenize_cli_input(task)) {
|
||||
break;
|
||||
}
|
||||
|
||||
const int id_slot = task.id_slot;
|
||||
const int id_task = task.id;
|
||||
|
||||
server_slot * slot = id_slot != -1
|
||||
? get_slot_by_id(id_slot)
|
||||
: get_available_slot(task);
|
||||
|
||||
//
|
||||
// slot scheduling logic
|
||||
//
|
||||
server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
|
||||
|
||||
if (slot == nullptr) {
|
||||
// if no slot is available, we defer this task for processing later
|
||||
SRV_DBG("no slot is available, defer task, id_task = %d\n", id_task);
|
||||
SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
|
||||
queue_tasks.defer(std::move(task));
|
||||
break;
|
||||
}
|
||||
|
||||
if (slot->is_processing()) {
|
||||
// if requested slot is unavailable, we defer this task for processing later
|
||||
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", id_task);
|
||||
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
||||
queue_tasks.defer(std::move(task));
|
||||
break;
|
||||
}
|
||||
|
||||
if (task.is_parent()) {
|
||||
// try getting free slots for all child tasks
|
||||
size_t n_child_tasks = task.child_tasks.size();
|
||||
std::vector<server_slot *> child_slots = get_free_slots(n_child_tasks, slot->id);
|
||||
if (child_slots.size() < n_child_tasks) {
|
||||
SRV_DBG("not enough free slots for child tasks, n_free = %zu, n_children = %zu, defer task, id_task = %d\n", child_slots.size(), n_child_tasks, id_task);
|
||||
queue_tasks.defer(std::move(task));
|
||||
break;
|
||||
}
|
||||
if (!launch_slots_with_parent_task(*slot, child_slots, std::move(task))) {
|
||||
SRV_ERR("failed to launch slot with parent task, id_task = %d\n", id_task);
|
||||
break; // drop the task
|
||||
}
|
||||
} else if (!launch_slot_with_task(*slot, std::move(task))) {
|
||||
SRV_ERR("failed to launch slot with task, id_task = %d\n", id_task);
|
||||
break; // drop the task
|
||||
if (!launch_slot_with_task(*slot, std::move(task))) {
|
||||
SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
|
||||
break;
|
||||
}
|
||||
} break;
|
||||
case SERVER_TASK_TYPE_CANCEL:
|
||||
@@ -1905,7 +1800,7 @@ private:
|
||||
// Erase token cache
|
||||
const size_t n_erased = slot->prompt.tokens.size();
|
||||
|
||||
slot->prompt_clear(false);
|
||||
clear_slot(*slot);
|
||||
|
||||
auto res = std::make_unique<server_task_result_slot_erase>();
|
||||
res->id = task.id;
|
||||
@@ -2000,7 +1895,7 @@ private:
|
||||
GGML_ABORT("not supported by multimodal");
|
||||
}
|
||||
|
||||
if (slot.task->is_parent() || slot.task->is_child()) {
|
||||
if (slot.is_parent() || slot.is_child()) {
|
||||
send_error(slot, "context shift cannot be used for shared prompt", ERROR_TYPE_SERVER);
|
||||
slot.release();
|
||||
continue;
|
||||
@@ -2139,12 +2034,6 @@ private:
|
||||
continue;
|
||||
}
|
||||
|
||||
// check if this is a child slot
|
||||
if (slot.state == SLOT_STATE_WAIT_OTHER) {
|
||||
SLT_DBG(slot, "%s", "waiting for parent slot to complete\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
// this slot still has a prompt to be processed
|
||||
if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
|
||||
const auto & input_tokens = slot.task->tokens;
|
||||
@@ -2187,7 +2076,7 @@ private:
|
||||
}
|
||||
|
||||
// TODO: support memory-less logits computation
|
||||
if (slot.task->need_logits() && !llama_get_memory(ctx)) {
|
||||
if (slot.need_logits() && !llama_get_memory(ctx)) {
|
||||
send_error(slot, "the current context does not logits computation. skipping", ERROR_TYPE_SERVER);
|
||||
slot.release();
|
||||
continue;
|
||||
@@ -2424,12 +2313,6 @@ private:
|
||||
slot.n_prompt_tokens_processed = 0;
|
||||
|
||||
slot.prompt.tokens.keep_first(n_past);
|
||||
|
||||
// send initial 0% progress update if needed
|
||||
// this is to signal the client that the request has started processing
|
||||
if (slot.task->params.stream && slot.task->params.return_progress) {
|
||||
send_partial_response(slot, {}, true);
|
||||
}
|
||||
}
|
||||
|
||||
if (!slot.can_split()) {
|
||||
@@ -2447,7 +2330,7 @@ private:
|
||||
if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
|
||||
SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
|
||||
|
||||
slot.prompt_clear(true);
|
||||
clear_slot(slot);
|
||||
|
||||
// there is no common part left
|
||||
slot.n_prompt_tokens_cache = 0;
|
||||
@@ -2526,7 +2409,7 @@ private:
|
||||
cur_tok,
|
||||
slot.prompt.tokens.pos_next(),
|
||||
{ slot.id },
|
||||
slot.task->need_embd());
|
||||
slot.need_embd());
|
||||
slot.prompt.tokens.push_back(cur_tok);
|
||||
|
||||
slot.n_prompt_tokens_processed++;
|
||||
@@ -2547,6 +2430,16 @@ private:
|
||||
|
||||
GGML_ASSERT(batch.n_tokens > 0);
|
||||
|
||||
common_sampler_reset(slot.smpl.get());
|
||||
|
||||
// Process all prompt tokens through sampler system
|
||||
for (int i = 0; i < slot.task->n_tokens(); ++i) {
|
||||
llama_token id = input_tokens[i];
|
||||
if (id != LLAMA_TOKEN_NULL) {
|
||||
common_sampler_accept(slot.smpl.get(), id, false);
|
||||
}
|
||||
}
|
||||
|
||||
// extract the logits only for the last token
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
|
||||
@@ -2555,8 +2448,6 @@ private:
|
||||
|
||||
SLT_INF(slot, "prompt done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens);
|
||||
|
||||
slot.init_sampler();
|
||||
|
||||
const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
|
||||
const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id);
|
||||
|
||||
@@ -2603,6 +2494,11 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
if (batch.n_tokens == 0) {
|
||||
SRV_WRN("%s", "no tokens to decode\n");
|
||||
return;
|
||||
}
|
||||
|
||||
SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);
|
||||
|
||||
if (slot_batched) {
|
||||
@@ -2616,11 +2512,7 @@ private:
|
||||
slot_batched->lora[alora_disabled_id].scale = alora_scale;
|
||||
}
|
||||
|
||||
llama_set_embeddings(ctx, slot_batched->task->need_embd());
|
||||
}
|
||||
|
||||
if (batch.n_tokens == 0) {
|
||||
SRV_WRN("%s", "no tokens to decode\n");
|
||||
llama_set_embeddings(ctx, slot_batched->need_embd());
|
||||
}
|
||||
|
||||
int32_t i_next = 0;
|
||||
@@ -2674,7 +2566,7 @@ private:
|
||||
|
||||
// note: it's complicated to keep track of how much of the current batch has been
|
||||
// processed before the error occurred, so we simply clear the entire context
|
||||
slot.prompt_clear(false);
|
||||
clear_slot(slot);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2698,30 +2590,31 @@ private:
|
||||
// on successful decode, restore the original batch size
|
||||
n_batch = llama_n_batch(ctx);
|
||||
|
||||
// handle `n_cmpl > 1` tasks - when the main prompt is processed, activate all child tasks too
|
||||
// technically, measuring the time here excludes the sampling time for the last batch
|
||||
// but on the other hand, we don't want to do too many system calls to measure the time, so it's ok
|
||||
const int64_t t_current = ggml_time_us();
|
||||
|
||||
for (auto & slot : slots) {
|
||||
if (slot.state == SLOT_STATE_DONE_PROMPT && slot.task->is_parent()) {
|
||||
std::vector<server_slot *> children;
|
||||
// may need to copy state to other slots
|
||||
if (slot.state == SLOT_STATE_DONE_PROMPT && slot.is_parent()) {
|
||||
std::vector<server_slot *> child_slots;
|
||||
for (auto & other : slots) {
|
||||
if (other.state == SLOT_STATE_WAIT_OTHER && slot.task->id == other.task->id_parent) {
|
||||
children.push_back(&other);
|
||||
child_slots.push_back(&other);
|
||||
}
|
||||
}
|
||||
|
||||
// all child slots should already have been launched by launch_slots_with_parent_task()
|
||||
// copy state to the child slots
|
||||
for (auto & child : children) {
|
||||
SLT_INF(slot, " - copying state to child %d\n", child->id);
|
||||
|
||||
GGML_ASSERT(child->state == SLOT_STATE_WAIT_OTHER);
|
||||
|
||||
slot.copy_state_to(*child);
|
||||
child->state = SLOT_STATE_DONE_PROMPT;
|
||||
// we can only proceed if all child slots have the correct tasks
|
||||
if (child_slots.size() == slot.task->n_children) {
|
||||
// copy state to the child slots
|
||||
for (auto & child : child_slots) {
|
||||
SLT_INF(slot, "copying state to child %d\n", child->id);
|
||||
slot.copy_state_to(*child);
|
||||
child->state = SLOT_STATE_DONE_PROMPT;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (auto & slot : slots) {
|
||||
// optionally send prompt processing progress
|
||||
if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_DONE_PROMPT) {
|
||||
if (slot.task->params.stream && slot.task->params.return_progress) {
|
||||
@@ -2749,8 +2642,6 @@ private:
|
||||
continue; // continue loop of slots
|
||||
}
|
||||
|
||||
GGML_ASSERT(slot.task->need_sampling());
|
||||
|
||||
// prompt evaluated for next-token prediction
|
||||
slot.state = SLOT_STATE_GENERATING;
|
||||
} else if (slot.state != SLOT_STATE_GENERATING) {
|
||||
@@ -2769,9 +2660,6 @@ private:
|
||||
|
||||
common_sampler_accept(slot.smpl.get(), id, true);
|
||||
|
||||
// here we have synchronized the llama_context (due to the sampling above), so we can do time measurement
|
||||
const int64_t t_current = ggml_time_us();
|
||||
|
||||
slot.n_decoded += 1;
|
||||
|
||||
if (slot.n_decoded == 1) {
|
||||
@@ -2808,15 +2696,13 @@ private:
|
||||
continue;
|
||||
}
|
||||
|
||||
const size_t n_draft = slot.drafted.size();
|
||||
size_t n_draft = slot.drafted.size();
|
||||
|
||||
// the accepted tokens from the speculation
|
||||
const auto ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted);
|
||||
slot.i_batch_dft.clear();
|
||||
slot.drafted.clear();
|
||||
|
||||
const int64_t t_current = ggml_time_us();
|
||||
|
||||
slot.n_decoded += ids.size();
|
||||
|
||||
slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
|
||||
@@ -2898,12 +2784,6 @@ server_response_reader server_context::get_response_reader() {
|
||||
|
||||
server_context_meta server_context::get_meta() const {
|
||||
auto tool_use_src = common_chat_templates_source(impl->chat_templates.get(), "tool_use");
|
||||
|
||||
auto bos_id = llama_vocab_bos(impl->vocab);
|
||||
auto eos_id = llama_vocab_eos(impl->vocab);
|
||||
auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : "";
|
||||
auto eos_token_str = eos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, eos_id, true) : "";
|
||||
|
||||
return server_context_meta {
|
||||
/* build_info */ build_info,
|
||||
/* model_name */ impl->model_name,
|
||||
@@ -2918,8 +2798,8 @@ server_context_meta server_context::get_meta() const {
|
||||
/* chat_template */ common_chat_templates_source(impl->chat_templates.get()),
|
||||
/* chat_template_tool_use */ tool_use_src ? tool_use_src : "",
|
||||
|
||||
/* bos_token_str */ bos_token_str,
|
||||
/* eos_token_str */ eos_token_str,
|
||||
/* bos_token_str */ common_token_to_piece(impl->ctx, llama_vocab_bos(impl->vocab), true),
|
||||
/* eos_token_str */ common_token_to_piece(impl->ctx, llama_vocab_eos(impl->vocab), true),
|
||||
/* fim_pre_token */ llama_vocab_fim_pre(impl->vocab),
|
||||
/* fim_sub_token */ llama_vocab_fim_suf(impl->vocab),
|
||||
/* fim_mid_token */ llama_vocab_fim_mid(impl->vocab),
|
||||
@@ -2992,9 +2872,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
||||
// Everything else, including multimodal completions.
|
||||
inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
|
||||
}
|
||||
|
||||
// tasks.reserve(inputs.size()); // TODO: this is inaccurate due to child tasks
|
||||
|
||||
tasks.reserve(inputs.size());
|
||||
for (size_t i = 0; i < inputs.size(); i++) {
|
||||
server_task task = server_task(type);
|
||||
|
||||
@@ -3013,11 +2891,13 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
||||
task.params.oaicompat_cmpl_id = completion_id;
|
||||
task.params.oaicompat_model = meta->model_name;
|
||||
|
||||
// prepare child tasks
|
||||
if (task.params.n_cmpl > 1) {
|
||||
int n_children = task.params.n_cmpl - 1;
|
||||
for (int j = 0; j < n_children; j++) {
|
||||
task.add_child(task.id, rd.get_new_id());
|
||||
task.n_children = task.params.n_cmpl - 1;
|
||||
for (size_t j = 0; j < task.n_children; j++) {
|
||||
server_task child = task.create_child(
|
||||
task.id,
|
||||
rd.get_new_id());
|
||||
tasks.push_back(std::move(child));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3066,22 +2946,19 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
||||
// in streaming mode, the first error must be treated as non-stream response
|
||||
// this is to match the OAI API behavior
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/16486#discussion_r2419657309
|
||||
auto first_result = rd.next(req.should_stop);
|
||||
server_task_result_ptr first_result = rd.next(req.should_stop);
|
||||
if (first_result == nullptr) {
|
||||
GGML_ASSERT(req.should_stop());
|
||||
return res; // connection is closed
|
||||
}
|
||||
|
||||
if (first_result->is_error()) {
|
||||
} else if (first_result->is_error()) {
|
||||
res->error(first_result->to_json());
|
||||
return res;
|
||||
} else {
|
||||
GGML_ASSERT(
|
||||
dynamic_cast<server_task_result_cmpl_partial*>(first_result.get()) != nullptr
|
||||
|| dynamic_cast<server_task_result_cmpl_final*>(first_result.get()) != nullptr
|
||||
);
|
||||
}
|
||||
|
||||
GGML_ASSERT(
|
||||
dynamic_cast<server_task_result_cmpl_partial*>(first_result.get()) != nullptr ||
|
||||
dynamic_cast<server_task_result_cmpl_final*> (first_result.get()) != nullptr
|
||||
);
|
||||
|
||||
// next responses are streamed
|
||||
// to be sent immediately
|
||||
json first_result_json = first_result->to_json();
|
||||
@@ -3137,7 +3014,6 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
||||
auto result = rd.next(req.should_stop);
|
||||
if (result == nullptr) {
|
||||
SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
|
||||
GGML_ASSERT(req.should_stop());
|
||||
return false; // should_stop condition met
|
||||
}
|
||||
|
||||
@@ -3221,11 +3097,6 @@ void server_routes::init_routes() {
|
||||
|
||||
// get the result
|
||||
auto result = res->rd.next(req.should_stop);
|
||||
if (!result) {
|
||||
// connection was closed
|
||||
GGML_ASSERT(req.should_stop());
|
||||
return res;
|
||||
}
|
||||
|
||||
if (result->is_error()) {
|
||||
res->error(result->to_json());
|
||||
@@ -3326,11 +3197,6 @@ void server_routes::init_routes() {
|
||||
|
||||
// get the result
|
||||
auto result = res->rd.next(req.should_stop);
|
||||
if (!result) {
|
||||
// connection was closed
|
||||
GGML_ASSERT(req.should_stop());
|
||||
return res;
|
||||
}
|
||||
|
||||
if (result->is_error()) {
|
||||
res->error(result->to_json());
|
||||
@@ -3837,12 +3703,7 @@ void server_routes::init_routes() {
|
||||
}
|
||||
|
||||
// get the result
|
||||
auto result = rd.next(req.should_stop);
|
||||
if (!result) {
|
||||
// connection was closed
|
||||
GGML_ASSERT(req.should_stop());
|
||||
return res;
|
||||
}
|
||||
server_task_result_ptr result = rd.next(req.should_stop);
|
||||
|
||||
if (result->is_error()) {
|
||||
res->error(result->to_json());
|
||||
@@ -3871,12 +3732,7 @@ void server_routes::init_routes() {
|
||||
}
|
||||
|
||||
// get the result
|
||||
auto result = rd.next(req.should_stop);
|
||||
if (!result) {
|
||||
// connection was closed
|
||||
GGML_ASSERT(req.should_stop());
|
||||
return res;
|
||||
}
|
||||
server_task_result_ptr result = rd.next(req.should_stop);
|
||||
|
||||
if (result->is_error()) {
|
||||
res->error(result->to_json());
|
||||
@@ -3909,12 +3765,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_save(const ser
|
||||
rd.post_task(std::move(task));
|
||||
}
|
||||
|
||||
auto result = rd.next(req.should_stop);
|
||||
if (!result) {
|
||||
// connection was closed
|
||||
GGML_ASSERT(req.should_stop());
|
||||
return res;
|
||||
}
|
||||
server_task_result_ptr result = rd.next(req.should_stop);
|
||||
|
||||
if (result->is_error()) {
|
||||
res->error(result->to_json());
|
||||
@@ -3945,12 +3796,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_restore(const
|
||||
rd.post_task(std::move(task));
|
||||
}
|
||||
|
||||
auto result = rd.next(req.should_stop);
|
||||
if (!result) {
|
||||
// connection was closed
|
||||
GGML_ASSERT(req.should_stop());
|
||||
return res;
|
||||
}
|
||||
server_task_result_ptr result = rd.next(req.should_stop);
|
||||
|
||||
if (result->is_error()) {
|
||||
res->error(result->to_json());
|
||||
@@ -3972,12 +3818,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_erase(const se
|
||||
rd.post_task(std::move(task));
|
||||
}
|
||||
|
||||
auto result = rd.next(req.should_stop);
|
||||
if (!result) {
|
||||
// connection was closed
|
||||
GGML_ASSERT(req.should_stop());
|
||||
return res;
|
||||
}
|
||||
server_task_result_ptr result = rd.next(req.should_stop);
|
||||
|
||||
if (result->is_error()) {
|
||||
res->error(result->to_json());
|
||||
|
||||
Reference in New Issue
Block a user