sync from b7516

This commit is contained in:
2026-01-16 11:16:14 +08:00
parent f4ae4cc7da
commit 6ee41dd9e3
380 changed files with 18435 additions and 38806 deletions

View File

@@ -38,6 +38,14 @@ set(TARGET_SRCS
server-http.h
server-models.cpp
server-models.h
server-task.cpp
server-task.h
server-queue.cpp
server-queue.h
server-common.cpp
server-common.h
server-context.cpp
server-context.h
)
set(PUBLIC_ASSETS
index.html.gz

View File

@@ -33,7 +33,6 @@ For the ful list of features, please refer to [server's changelog](https://githu
| -------- | ----------- |
| `-h, --help, --usage` | print usage and exit |
| `--version` | show version and build info |
| `--license` | show source code license and dependencies |
| `-cl, --cache-list` | show list of models in cache |
| `--completion-bash` | print source-able bash completion script for llama.cpp |
| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
@@ -74,23 +73,22 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)<br/>(env: LLAMA_ARG_DIO) |
| `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
| `--list-devices` | print list of available devices and exit |
| `-ot, --override-tensor <tensor name pattern>=<buffer type>,...` | override tensor buffer type<br/>(env: LLAMA_ARG_OVERRIDE_TENSOR) |
| `-ot, --override-tensor <tensor name pattern>=<buffer type>,...` | override tensor buffer type |
| `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
| `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM (default: -1)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
| `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')<br/>(env: LLAMA_ARG_FIT) |
| `-fitt, --fit-target MiB0,MiB1,MiB2,...` | target margin per device for --fit, comma-separated list of values, single value is broadcast across all devices, default: 1024<br/>(env: LLAMA_ARG_FIT_TARGET) |
| `-fitt, --fit-target MiB` | target margin per device for --fit option, default: 1024<br/>(env: LLAMA_ARG_FIT_TARGET) |
| `-fitc, --fit-ctx N` | minimum ctx size that can be set by --fit option, default: 4096<br/>(env: LLAMA_ARG_FIT_CTX) |
| `--check-tensors` | check model tensor data for invalid values (default: false) |
| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, use comma-separated values.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false |
| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values or repeat this argument.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false |
| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) |
| `--lora FNAME` | path to LoRA adapter (use comma-separated values to load multiple adapters) |
| `--lora-scaled FNAME:SCALE,...` | path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)<br/>note: use comma-separated values |
@@ -130,8 +128,6 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) |
| `--adaptive-decay N` | adaptive-p: EMA decay for adaptation; effective history length ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99) |
| `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) |
| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
@@ -155,7 +151,6 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--grammar-file FNAME` | file to read grammar from |
| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
| `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
| `-bs, --backend-sampling` | enable backend sampling (experimental) (default: disabled)<br/>(env: LLAMA_ARG_BACKEND_SAMPLING) |
### Server-specific params
@@ -192,11 +187,11 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--webui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_WEBUI) |
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
| `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
| `--api-key KEY` | API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)<br/>(env: LLAMA_API_KEY) |
| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
| `--api-key-file FNAME` | path to file containing API keys (default: none) |
| `--ssl-key-file FNAME` | path to a file containing a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
| `--ssl-cert-file FNAME` | path to a file containing a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
| `--chat-template-kwargs STRING` | sets additional params for the json template parser<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
@@ -212,8 +207,8 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
@@ -225,7 +220,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.8)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
| `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible |
| `-mv, --model-vocoder FNAME` | vocoder model for audio generation (default: unused) |
@@ -784,8 +779,7 @@ By default, it is read-only. To make POST request to change global properties, y
"modalities": {
"vision": false
},
"build_info": "b(build number)-(build commit hash)",
"is_sleeping": false
"build_info": "b(build number)-(build commit hash)"
}
```
@@ -794,7 +788,6 @@ By default, it is read-only. To make POST request to change global properties, y
- `model_path` - the path to model file (same with `-m` argument)
- `chat_template` - the model's original Jinja2 prompt template
- `modalities` - the list of supported modalities
- `is_sleeping` - sleeping status, see [Sleeping on idle](#sleeping-on-idle)
### POST `/props`: Change server global properties.
@@ -1493,7 +1486,6 @@ The precedence rule for preset options is as follows:
We also offer additional options that are exclusive to presets (these aren't treated as command-line arguments):
- `load-on-startup` (boolean): Controls whether the model loads automatically when the server starts
- `stop-timeout` (int, seconds): After requested unload, wait for this many seconds before forcing termination (default: 10)
### Routing requests
@@ -1582,7 +1574,8 @@ Payload:
```json
{
"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
"extra_args": ["-n", "128", "--top-k", "4"]
}
```
@@ -1637,12 +1630,9 @@ The server supports an automatic sleep mode that activates after a specified per
When the server enters sleep mode, the model and its associated memory (including the KV cache) are unloaded from RAM to conserve resources. Any new incoming task will automatically trigger the model to reload.
The sleeping status can be retrieved from the `GET /props` endpoint (or `/props?model=(model_name)` in router mode).
Note that the following endpoints are exempt from being considered as incoming tasks. They do not trigger model reloading and do not reset the idle timer:
- `GET /health`
- `GET /props`
- `GET /models`
## More examples

Binary file not shown.

View File

@@ -1,10 +1,10 @@
#include "common.h"
#include "download.h"
#include "log.h"
#include "llama.h"
#include "mtmd.h"
#include "mtmd-helper.h"
#include "chat.h"
#include "arg.h" // for common_remote_get_content; TODO: use download.h only
#include "base64.hpp"
#include "server-common.h"
@@ -779,7 +779,7 @@ static void handle_media(
// download remote image
// TODO @ngxson : maybe make these params configurable
common_remote_params params;
params.headers.push_back({"User-Agent", "llama.cpp/" + build_info});
params.headers.push_back("User-Agent: llama.cpp/" + build_info);
params.max_size = 1024 * 1024 * 10; // 10MB
params.timeout = 10; // seconds
SRV_INF("downloading image from '%s'\n", url.c_str());
@@ -1385,21 +1385,16 @@ json format_response_rerank(
std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
std::vector<llama_token_data> cur;
const auto * logits = llama_get_logits_ith(ctx, idx);
const llama_token * sampled_ids = llama_get_sampled_candidates_ith(ctx, idx);
const int n_logits = llama_get_sampled_logits_count_ith(ctx, idx);
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
cur.resize(n_logits);
if (sampled_ids) {
for (int i = 0; i < n_logits; i++) {
cur[i] = llama_token_data{sampled_ids[i], logits[i], 0.0f};
}
} else {
for (llama_token token_id = 0; token_id < n_logits; token_id++) {
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
}
const int n_vocab = llama_vocab_n_tokens(vocab);
cur.resize(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
}
// sort tokens by logits

View File

@@ -4,6 +4,7 @@
#include "server-task.h"
#include "server-queue.h"
#include "arg.h"
#include "common.h"
#include "llama.h"
#include "log.h"
@@ -15,6 +16,7 @@
#include <cstddef>
#include <cinttypes>
#include <memory>
#include <unordered_set>
#include <filesystem>
// fix problem with std::min and std::max
@@ -45,6 +47,26 @@ enum server_state {
SERVER_STATE_READY, // Server is ready and model is loaded
};
// Reports whether a task of the given type needs embeddings:
// true for embedding and rerank tasks, false for every other task type.
static bool server_task_type_need_embd(server_task_type task_type) {
    const bool is_embedding = task_type == SERVER_TASK_TYPE_EMBEDDING;
    const bool is_rerank    = task_type == SERVER_TASK_TYPE_RERANK;

    return is_embedding || is_rerank;
}
// Reports whether a task of the given type needs logits:
// true for completion and infill tasks, false for every other task type.
static bool server_task_type_need_logits(server_task_type task_type) {
    const bool is_completion = task_type == SERVER_TASK_TYPE_COMPLETION;
    const bool is_infill     = task_type == SERVER_TASK_TYPE_INFILL;

    return is_completion || is_infill;
}
struct server_slot {
int id;
@@ -59,8 +81,6 @@ struct server_slot {
common_speculative * spec = nullptr;
// TODO: move members that belong to the task (such as `generated_text`, `has_new_line`) to task_results_state
// see https://github.com/ggml-org/llama.cpp/pull/18283#issuecomment-3710175837
std::unique_ptr<const server_task> task;
std::unique_ptr<const server_task> task_prev; // used for debugging
@@ -127,17 +147,6 @@ struct server_slot {
return res;
}
// Forgets the prompt held by this slot: calls llama_memory_seq_rm for the
// slot's id with both positions set to -1 (presumably "the whole sequence" -
// confirm against llama.h) and then empties the cached prompt token list.
//
// allow_processing: when false, the slot is asserted to be idle first.
void prompt_clear(bool allow_processing) {
    const bool must_be_idle = !allow_processing;
    if (must_be_idle) {
        GGML_ASSERT(!is_processing());
    }

    // log the token count before it is wiped
    const auto n_cached = prompt.tokens.size();
    SLT_INF(*this, "clearing prompt with %zu tokens\n", n_cached);

    auto mem = llama_get_memory(ctx);
    llama_memory_seq_rm(mem, id, -1, -1);

    prompt.tokens.clear();
}
std::vector<common_adapter_lora_info> lora;
int32_t alora_invocation_start = -1;
@@ -146,7 +155,7 @@ struct server_slot {
common_sampler_ptr smpl;
llama_token sampled; // in speculative mode, this is the last accepted token
llama_token sampled; // in speculative mode, this is the last accepted token
llama_tokens drafted;
// stats
@@ -158,7 +167,7 @@ struct server_slot {
double t_prompt_processing; // ms
double t_token_generation; // ms
std::function<void(int /* slot_id */)> callback_on_release;
std::function<void(int)> callback_on_release;
// Speculative decoding stats
int32_t n_draft_total = 0; // Total draft tokens generated
@@ -187,46 +196,30 @@ struct server_slot {
n_draft_total = 0;
n_draft_accepted = 0;
task_prev = std::move(task);
task.reset();
llama_set_sampler(ctx, id, nullptr);
task_prev.reset();
// clear alora start
alora_invocation_start = -1;
}
void init_sampler() const {
common_sampler_reset(smpl.get());
bool need_embd() const {
GGML_ASSERT(task);
if (!task->need_sampling()) {
return;
}
return server_task_type_need_embd(task->type);
}
const int64_t t_start = ggml_time_us();
bool need_logits() const {
GGML_ASSERT(task);
int n_text = 0;
for (int i = 0; i < (int) prompt.tokens.size(); i++) {
const llama_token id = prompt.tokens[i];
if (id != LLAMA_TOKEN_NULL) {
common_sampler_accept(smpl.get(), id, false);
n_text++;
}
}
SLT_INF(*this, "init sampler, took %0.2f ms, tokens: text = %d, total = %d\n",
(ggml_time_us() - t_start) / 1000.0, n_text, (int) prompt.tokens.size());
return server_task_type_need_logits(task->type);
}
// if the context does not have a memory module then all embeddings have to be computed within a single ubatch
// also we cannot split if the pooling would require any past tokens
bool can_split() const {
GGML_ASSERT(task);
return
!task->need_embd() ||
!need_embd() ||
(llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
}
@@ -267,13 +260,10 @@ struct server_slot {
SLT_WRN(*this, "%s", "slot is not processing\n");
return;
}
generated_token_probs.push_back(token);
}
int get_n_draft_max() const {
GGML_ASSERT(task);
if (!can_speculate()) {
return 0;
}
@@ -298,23 +288,27 @@ struct server_slot {
return n_draft_max;
}
// note: a slot can also be either a parent or a child
// True while the slot is processing a task whose n_children is positive.
// The task pointer is only dereferenced once is_processing() holds.
bool is_parent() const {
    if (!is_processing()) {
        return false;
    }

    return task->n_children > 0;
}
// True while the slot is processing a task whose id_parent is non-negative.
// The task pointer is only dereferenced once is_processing() holds.
bool is_child() const {
    if (!is_processing()) {
        return false;
    }

    return task->id_parent >= 0;
}
void release() {
if (is_processing()) {
GGML_ASSERT(task);
SLT_INF(*this, "stop processing: n_tokens = %d, truncated = %d\n", prompt.n_tokens(), truncated);
t_last_used = ggml_time_us();
t_last_used = ggml_time_us();
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
state = SLOT_STATE_IDLE;
// do not keep context of the child slots - the parent's context is enough
if (task->is_child()) {
prompt_clear(false);
}
reset();
task_prev = std::move(task);
task.reset();
callback_on_release(id);
}
@@ -433,22 +427,14 @@ struct server_slot {
}
void copy_state_to(server_slot & other) const {
GGML_ASSERT(state == SLOT_STATE_DONE_PROMPT);
llama_memory_seq_rm(llama_get_memory(ctx), other.id, -1, -1);
llama_memory_seq_cp(llama_get_memory(ctx), id, other.id, -1, -1);
llama_memory_seq_rm(llama_get_memory(ctx), other.id, 0, -1);
llama_memory_seq_cp(llama_get_memory(ctx), id, other.id, 0, -1);
other.n_decoded = n_decoded;
other.n_remaining = n_remaining;
other.i_batch = i_batch;
other.t_start_process_prompt = t_start_process_prompt;
other.t_prompt_processing = t_prompt_processing;
other.n_prompt_tokens_cache = n_prompt_tokens_cache;
other.n_prompt_tokens_processed = n_prompt_tokens_processed;
other.prompt = prompt.clone();
other.init_sampler();
}
};
@@ -761,8 +747,6 @@ private:
}
slots.clear();
// initialize slots
for (int i = 0; i < params_base.n_parallel; i++) {
server_slot slot;
@@ -794,8 +778,8 @@ private:
SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx);
slot.callback_on_release = [this](int slot_id) {
queue_tasks.pop_deferred_task(slot_id);
slot.callback_on_release = [this](int) {
queue_tasks.pop_deferred_task();
};
slot.reset();
@@ -909,9 +893,9 @@ private:
return true;
}
server_slot * get_slot_by_id(int id_slot) {
server_slot * get_slot_by_id(int id) {
for (server_slot & slot : slots) {
if (slot.id == id_slot) {
if (slot.id == id) {
return &slot;
}
}
@@ -1011,7 +995,7 @@ private:
ret->prompt_save(*prompt_cache);
if (!ret->prompt_load(*prompt_cache, task.tokens)) {
ret->prompt_clear(false);
clear_slot(*ret);
}
prompt_cache->update();
@@ -1023,6 +1007,15 @@ private:
return ret;
}
// Drops what is cached for an idle slot: calls llama_memory_seq_rm for the
// slot's id with both positions set to -1 (presumably "the whole sequence" -
// confirm against llama.h) and then empties the slot's prompt token list.
// The slot must not be processing a task (asserted).
void clear_slot(server_slot & slot) const {
    GGML_ASSERT(!slot.is_processing());

    // log the token count before it is wiped
    const auto n_cached = slot.prompt.tokens.size();
    SLT_WRN(slot, "clearing slot with %zu tokens\n", n_cached);

    auto mem = llama_get_memory(ctx);
    llama_memory_seq_rm(mem, slot.id, -1, -1);

    slot.prompt.tokens.clear();
}
// return true if at least one slot has been cleared
// TODO: improve logic
// - smarter decision which slot to clear (LRU or longest prompt?)
@@ -1043,7 +1036,7 @@ private:
if (slot.prompt.n_tokens() > 0) {
SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size());
slot.prompt_clear(false);
clear_slot(slot);
res = true;
@@ -1069,6 +1062,8 @@ private:
}
bool launch_slot_with_task(server_slot & slot, server_task && task) {
slot.reset();
// process per-request lora adapters
if (!task.params.lora.empty()) {
auto task_loras = construct_lora_list(task.params.lora);
@@ -1142,7 +1137,7 @@ private:
SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());
// initialize samplers
if (task.need_sampling()) {
{
slot.smpl.reset(common_sampler_init(model, task.params.sampling));
if (slot.smpl == nullptr) {
@@ -1151,28 +1146,7 @@ private:
return false;
}
const bool need_logits = task.params.sampling.n_probs > 0;
bool backend_sampling = true;
backend_sampling &= task.params.sampling.backend_sampling;
// TODO: speculative decoding requires multiple samples per batch - not supported yet
backend_sampling &= !(slot.ctx_dft && task.params.speculative.n_max > 0);
// TODO: getting post/pre sampling logits is not yet supported with backend sampling
backend_sampling &= !need_logits;
// TODO: tmp until backend sampling is fully implemented
if (backend_sampling) {
llama_set_sampler(ctx, slot.id, common_sampler_get(slot.smpl.get()));
} else {
llama_set_sampler(ctx, slot.id, nullptr);
}
SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str());
} else {
slot.smpl.reset();
}
// initialize draft batch
@@ -1185,11 +1159,12 @@ private:
slot.task = std::make_unique<const server_task>(std::move(task));
slot.state = slot.task->is_child()
slot.state = slot.is_child()
? SLOT_STATE_WAIT_OTHER // wait for the parent to process prompt
: SLOT_STATE_STARTED;
SLT_INF(slot, "processing task, is_child = %d\n", slot.task->is_child());
SLT_INF(slot, "%s", "processing task\n");
return true;
}
@@ -1509,9 +1484,9 @@ private:
res->n_tokens = slot.task->n_tokens();
res->res_type = slot.task->params.res_type;
const int n_embd_out = llama_model_n_embd_out(model);
const int n_embd = llama_model_n_embd(model);
std::vector<float> embd_res(n_embd_out, 0.0f);
std::vector<float> embd_res(n_embd, 0.0f);
for (int i = 0; i < batch.n_tokens; ++i) {
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
@@ -1528,18 +1503,18 @@ private:
if (embd == nullptr) {
SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
res->embedding.push_back(std::vector<float>(n_embd_out, 0.0f));
res->embedding.push_back(std::vector<float>(n_embd, 0.0f));
continue;
}
// normalize only when there is pooling
if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) {
common_embd_normalize(embd, embd_res.data(), n_embd_out, slot.task->params.embd_normalize);
common_embd_normalize(embd, embd_res.data(), n_embd, slot.task->params.embd_normalize);
res->embedding.push_back(embd_res);
break;
}
res->embedding.emplace_back(embd, embd + n_embd_out);
res->embedding.emplace_back(embd, embd + n_embd);
}
SLT_DBG(slot, "%s", "sending embeddings\n");
@@ -1584,7 +1559,9 @@ private:
// tokenize the input if it's set by CLI, return false on error
bool tokenize_cli_input(server_task & task) {
GGML_ASSERT(task.cli_input != nullptr);
if (task.cli_input == nullptr) {
return true; // nothing to do
}
try {
auto & opt = oai_parser_opt;
common_chat_templates_inputs inputs;
@@ -1618,64 +1595,6 @@ private:
return true;
}
// Collects at most n_slots_needed slots that are not processing a task,
// skipping the slot whose id equals exclude_id_slot.
//
// Slots are visited in the order they are stored in `slots`. The result holds
// fewer than n_slots_needed entries when not enough free slots exist, and is
// always empty when n_slots_needed is 0.
std::vector<server_slot *> get_free_slots(size_t n_slots_needed, int exclude_id_slot) {
    std::vector<server_slot *> free_slots;

    for (auto & slot : slots) {
        // check the quota BEFORE adding a slot, so the result can never exceed
        // the requested count (in particular, a request for 0 slots yields none)
        if (free_slots.size() >= n_slots_needed) {
            break;
        }

        if (!slot.is_processing() && slot.id != exclude_id_slot) {
            free_slots.push_back(&slot);
        }
    }

    return free_slots;
}
// launch multiple slots for parent + child tasks
//
// parent_slot : slot that receives the parent task; asserted to be idle
// child_slots : one slot per child task; child_slots[i] receives parent_task.child_tasks[i]
// parent_task : task for which is_parent() holds (asserted); it and its child tasks are moved from
//
// returns true when every child task and the parent task were launched.
// on the first launch failure, every processing slot that belongs to this parent
// (the parent itself or any of its children) is released and false is returned.
bool launch_slots_with_parent_task(server_slot & parent_slot, std::vector<server_slot *> & child_slots, server_task && parent_task) {
GGML_ASSERT(!parent_slot.is_processing());
GGML_ASSERT(parent_task.is_parent());
GGML_ASSERT(child_slots.size() == parent_task.child_tasks.size());
// remember the id now: parent_task is moved from further below
int id_parent = parent_task.id;
SRV_INF("launching slots for parent task id_task = %d with %zu child tasks\n", id_parent, parent_task.child_tasks.size());
// to be called in case of failure to release all launched slots
// (matches slots by task id, so it covers the parent as well as its children)
auto release_slots = [this, id_parent]() {
for (auto & slot : slots) {
if (slot.is_processing() && (
slot.task->id == id_parent ||
slot.task->id_parent == id_parent
)) {
slot.release();
}
}
};
// launch all child tasks first
size_t idx = 0;
// NOTE: repeats the size check made at the top of the function
GGML_ASSERT(child_slots.size() == parent_task.child_tasks.size());
for (auto * slot : child_slots) {
// read the id before the child task is moved into the slot
int id_child = parent_task.child_tasks[idx].id;
if (!launch_slot_with_task(*slot, std::move(parent_task.child_tasks[idx]))) {
SRV_ERR("failed to launch slot with child task, id_task = %d\n", id_child);
release_slots();
return false;
}
idx++;
}
// finally, launch the parent task
if (!launch_slot_with_task(parent_slot, std::move(parent_task))) {
SRV_ERR("failed to launch slot with task, id_task = %d\n", id_parent);
release_slots();
return false;
}
return true;
}
void process_single_task(server_task && task) {
switch (task.type) {
case SERVER_TASK_TYPE_COMPLETION:
@@ -1683,55 +1602,31 @@ private:
case SERVER_TASK_TYPE_EMBEDDING:
case SERVER_TASK_TYPE_RERANK:
{
// special case: if input is provided via CLI, tokenize it first
// otherwise, no need to tokenize as it's already done inside the HTTP thread
if (task.cli_input != nullptr) {
if (!tokenize_cli_input(task)) {
break;
}
if (!tokenize_cli_input(task)) {
break;
}
const int id_slot = task.id_slot;
const int id_task = task.id;
server_slot * slot = id_slot != -1
? get_slot_by_id(id_slot)
: get_available_slot(task);
//
// slot scheduling logic
//
server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
if (slot == nullptr) {
// if no slot is available, we defer this task for processing later
SRV_DBG("no slot is available, defer task, id_task = %d\n", id_task);
SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
queue_tasks.defer(std::move(task));
break;
}
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", id_task);
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
queue_tasks.defer(std::move(task));
break;
}
if (task.is_parent()) {
// try getting free slots for all child tasks
size_t n_child_tasks = task.child_tasks.size();
std::vector<server_slot *> child_slots = get_free_slots(n_child_tasks, slot->id);
if (child_slots.size() < n_child_tasks) {
SRV_DBG("not enough free slots for child tasks, n_free = %zu, n_children = %zu, defer task, id_task = %d\n", child_slots.size(), n_child_tasks, id_task);
queue_tasks.defer(std::move(task));
break;
}
if (!launch_slots_with_parent_task(*slot, child_slots, std::move(task))) {
SRV_ERR("failed to launch slot with parent task, id_task = %d\n", id_task);
break; // drop the task
}
} else if (!launch_slot_with_task(*slot, std::move(task))) {
SRV_ERR("failed to launch slot with task, id_task = %d\n", id_task);
break; // drop the task
if (!launch_slot_with_task(*slot, std::move(task))) {
SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
break;
}
} break;
case SERVER_TASK_TYPE_CANCEL:
@@ -1905,7 +1800,7 @@ private:
// Erase token cache
const size_t n_erased = slot->prompt.tokens.size();
slot->prompt_clear(false);
clear_slot(*slot);
auto res = std::make_unique<server_task_result_slot_erase>();
res->id = task.id;
@@ -2000,7 +1895,7 @@ private:
GGML_ABORT("not supported by multimodal");
}
if (slot.task->is_parent() || slot.task->is_child()) {
if (slot.is_parent() || slot.is_child()) {
send_error(slot, "context shift cannot be used for shared prompt", ERROR_TYPE_SERVER);
slot.release();
continue;
@@ -2139,12 +2034,6 @@ private:
continue;
}
// check if this is a child slot
if (slot.state == SLOT_STATE_WAIT_OTHER) {
SLT_DBG(slot, "%s", "waiting for parent slot to complete\n");
continue;
}
// this slot still has a prompt to be processed
if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
const auto & input_tokens = slot.task->tokens;
@@ -2187,7 +2076,7 @@ private:
}
// TODO: support memory-less logits computation
if (slot.task->need_logits() && !llama_get_memory(ctx)) {
if (slot.need_logits() && !llama_get_memory(ctx)) {
send_error(slot, "the current context does not logits computation. skipping", ERROR_TYPE_SERVER);
slot.release();
continue;
@@ -2424,12 +2313,6 @@ private:
slot.n_prompt_tokens_processed = 0;
slot.prompt.tokens.keep_first(n_past);
// send initial 0% progress update if needed
// this is to signal the client that the request has started processing
if (slot.task->params.stream && slot.task->params.return_progress) {
send_partial_response(slot, {}, true);
}
}
if (!slot.can_split()) {
@@ -2447,7 +2330,7 @@ private:
if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
slot.prompt_clear(true);
clear_slot(slot);
// there is no common part left
slot.n_prompt_tokens_cache = 0;
@@ -2526,7 +2409,7 @@ private:
cur_tok,
slot.prompt.tokens.pos_next(),
{ slot.id },
slot.task->need_embd());
slot.need_embd());
slot.prompt.tokens.push_back(cur_tok);
slot.n_prompt_tokens_processed++;
@@ -2547,6 +2430,16 @@ private:
GGML_ASSERT(batch.n_tokens > 0);
common_sampler_reset(slot.smpl.get());
// Process all prompt tokens through sampler system
for (int i = 0; i < slot.task->n_tokens(); ++i) {
llama_token id = input_tokens[i];
if (id != LLAMA_TOKEN_NULL) {
common_sampler_accept(slot.smpl.get(), id, false);
}
}
// extract the logits only for the last token
batch.logits[batch.n_tokens - 1] = true;
@@ -2555,8 +2448,6 @@ private:
SLT_INF(slot, "prompt done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens);
slot.init_sampler();
const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id);
@@ -2603,6 +2494,11 @@ private:
}
}
if (batch.n_tokens == 0) {
SRV_WRN("%s", "no tokens to decode\n");
return;
}
SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);
if (slot_batched) {
@@ -2616,11 +2512,7 @@ private:
slot_batched->lora[alora_disabled_id].scale = alora_scale;
}
llama_set_embeddings(ctx, slot_batched->task->need_embd());
}
if (batch.n_tokens == 0) {
SRV_WRN("%s", "no tokens to decode\n");
llama_set_embeddings(ctx, slot_batched->need_embd());
}
int32_t i_next = 0;
@@ -2674,7 +2566,7 @@ private:
// note: it's complicated to keep track of how much of the current batch has been
// processed before the error occurred, so we simply clear the entire context
slot.prompt_clear(false);
clear_slot(slot);
}
}
@@ -2698,30 +2590,31 @@ private:
// on successful decode, restore the original batch size
n_batch = llama_n_batch(ctx);
// handle `n_cmpl > 1` tasks - when the main prompt is processed, activate all child tasks too
// technically, measuring the time here excludes the sampling time for the last batch
// but on the other hand, we don't want to do too many system calls to measure the time, so it's ok
const int64_t t_current = ggml_time_us();
for (auto & slot : slots) {
if (slot.state == SLOT_STATE_DONE_PROMPT && slot.task->is_parent()) {
std::vector<server_slot *> children;
// may need to copy state to other slots
if (slot.state == SLOT_STATE_DONE_PROMPT && slot.is_parent()) {
std::vector<server_slot *> child_slots;
for (auto & other : slots) {
if (other.state == SLOT_STATE_WAIT_OTHER && slot.task->id == other.task->id_parent) {
children.push_back(&other);
child_slots.push_back(&other);
}
}
// all child slots should already have been launched by launch_slots_with_parent_task()
// copy state to the child slots
for (auto & child : children) {
SLT_INF(slot, " - copying state to child %d\n", child->id);
GGML_ASSERT(child->state == SLOT_STATE_WAIT_OTHER);
slot.copy_state_to(*child);
child->state = SLOT_STATE_DONE_PROMPT;
// we can only proceed if all child slots have the correct tasks
if (child_slots.size() == slot.task->n_children) {
// copy state to the child slots
for (auto & child : child_slots) {
SLT_INF(slot, "copying state to child %d\n", child->id);
slot.copy_state_to(*child);
child->state = SLOT_STATE_DONE_PROMPT;
}
}
}
}
for (auto & slot : slots) {
// optionally send prompt processing progress
if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_DONE_PROMPT) {
if (slot.task->params.stream && slot.task->params.return_progress) {
@@ -2749,8 +2642,6 @@ private:
continue; // continue loop of slots
}
GGML_ASSERT(slot.task->need_sampling());
// prompt evaluated for next-token prediction
slot.state = SLOT_STATE_GENERATING;
} else if (slot.state != SLOT_STATE_GENERATING) {
@@ -2769,9 +2660,6 @@ private:
common_sampler_accept(slot.smpl.get(), id, true);
// here we have synchronized the llama_context (due to the sampling above), so we can do time measurement
const int64_t t_current = ggml_time_us();
slot.n_decoded += 1;
if (slot.n_decoded == 1) {
@@ -2808,15 +2696,13 @@ private:
continue;
}
const size_t n_draft = slot.drafted.size();
size_t n_draft = slot.drafted.size();
// the accepted tokens from the speculation
const auto ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted);
slot.i_batch_dft.clear();
slot.drafted.clear();
const int64_t t_current = ggml_time_us();
slot.n_decoded += ids.size();
slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
@@ -2898,12 +2784,6 @@ server_response_reader server_context::get_response_reader() {
server_context_meta server_context::get_meta() const {
auto tool_use_src = common_chat_templates_source(impl->chat_templates.get(), "tool_use");
auto bos_id = llama_vocab_bos(impl->vocab);
auto eos_id = llama_vocab_eos(impl->vocab);
auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : "";
auto eos_token_str = eos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, eos_id, true) : "";
return server_context_meta {
/* build_info */ build_info,
/* model_name */ impl->model_name,
@@ -2918,8 +2798,8 @@ server_context_meta server_context::get_meta() const {
/* chat_template */ common_chat_templates_source(impl->chat_templates.get()),
/* chat_template_tool_use */ tool_use_src ? tool_use_src : "",
/* bos_token_str */ bos_token_str,
/* eos_token_str */ eos_token_str,
/* bos_token_str */ common_token_to_piece(impl->ctx, llama_vocab_bos(impl->vocab), true),
/* eos_token_str */ common_token_to_piece(impl->ctx, llama_vocab_eos(impl->vocab), true),
/* fim_pre_token */ llama_vocab_fim_pre(impl->vocab),
/* fim_sub_token */ llama_vocab_fim_suf(impl->vocab),
/* fim_mid_token */ llama_vocab_fim_mid(impl->vocab),
@@ -2992,9 +2872,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
// Everything else, including multimodal completions.
inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
}
// tasks.reserve(inputs.size()); // TODO: this is inaccurate due to child tasks
tasks.reserve(inputs.size());
for (size_t i = 0; i < inputs.size(); i++) {
server_task task = server_task(type);
@@ -3013,11 +2891,13 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
task.params.oaicompat_cmpl_id = completion_id;
task.params.oaicompat_model = meta->model_name;
// prepare child tasks
if (task.params.n_cmpl > 1) {
int n_children = task.params.n_cmpl - 1;
for (int j = 0; j < n_children; j++) {
task.add_child(task.id, rd.get_new_id());
task.n_children = task.params.n_cmpl - 1;
for (size_t j = 0; j < task.n_children; j++) {
server_task child = task.create_child(
task.id,
rd.get_new_id());
tasks.push_back(std::move(child));
}
}
@@ -3066,22 +2946,19 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
// in streaming mode, the first error must be treated as non-stream response
// this is to match the OAI API behavior
// ref: https://github.com/ggml-org/llama.cpp/pull/16486#discussion_r2419657309
auto first_result = rd.next(req.should_stop);
server_task_result_ptr first_result = rd.next(req.should_stop);
if (first_result == nullptr) {
GGML_ASSERT(req.should_stop());
return res; // connection is closed
}
if (first_result->is_error()) {
} else if (first_result->is_error()) {
res->error(first_result->to_json());
return res;
} else {
GGML_ASSERT(
dynamic_cast<server_task_result_cmpl_partial*>(first_result.get()) != nullptr
|| dynamic_cast<server_task_result_cmpl_final*>(first_result.get()) != nullptr
);
}
GGML_ASSERT(
dynamic_cast<server_task_result_cmpl_partial*>(first_result.get()) != nullptr ||
dynamic_cast<server_task_result_cmpl_final*> (first_result.get()) != nullptr
);
// next responses are streamed
// to be sent immediately
json first_result_json = first_result->to_json();
@@ -3137,7 +3014,6 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
auto result = rd.next(req.should_stop);
if (result == nullptr) {
SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
GGML_ASSERT(req.should_stop());
return false; // should_stop condition met
}
@@ -3221,11 +3097,6 @@ void server_routes::init_routes() {
// get the result
auto result = res->rd.next(req.should_stop);
if (!result) {
// connection was closed
GGML_ASSERT(req.should_stop());
return res;
}
if (result->is_error()) {
res->error(result->to_json());
@@ -3326,11 +3197,6 @@ void server_routes::init_routes() {
// get the result
auto result = res->rd.next(req.should_stop);
if (!result) {
// connection was closed
GGML_ASSERT(req.should_stop());
return res;
}
if (result->is_error()) {
res->error(result->to_json());
@@ -3837,12 +3703,7 @@ void server_routes::init_routes() {
}
// get the result
auto result = rd.next(req.should_stop);
if (!result) {
// connection was closed
GGML_ASSERT(req.should_stop());
return res;
}
server_task_result_ptr result = rd.next(req.should_stop);
if (result->is_error()) {
res->error(result->to_json());
@@ -3871,12 +3732,7 @@ void server_routes::init_routes() {
}
// get the result
auto result = rd.next(req.should_stop);
if (!result) {
// connection was closed
GGML_ASSERT(req.should_stop());
return res;
}
server_task_result_ptr result = rd.next(req.should_stop);
if (result->is_error()) {
res->error(result->to_json());
@@ -3909,12 +3765,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_save(const ser
rd.post_task(std::move(task));
}
auto result = rd.next(req.should_stop);
if (!result) {
// connection was closed
GGML_ASSERT(req.should_stop());
return res;
}
server_task_result_ptr result = rd.next(req.should_stop);
if (result->is_error()) {
res->error(result->to_json());
@@ -3945,12 +3796,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_restore(const
rd.post_task(std::move(task));
}
auto result = rd.next(req.should_stop);
if (!result) {
// connection was closed
GGML_ASSERT(req.should_stop());
return res;
}
server_task_result_ptr result = rd.next(req.should_stop);
if (result->is_error()) {
res->error(result->to_json());
@@ -3972,12 +3818,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_erase(const se
rd.post_task(std::move(task));
}
auto result = rd.next(req.should_stop);
if (!result) {
// connection was closed
GGML_ASSERT(req.should_stop());
return res;
}
server_task_result_ptr result = rd.next(req.should_stop);
if (result->is_error()) {
res->error(result->to_json());

View File

@@ -21,13 +21,11 @@
#ifdef _WIN32
#include <winsock2.h>
#include <windows.h>
#else
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>
extern char **environ;
#endif
#if defined(__APPLE__) && defined(__MACH__)
@@ -36,8 +34,6 @@ extern char **environ;
#include <limits.h>
#endif
#define DEFAULT_STOP_TIMEOUT 10 // seconds
#define CMD_ROUTER_TO_CHILD_EXIT "cmd_router_to_child:exit"
#define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready"
@@ -101,49 +97,6 @@ static void unset_reserved_args(common_preset & preset, bool unset_model_args) {
}
}
#ifdef _WIN32
// Convert a NUL-terminated wide-character string to a UTF-8 encoded std::string.
// Returns an empty string when `ws` is null, points at an empty string,
// or when the size query reports 0 bytes.
static std::string wide_to_utf8(const wchar_t * ws) {
    std::string out;
    if (ws == nullptr || *ws == L'\0') {
        return out;
    }

    const int n_wide = static_cast<int>(std::wcslen(ws));

    // first pass: no output buffer, only ask how many bytes the conversion needs
    const int n_bytes = WideCharToMultiByte(CP_UTF8, 0, ws, n_wide, nullptr, 0, nullptr, nullptr);
    if (n_bytes != 0) {
        // second pass: convert straight into the string's storage
        out.resize(n_bytes, '\0');
        WideCharToMultiByte(CP_UTF8, 0, ws, n_wide, out.data(), n_bytes, nullptr, nullptr);
    }

    return out;
}
#endif
// Collect the environment of the current process as a list of strings.
// - Windows: walks the block returned by GetEnvironmentStringsW() and converts
//   every entry with wide_to_utf8()
// - elsewhere: copies every entry of the global `environ` array
// Returns an empty list when the environment is not available.
static std::vector<std::string> get_environment() {
    std::vector<std::string> vars;
#ifdef _WIN32
    LPWCH block = GetEnvironmentStringsW();
    if (block != nullptr) {
        // the block is a run of NUL-terminated strings; an empty string marks its end
        LPWCH cur = block;
        while (*cur) {
            vars.emplace_back(wide_to_utf8(cur));
            cur += wcslen(cur) + 1;
        }
        FreeEnvironmentStringsW(block);
    }
#else
    if (environ != nullptr) {
        // `environ` is terminated by a null pointer
        for (char ** it = environ; *it != nullptr; ++it) {
            vars.emplace_back(*it);
        }
    }
#endif
    return vars;
}
void server_model_meta::update_args(common_preset_context & ctx_preset, std::string bin_path) {
// update params
unset_reserved_args(preset, false);
@@ -162,11 +115,14 @@ void server_model_meta::update_args(common_preset_context & ctx_preset, std::str
server_models::server_models(
const common_params & params,
int argc,
char ** argv)
char ** argv,
char ** envp)
: ctx_preset(LLAMA_EXAMPLE_SERVER),
base_params(params),
base_env(get_environment()),
base_preset(ctx_preset.load_from_args(argc, argv)) {
for (char ** env = envp; *env != nullptr; env++) {
base_env.push_back(std::string(*env));
}
// clean up base preset
unset_reserved_args(base_preset, true);
// set binary path
@@ -247,14 +203,13 @@ void server_models::load_models() {
// convert presets to server_model_meta and add to mapping
for (const auto & preset : final_presets) {
server_model_meta meta{
/* preset */ preset.second,
/* name */ preset.first,
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* exit_code */ 0,
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
/* preset */ preset.second,
/* name */ preset.first,
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* exit_code */ 0
};
add_model(std::move(meta));
}
@@ -272,20 +227,6 @@ void server_models::load_models() {
}
}
// handle custom stop-timeout option
for (auto & [name, inst] : mapping) {
std::string val;
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_STOP_TIMEOUT, val)) {
try {
inst.meta.stop_timeout = std::stoi(val);
} catch (...) {
SRV_WRN("invalid stop-timeout value '%s' for model '%s', using default %d seconds\n",
val.c_str(), name.c_str(), DEFAULT_STOP_TIMEOUT);
inst.meta.stop_timeout = DEFAULT_STOP_TIMEOUT;
}
}
}
// load any autoload models
std::vector<std::string> models_to_load;
for (const auto & [name, inst] : mapping) {
@@ -421,7 +362,7 @@ void server_models::unload_lru() {
int64_t lru_last_used = ggml_time_ms();
size_t count_active = 0;
{
std::unique_lock<std::mutex> lk(mutex);
std::lock_guard<std::mutex> lk(mutex);
for (const auto & m : mapping) {
if (m.second.meta.is_active()) {
count_active++;
@@ -435,13 +376,6 @@ void server_models::unload_lru() {
if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) {
SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str());
unload(lru_model_name);
// wait for unload to complete
{
std::unique_lock<std::mutex> lk(mutex);
cv.wait(lk, [this, &lru_model_name]() {
return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
});
}
}
}
@@ -502,83 +436,38 @@ void server_models::load(const std::string & name) {
// start a thread to manage the child process
// captured variables are guaranteed to be destroyed only after the thread is joined
inst.th = std::thread([this, name, child_proc = inst.subproc, port = inst.meta.port, stop_timeout = inst.meta.stop_timeout]() {
FILE * stdin_file = subprocess_stdin(child_proc.get());
FILE * stdout_file = subprocess_stdout(child_proc.get()); // combined stdout/stderr
std::thread log_thread([&]() {
// read stdout/stderr and forward to main server log
// also handle status report from child process
bool state_received = false; // true if child state received
if (stdout_file) {
char buffer[4096];
while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) {
LOG("[%5d] %s", port, buffer);
if (!state_received && std::strstr(buffer, CMD_CHILD_TO_ROUTER_READY) != nullptr) {
// child process is ready
this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0);
state_received = true;
}
inst.th = std::thread([this, name, child_proc = inst.subproc, port = inst.meta.port]() {
// read stdout/stderr and forward to main server log
bool state_received = false; // true if child state received
FILE * p_stdout_stderr = subprocess_stdout(child_proc.get());
if (p_stdout_stderr) {
char buffer[4096];
while (fgets(buffer, sizeof(buffer), p_stdout_stderr) != nullptr) {
LOG("[%5d] %s", port, buffer);
if (!state_received && std::strstr(buffer, CMD_CHILD_TO_ROUTER_READY) != nullptr) {
// child process is ready
this->update_status(name, SERVER_MODEL_STATUS_LOADED);
state_received = true;
}
} else {
SRV_ERR("failed to get stdout/stderr of child process for name=%s\n", name.c_str());
}
});
std::thread stopping_thread([&]() {
// thread to monitor stopping signal
auto is_stopping = [this, &name]() {
return this->stopping_models.find(name) != this->stopping_models.end();
};
{
std::unique_lock<std::mutex> lk(this->mutex);
this->cv_stop.wait(lk, is_stopping);
}
SRV_INF("stopping model instance name=%s\n", name.c_str());
// send interrupt to child process
fprintf(stdin_file, "%s\n", CMD_ROUTER_TO_CHILD_EXIT);
fflush(stdin_file);
// wait to stop gracefully or timeout
int64_t start_time = ggml_time_ms();
while (true) {
std::unique_lock<std::mutex> lk(this->mutex);
if (!is_stopping()) {
return; // already stopped
}
int64_t elapsed = ggml_time_ms() - start_time;
if (elapsed >= stop_timeout * 1000) {
// timeout, force kill
SRV_WRN("force-killing model instance name=%s after %d seconds timeout\n", name.c_str(), stop_timeout);
subprocess_terminate(child_proc.get());
return;
}
this->cv_stop.wait_for(lk, std::chrono::seconds(1));
}
});
} else {
SRV_ERR("failed to get stdout/stderr of child process for name=%s\n", name.c_str());
}
// we reach here when the child process exits
// note: we cannot join() prior to this point because it will close stdin_file
if (log_thread.joinable()) {
log_thread.join();
}
// stop the timeout monitoring thread
{
std::lock_guard<std::mutex> lk(this->mutex);
stopping_models.erase(name);
cv_stop.notify_all();
}
if (stopping_thread.joinable()) {
stopping_thread.join();
}
// get the exit code
int exit_code = 0;
subprocess_join(child_proc.get(), &exit_code);
subprocess_destroy(child_proc.get());
// update status and exit code
this->update_status(name, SERVER_MODEL_STATUS_UNLOADED, exit_code);
// update PID and status
{
std::lock_guard<std::mutex> lk(mutex);
auto it = mapping.find(name);
if (it != mapping.end()) {
auto & meta = it->second.meta;
meta.exit_code = exit_code;
meta.status = SERVER_MODEL_STATUS_UNLOADED;
}
cv.notify_all();
}
SRV_INF("instance name=%s exited with status %d\n", name.c_str(), exit_code);
});
@@ -599,14 +488,22 @@ void server_models::load(const std::string & name) {
cv.notify_all();
}
// Ask a child process to exit gracefully by writing the router-to-child exit
// command to its stdin stream. Does nothing when `stdin_file` is null.
static void interrupt_subprocess(FILE * stdin_file) {
    if (stdin_file == nullptr) {
        return;
    }
    // subprocess.h offers no way to deliver SIGINT, so the request is sent
    // as a text command that the child process reads from its stdin
    fprintf(stdin_file, "%s\n", CMD_ROUTER_TO_CHILD_EXIT);
    fflush(stdin_file);
}
void server_models::unload(const std::string & name) {
std::lock_guard<std::mutex> lk(mutex);
auto it = mapping.find(name);
if (it != mapping.end()) {
if (it->second.meta.is_active()) {
SRV_INF("unloading model instance name=%s\n", name.c_str());
stopping_models.insert(name);
cv_stop.notify_all();
interrupt_subprocess(it->second.stdin_file);
// status change will be handled by the managing thread
} else {
SRV_WRN("model instance name=%s is not loaded\n", name.c_str());
@@ -621,8 +518,7 @@ void server_models::unload_all() {
for (auto & [name, inst] : mapping) {
if (inst.meta.is_active()) {
SRV_INF("unloading model instance name=%s\n", name.c_str());
stopping_models.insert(name);
cv_stop.notify_all();
interrupt_subprocess(inst.stdin_file);
// status change will be handled by the managing thread
}
// moving the thread to join list to avoid deadlock
@@ -636,15 +532,16 @@ void server_models::unload_all() {
}
}
void server_models::update_status(const std::string & name, server_model_status status, int exit_code) {
std::unique_lock<std::mutex> lk(mutex);
auto it = mapping.find(name);
if (it != mapping.end()) {
auto & meta = it->second.meta;
meta.status = status;
meta.exit_code = exit_code;
void server_models::update_status(const std::string & name, server_model_status status) {
// for now, we only allow updating to LOADED status
if (status != SERVER_MODEL_STATUS_LOADED) {
throw std::runtime_error("invalid status value");
}
auto meta = get_meta(name);
if (meta.has_value()) {
meta->status = status;
update_meta(name, meta.value());
}
cv.notify_all();
}
void server_models::wait_until_loaded(const std::string & name) {
@@ -671,7 +568,6 @@ bool server_models::ensure_model_loaded(const std::string & name) {
load(name);
}
// for loading state
SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str());
wait_until_loaded(name);
@@ -704,10 +600,7 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
req.path,
req.headers,
req.body,
req.should_stop,
base_params.timeout_read,
base_params.timeout_write
);
req.should_stop);
return proxy;
}
@@ -902,7 +795,7 @@ void server_models_routes::init_routes() {
res_err(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST));
return res;
}
if (!model->is_active()) {
if (model->status != SERVER_MODEL_STATUS_LOADED) {
res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
return res;
}
@@ -995,18 +888,13 @@ server_http_proxy::server_http_proxy(
const std::string & path,
const std::map<std::string, std::string> & headers,
const std::string & body,
const std::function<bool()> should_stop,
int32_t timeout_read,
int32_t timeout_write
) {
const std::function<bool()> should_stop) {
// shared between reader and writer threads
auto cli = std::make_shared<httplib::Client>(host, port);
auto pipe = std::make_shared<pipe_t<msg_t>>();
// setup Client
cli->set_connection_timeout(0, 200000); // 200 milliseconds
cli->set_write_timeout(timeout_read, 0); // reversed for cli (client) vs srv (server)
cli->set_read_timeout(timeout_write, 0);
this->status = 500; // to be overwritten upon response
this->cleanup = [pipe]() {
pipe->close_read();

View File

@@ -9,7 +9,6 @@
#include <condition_variable>
#include <functional>
#include <memory>
#include <set>
/**
* state diagram:
@@ -57,7 +56,6 @@ struct server_model_meta {
int64_t last_used = 0; // for LRU unloading
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
bool is_active() const {
return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;
@@ -85,10 +83,6 @@ private:
std::condition_variable cv;
std::map<std::string, instance_t> mapping;
// for stopping models
std::condition_variable cv_stop;
std::set<std::string> stopping_models;
common_preset_context ctx_preset;
common_params base_params;
@@ -105,7 +99,7 @@ private:
void add_model(server_model_meta && meta);
public:
server_models(const common_params & params, int argc, char ** argv);
server_models(const common_params & params, int argc, char ** argv, char ** envp);
void load_models();
@@ -125,7 +119,7 @@ public:
void unload_all();
// update the status of a model instance (thread-safe)
void update_status(const std::string & name, server_model_status status, int exit_code);
void update_status(const std::string & name, server_model_status status);
// wait until the model instance is fully loaded (thread-safe)
// return when the model is loaded or failed to load
@@ -147,8 +141,8 @@ struct server_models_routes {
common_params params;
json webui_settings = json::object();
server_models models;
server_models_routes(const common_params & params, int argc, char ** argv)
: params(params), models(params, argc, argv) {
server_models_routes(const common_params & params, int argc, char ** argv, char ** envp)
: params(params), models(params, argc, argv, envp) {
if (!this->params.webui_config_json.empty()) {
try {
webui_settings = json::parse(this->params.webui_config_json);
@@ -183,10 +177,7 @@ public:
const std::string & path,
const std::map<std::string, std::string> & headers,
const std::string & body,
const std::function<bool()> should_stop,
int32_t timeout_read,
int32_t timeout_write
);
const std::function<bool()> should_stop);
~server_http_proxy() {
if (cleanup) {
cleanup();

View File

@@ -74,26 +74,11 @@ int server_queue::get_new_id() {
return new_id;
}
void server_queue::pop_deferred_task(int id_slot) {
void server_queue::pop_deferred_task() {
std::unique_lock<std::mutex> lock(mutex_tasks);
if (!queue_tasks_deferred.empty()) {
// try to find a task that uses the specified slot
bool found = false;
for (auto it = queue_tasks_deferred.begin(); it != queue_tasks_deferred.end(); ++it) {
if (it->id_slot == id_slot) {
QUE_DBG("pop deferred task (use slot %d), id_task = %d\n", id_slot, it->id);
queue_tasks.emplace_front(std::move(*it));
queue_tasks_deferred.erase(it);
found = true;
break;
}
}
// if no task using the slot is found, just pop the first deferred task (default behavior)
if (!found) {
QUE_DBG("pop deferred task, id_task = %d\n", queue_tasks_deferred.front().id);
queue_tasks.emplace_front(std::move(queue_tasks_deferred.front()));
queue_tasks_deferred.pop_front();
}
queue_tasks.emplace_front(std::move(queue_tasks_deferred.front()));
queue_tasks_deferred.pop_front();
}
time_last_task = ggml_time_ms();
condition_tasks.notify_one();
@@ -232,12 +217,12 @@ void server_response::add_waiting_task_id(int id_task) {
waiting_task_ids.insert(id_task);
}
void server_response::add_waiting_task_ids(const std::unordered_set<int> & id_tasks) {
void server_response::add_waiting_tasks(const std::vector<server_task> & tasks) {
std::unique_lock<std::mutex> lock(mutex_results);
for (const auto & id_task : id_tasks) {
RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size());
waiting_task_ids.insert(id_task);
for (const auto & task : tasks) {
RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size());
waiting_task_ids.insert(task.id);
}
}
@@ -342,7 +327,6 @@ void server_response::terminate() {
void server_response_reader::post_task(server_task && task, bool front) {
GGML_ASSERT(id_tasks.empty() && "post_task() can only be called once per reader");
GGML_ASSERT(!task.is_parent() && "not supported, use post_tasks() instead");
task.index = 0;
id_tasks.insert(task.id);
states.push_back(task.create_state());
@@ -354,18 +338,11 @@ void server_response_reader::post_tasks(std::vector<server_task> && tasks, bool
GGML_ASSERT(id_tasks.empty() && "post_tasks() can only be called once per reader");
id_tasks = server_task::get_list_id(tasks);
states.reserve(tasks.size());
size_t index = 0;
for (auto & task : tasks) {
task.index = index++;
states.push_back(task.create_state());
// for child tasks
for (auto & child_task : task.child_tasks) {
child_task.index = index++;
states.push_back(child_task.create_state());
}
for (size_t i = 0; i < tasks.size(); i++) {
tasks[i].index = i;
states.push_back(tasks[i].create_state());
}
GGML_ASSERT(states.size() == id_tasks.size());
queue_results.add_waiting_task_ids(id_tasks);
queue_results.add_waiting_tasks(tasks);
queue_tasks.post(std::move(tasks), front);
}

View File

@@ -44,8 +44,7 @@ public:
int get_new_id();
// Call when the state of one slot is changed, it will move one task from deferred to main queue
// prioritize tasks that use the specified slot (otherwise, pop the first deferred task)
void pop_deferred_task(int id_slot);
void pop_deferred_task();
// if sleeping, request exiting sleep state and wait until it is done
// returns immediately if not sleeping
@@ -125,7 +124,7 @@ public:
// add the id_task to the list of tasks waiting for response
void add_waiting_task_id(int id_task);
void add_waiting_task_ids(const std::unordered_set<int> & id_tasks);
void add_waiting_tasks(const std::vector<server_task> & tasks);
// when the request is finished, we can remove task associated with it
void remove_waiting_task_id(int id_task);

View File

@@ -78,7 +78,6 @@ json task_params::to_json(bool only_metrics) const {
{"speculative.p_min", speculative.p_min},
{"timings_per_token", timings_per_token},
{"post_sampling_probs", post_sampling_probs},
{"backend_sampling", sampling.backend_sampling},
{"lora", lora},
};
}
@@ -137,7 +136,6 @@ json task_params::to_json(bool only_metrics) const {
{"speculative.p_min", speculative.p_min},
{"timings_per_token", timings_per_token},
{"post_sampling_probs", post_sampling_probs},
{"backend_sampling", sampling.backend_sampling},
{"lora", lora},
};
}
@@ -160,7 +158,6 @@ task_params server_task::params_from_json_cmpl(
defaults.n_keep = params_base.n_keep;
defaults.n_predict = params_base.n_predict;
defaults.n_cache_reuse = params_base.n_cache_reuse;
defaults.cache_prompt = params_base.cache_prompt;
defaults.antiprompt = params_base.antiprompt;
// enabling this will output extra debug information in the HTTP responses from the server
@@ -170,7 +167,7 @@ task_params server_task::params_from_json_cmpl(
params.stream = json_value(data, "stream", false);
auto stream_opt = json_value(data, "stream_options", json::object());
params.include_usage = json_value(stream_opt, "include_usage", false);
params.cache_prompt = json_value(data, "cache_prompt", defaults.cache_prompt);
params.cache_prompt = json_value(data, "cache_prompt", true);
params.return_tokens = json_value(data, "return_tokens", false);
params.return_progress = json_value(data, "return_progress", false);
params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
@@ -204,12 +201,9 @@ task_params server_task::params_from_json_cmpl(
params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat);
params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau);
params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta);
params.sampling.adaptive_target = json_value(data, "adaptive_target", defaults.sampling.adaptive_target);
params.sampling.adaptive_decay = json_value(data, "adaptive_decay", defaults.sampling.adaptive_decay);
params.sampling.seed = json_value(data, "seed", defaults.sampling.seed);
params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs);
params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);
params.sampling.backend_sampling = json_value(data, "backend_sampling", defaults.sampling.backend_sampling);
params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs);
params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
@@ -817,15 +811,6 @@ json server_task_result_cmpl_final::to_json_anthropic() {
msg.content = content;
}
// thinking block comes first (Anthropic extended thinking format)
if (!msg.reasoning_content.empty()) {
content_blocks.push_back({
{"type", "thinking"},
{"thinking", msg.reasoning_content},
{"signature", ""} // empty signature for local models (no cryptographic verification)
});
}
if (!msg.content.empty()) {
content_blocks.push_back({
{"type", "text"},
@@ -874,57 +859,20 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
stop_reason = oaicompat_msg.tool_calls.empty() ? "end_turn" : "tool_use";
}
bool has_thinking = !oaicompat_msg.reasoning_content.empty();
bool has_text = !oaicompat_msg.content.empty();
bool has_text = !oaicompat_msg.content.empty();
size_t num_tool_calls = oaicompat_msg.tool_calls.size();
// content block indices: thinking (0) -> text (0 or 1) -> tool_use (n+)
size_t thinking_block_index = 0;
size_t text_block_index = has_thinking ? 1 : 0;
bool thinking_block_started = false;
bool text_block_started = false;
bool text_block_started = false;
std::unordered_set<size_t> tool_calls_started;
for (const auto & diff : oaicompat_msg_diffs) {
// handle thinking/reasoning content
if (!diff.reasoning_content_delta.empty()) {
if (!thinking_block_started) {
events.push_back({
{"event", "content_block_start"},
{"data", {
{"type", "content_block_start"},
{"index", thinking_block_index},
{"content_block", {
{"type", "thinking"},
{"thinking", ""}
}}
}}
});
thinking_block_started = true;
}
events.push_back({
{"event", "content_block_delta"},
{"data", {
{"type", "content_block_delta"},
{"index", thinking_block_index},
{"delta", {
{"type", "thinking_delta"},
{"thinking", diff.reasoning_content_delta}
}}
}}
});
}
// handle regular text content
if (!diff.content_delta.empty()) {
if (!text_block_started) {
events.push_back({
{"event", "content_block_start"},
{"data", {
{"type", "content_block_start"},
{"index", text_block_index},
{"index", 0},
{"content_block", {
{"type", "text"},
{"text", ""}
@@ -938,7 +886,7 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
{"event", "content_block_delta"},
{"data", {
{"type", "content_block_delta"},
{"index", text_block_index},
{"index", 0},
{"delta", {
{"type", "text_delta"},
{"text", diff.content_delta}
@@ -947,9 +895,8 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
});
}
// handle tool calls
if (diff.tool_call_index != std::string::npos) {
size_t content_block_index = (has_thinking ? 1 : 0) + (has_text ? 1 : 0) + diff.tool_call_index;
size_t content_block_index = (has_text ? 1 : 0) + diff.tool_call_index;
if (tool_calls_started.find(diff.tool_call_index) == tool_calls_started.end()) {
const auto & full_tool_call = oaicompat_msg.tool_calls[diff.tool_call_index];
@@ -985,42 +932,18 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
}
}
// close content blocks in order
if (has_thinking) {
// Anthropic API requires a signature_delta before closing thinking blocks
// We use an empty signature since we can't generate a cryptographic signature for local models
events.push_back({
{"event", "content_block_delta"},
{"data", {
{"type", "content_block_delta"},
{"index", thinking_block_index},
{"delta", {
{"type", "signature_delta"},
{"signature", ""}
}}
}}
});
events.push_back({
{"event", "content_block_stop"},
{"data", {
{"type", "content_block_stop"},
{"index", thinking_block_index}
}}
});
}
if (has_text) {
events.push_back({
{"event", "content_block_stop"},
{"data", {
{"type", "content_block_stop"},
{"index", text_block_index}
{"index", 0}
}}
});
}
for (size_t i = 0; i < num_tool_calls; i++) {
size_t content_block_index = (has_thinking ? 1 : 0) + (has_text ? 1 : 0) + i;
size_t content_block_index = (has_text ? 1 : 0) + i;
events.push_back({
{"event", "content_block_stop"},
{"data", {
@@ -1228,10 +1151,11 @@ json server_task_result_rerank::to_json() {
json server_task_result_cmpl_partial::to_json_anthropic() {
json events = json::array();
bool first = (n_decoded == 1);
// use member variables to track block state across streaming calls
// (anthropic_thinking_block_started, anthropic_text_block_started)
bool text_block_started = false;
if (first) {
text_block_started = false;
events.push_back({
{"event", "message_start"},
{"data", {
@@ -1253,69 +1177,28 @@ json server_task_result_cmpl_partial::to_json_anthropic() {
});
}
// content block indices: thinking (0) -> text (0 or 1) -> tool_use (n+)
size_t thinking_block_index = 0;
// use anthropic_has_reasoning (set in update()) to know if ANY reasoning was generated
size_t text_block_index = anthropic_has_reasoning ? 1 : 0;
// use local copies of streaming state (copied from task_result_state in update())
// these reflect the state BEFORE this chunk was processed
bool thinking_started = anthropic_thinking_block_started;
bool text_started = anthropic_text_block_started;
for (const auto & diff : oaicompat_msg_diffs) {
// handle thinking/reasoning content
if (!diff.reasoning_content_delta.empty()) {
if (!thinking_started) {
events.push_back({
{"event", "content_block_start"},
{"data", {
{"type", "content_block_start"},
{"index", thinking_block_index},
{"content_block", {
{"type", "thinking"},
{"thinking", ""}
}}
}}
});
thinking_started = true;
}
events.push_back({
{"event", "content_block_delta"},
{"data", {
{"type", "content_block_delta"},
{"index", thinking_block_index},
{"delta", {
{"type", "thinking_delta"},
{"thinking", diff.reasoning_content_delta}
}}
}}
});
}
// handle regular text content
if (!diff.content_delta.empty()) {
if (!text_started) {
if (!text_block_started) {
events.push_back({
{"event", "content_block_start"},
{"data", {
{"type", "content_block_start"},
{"index", text_block_index},
{"index", 0},
{"content_block", {
{"type", "text"},
{"text", ""}
}}
}}
});
text_started = true;
text_block_started = true;
}
events.push_back({
{"event", "content_block_delta"},
{"data", {
{"type", "content_block_delta"},
{"index", text_block_index},
{"index", 0},
{"delta", {
{"type", "text_delta"},
{"text", diff.content_delta}
@@ -1324,10 +1207,8 @@ json server_task_result_cmpl_partial::to_json_anthropic() {
});
}
// handle tool calls
if (diff.tool_call_index != std::string::npos) {
// use anthropic_has_reasoning for thinking block count (persists across calls)
size_t content_block_index = (anthropic_has_reasoning ? 1 : 0) + (text_started ? 1 : 0) + diff.tool_call_index;
size_t content_block_index = (text_block_started ? 1 : 0) + diff.tool_call_index;
if (!diff.tool_call_delta.name.empty()) {
events.push_back({

View File

@@ -96,10 +96,6 @@ struct task_result_state {
std::string generated_text; // append new chunks of generated text here
std::vector<std::string> generated_tool_call_ids;
// for Anthropic API streaming: track content block state across chunks
bool anthropic_thinking_block_started = false;
bool anthropic_text_block_started = false;
task_result_state(const common_chat_syntax & oaicompat_chat_syntax)
: oaicompat_chat_syntax(oaicompat_chat_syntax) {}
@@ -121,10 +117,8 @@ struct server_task {
int id_slot = -1;
// used by parallel sampling (multiple completions from same prompt)
int id_parent = -1;
// temporary store of child tasks for scheduling
// note: accessing the elements is invalid after the task has been moved to a server_slot
std::vector<server_task> child_tasks;
size_t n_children = 0; // number of tasks reusing this prompt
int id_parent = -1;
// used by SERVER_TASK_TYPE_INFERENCE
task_params params;
@@ -158,36 +152,6 @@ struct server_task {
return tokens.size();
}
// True only for the embedding and rerank task types; false for every other type.
bool need_embd() const {
    return type == SERVER_TASK_TYPE_EMBEDDING
        || type == SERVER_TASK_TYPE_RERANK;
}
// True only for the completion and infill task types; false for every other type.
bool need_logits() const {
    return type == SERVER_TASK_TYPE_COMPLETION
        || type == SERVER_TASK_TYPE_INFILL;
}
// True only for the completion and infill task types; false for every other type.
bool need_sampling() const {
    return type == SERVER_TASK_TYPE_COMPLETION
        || type == SERVER_TASK_TYPE_INFILL;
}
static task_params params_from_json_cmpl(
const llama_vocab * vocab,
const common_params & params_base,
@@ -199,30 +163,18 @@ struct server_task {
std::unordered_set<int> ids(tasks.size());
for (size_t i = 0; i < tasks.size(); i++) {
ids.insert(tasks[i].id);
for (auto & child : tasks[i].child_tasks) {
ids.insert(child.id);
}
}
return ids;
}
void add_child(int id_parent, int id_child) {
server_task create_child(int id_parent, int id_child) const {
server_task copy;
copy.id = id_child;
copy.id_parent = id_parent;
copy.params = params;
copy.type = type;
copy.tokens = tokens.clone();
copy.id_slot = -1; // child tasks cannot specify slot
// use different sampling seed for each child
// note: https://github.com/ggml-org/llama.cpp/pull/18700#discussion_r2675115723
if (copy.params.sampling.seed != LLAMA_DEFAULT_SEED) {
copy.params.sampling.seed += (uint32_t)child_tasks.size() + 1;
}
child_tasks.push_back(std::move(copy));
return copy;
}
// the task will be moved into queue, then onto slots
@@ -230,14 +182,6 @@ struct server_task {
// Builds a fresh task_result_state, handing this task's params.oaicompat_chat_syntax to its constructor.
task_result_state create_state() const {
return task_result_state(params.oaicompat_chat_syntax);
}
// A task is a parent as soon as it holds at least one entry in child_tasks.
bool is_parent() const {
    return !child_tasks.empty();
}
bool is_child() const {
return id_parent != -1;
}
};
struct result_timings {
@@ -393,12 +337,6 @@ struct server_task_result_cmpl_partial : server_task_result {
std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
bool is_updated = false;
// for Anthropic API: track if any reasoning content has been generated
bool anthropic_has_reasoning = false;
// Streaming state copied from task_result_state for this chunk
bool anthropic_thinking_block_started = false;
bool anthropic_text_block_started = false;
// Partial (streaming) results never report themselves as a stop: this override always returns false.
virtual bool is_stop() override {
return false; // in stream mode, partial responses are not considered stop
}
@@ -408,22 +346,6 @@ struct server_task_result_cmpl_partial : server_task_result {
// Refreshes this partial result from the shared task_result_state:
//  - marks the result as updated and lets the state compute the message diffs for `content`
//  - records whether the accumulated message contains reasoning content
//  - snapshots the Anthropic block flags as they were before this chunk
//  - then raises the shared flags for any block this chunk's diffs open
virtual void update(task_result_state & state) override {
    is_updated = true;
    state.update_chat_msg(content, true, oaicompat_msg_diffs);

    // does the accumulated message carry any reasoning content so far?
    anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();

    // snapshot for to_json_anthropic(): these reflect the state BEFORE this chunk
    anthropic_thinking_block_started = state.anthropic_thinking_block_started;
    anthropic_text_block_started     = state.anthropic_text_block_started;

    // advance the shared flags so the next chunk sees what this one started
    for (const auto & diff : oaicompat_msg_diffs) {
        if (!diff.reasoning_content_delta.empty()) {
            state.anthropic_thinking_block_started = true;
        }
        if (!diff.content_delta.empty()) {
            state.anthropic_text_block_started = true;
        }
    }
}
json to_json_non_oaicompat();

View File

@@ -66,7 +66,7 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t
};
}
int main(int argc, char ** argv) {
int main(int argc, char ** argv, char ** envp) {
// own arguments required by this example
common_params params;
@@ -126,7 +126,7 @@ int main(int argc, char ** argv) {
if (is_router_server) {
// setup server instances manager
try {
models_routes.emplace(params, argc, argv);
models_routes.emplace(params, argc, argv, envp);
} catch (const std::exception & e) {
LOG_ERR("%s: failed to initialize router models: %s\n", __func__, e.what());
return 1;

View File

@@ -434,8 +434,8 @@ def test_context_size_exceeded_stream():
@pytest.mark.parametrize(
"n_batch,batch_count,reuse_cache",
[
(64, 4, False),
(64, 2, True),
(64, 3, False),
(64, 1, True),
]
)
def test_return_progress(n_batch, batch_count, reuse_cache):
@@ -462,18 +462,10 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
res = make_cmpl_request()
last_progress = None
total_batch_count = 0
for data in res:
cur_progress = data.get("prompt_progress", None)
if cur_progress is None:
continue
if total_batch_count == 0:
# first progress report must have n_cache == n_processed
assert cur_progress["total"] > 0
assert cur_progress["cache"] == cur_progress["processed"]
if reuse_cache:
# when reusing cache, we expect some cached tokens
assert cur_progress["cache"] > 0
if last_progress is not None:
assert cur_progress["total"] == last_progress["total"]
assert cur_progress["cache"] == last_progress["cache"]
@@ -481,7 +473,6 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
total_batch_count += 1
last_progress = cur_progress
# last progress should indicate completion (all tokens processed)
assert last_progress is not None
assert last_progress["total"] > 0
assert last_progress["processed"] == last_progress["total"]
@@ -491,22 +482,17 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
def test_chat_completions_multiple_choices():
global server
server.start()
# make sure cache can be reused across multiple choices and multiple requests
# ref: https://github.com/ggml-org/llama.cpp/pull/18663
for _ in range(2):
res = server.make_request("POST", "/chat/completions", data={
"max_tokens": 8,
"n": 2,
"messages": [
{"role": "system", "content": "Book"},
{"role": "user", "content": "What is the best book"},
],
# test forcing the same slot to be used
# the scheduler should not be locked up in this case
"id_slot": 0,
})
assert res.status_code == 200
assert len(res.body["choices"]) == 2
for choice in res.body["choices"]:
assert "assistant" == choice["message"]["role"]
assert choice["finish_reason"] == "length"
res = server.make_request("POST", "/chat/completions", data={
"max_tokens": 8,
"n": 2,
"messages": [
{"role": "system", "content": "Book"},
{"role": "user", "content": "What is the best book"},
],
})
assert res.status_code == 200
assert len(res.body["choices"]) == 2
for choice in res.body["choices"]:
assert "assistant" == choice["message"]["role"]
assert match_regex("Suddenly", choice["message"]["content"])
assert choice["finish_reason"] == "length"

View File

@@ -805,92 +805,3 @@ def test_anthropic_vs_openai_different_response_format():
assert "input_tokens" in anthropic_res.body["usage"]
assert "completion_tokens" in openai_res.body["usage"]
assert "output_tokens" in anthropic_res.body["usage"]
# Extended thinking tests with reasoning models
@pytest.mark.slow
@pytest.mark.parametrize("stream", [False, True])
def test_anthropic_thinking_with_reasoning_model(stream):
    """Reasoning models must return their reasoning as Anthropic "thinking" content blocks.

    Parametrized over `stream`: the non-streaming run inspects the content blocks of the
    response body, the streaming run inspects the emitted events.
    """
    global server

    def select_events(events, event_type, payload_key, payload_type):
        # keep events of `event_type` whose nested `payload_key` object has the given "type"
        return [e for e in events if
                e.get("type") == event_type and
                e.get(payload_key, {}).get("type") == payload_type]

    server = ServerProcess()
    server.model_hf_repo = "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF"
    server.model_hf_file = "DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf"
    server.reasoning_format = "deepseek"
    server.jinja = True
    server.n_ctx = 8192
    server.n_predict = 1024
    server.server_port = 8084
    server.start(timeout_seconds=600)  # large model needs time to download

    # request body shared by both modes; the streaming run only adds "stream"
    request = {
        "model": "test",
        "max_tokens": 1024,
        "thinking": {
            "type": "enabled",
            "budget_tokens": 500
        },
        "messages": [
            {"role": "user", "content": "What is 2+2?"}
        ],
    }

    if not stream:
        res = server.make_request("POST", "/v1/messages", data=request)
        assert res.status_code == 200
        assert res.body["type"] == "message"

        content = res.body["content"]
        assert len(content) >= 2, "Should have at least thinking and text blocks"

        # a thinking block must be present, with non-empty text and a signature field
        thinking_blocks = [b for b in content if b.get("type") == "thinking"]
        assert len(thinking_blocks) > 0, "Should have thinking content block"
        first_thinking = thinking_blocks[0]
        assert "thinking" in first_thinking, "Thinking block should have 'thinking' field"
        assert len(first_thinking["thinking"]) > 0, "Thinking content should not be empty"
        assert "signature" in first_thinking, "Thinking block should have 'signature' field (Anthropic API requirement)"

        # a text block must be present as well
        text_blocks = [b for b in content if b.get("type") == "text"]
        assert len(text_blocks) > 0, "Should have text content block"
        return

    res = server.make_stream_request("POST", "/v1/messages", data={**request, "stream": True})
    events = list(res)

    # the thinking block is opened first, at index 0
    thinking_starts = select_events(events, "content_block_start", "content_block", "thinking")
    assert len(thinking_starts) > 0, "Should have thinking content_block_start event"
    assert thinking_starts[0]["index"] == 0, "Thinking block should be at index 0"

    # reasoning text arrives as thinking_delta events
    thinking_deltas = select_events(events, "content_block_delta", "delta", "thinking_delta")
    assert len(thinking_deltas) > 0, "Should have thinking_delta events"

    # a signature_delta must be emitted for the thinking block (Anthropic API requirement)
    signature_deltas = select_events(events, "content_block_delta", "delta", "signature_delta")
    assert len(signature_deltas) > 0, "Should have signature_delta event for thinking block"

    # the text block follows the thinking block, at index 1
    text_starts = select_events(events, "content_block_start", "content_block", "text")
    assert len(text_starts) > 0, "Should have text content_block_start event"
    assert text_starts[0]["index"] == 1, "Text block should be at index 1 (after thinking)"

View File

@@ -393,12 +393,12 @@ def test_completion_unified(n_ctx, n_slots, n_predict_vals, expected_success):
for res, n_predict, expect_ok in zip(results, n_predict_vals, expected_success):
if expect_ok:
assert res.status_code == 200
# note: https://github.com/ggml-org/llama.cpp/pull/18700#issuecomment-3728695581
if res.status_code == 200:
assert "content" in res.body
if "timings" in res.body:
assert res.body["timings"]["predicted_n"] == n_predict
else:
assert res.status_code == 500
assert "content" not in res.body
@pytest.mark.parametrize(

View File

@@ -10,11 +10,21 @@
import { INPUT_CLASSES } from '$lib/constants/input-classes';
import { SETTING_CONFIG_DEFAULT } from '$lib/constants/settings-config';
import { config } from '$lib/stores/settings.svelte';
import { modelOptions, selectedModelId } from '$lib/stores/models.svelte';
import { modelsStore, modelOptions, selectedModelId } from '$lib/stores/models.svelte';
import { isRouterMode } from '$lib/stores/server.svelte';
import { chatStore } from '$lib/stores/chat.svelte';
import { activeMessages } from '$lib/stores/conversations.svelte';
import { MimeTypeText } from '$lib/enums';
import {
FileTypeCategory,
MimeTypeApplication,
FileExtensionAudio,
FileExtensionImage,
FileExtensionPdf,
FileExtensionText,
MimeTypeAudio,
MimeTypeImage,
MimeTypeText
} from '$lib/enums';
import { isIMEComposing, parseClipboardContent } from '$lib/utils';
import {
AudioRecorder,
@@ -51,6 +61,7 @@
let audioRecorder: AudioRecorder | undefined;
let chatFormActionsRef: ChatFormActions | undefined = $state(undefined);
let currentConfig = $derived(config());
let fileAcceptString = $state<string | undefined>(undefined);
let fileInputRef: ChatFormFileInputInvisible | undefined = $state(undefined);
let isRecording = $state(false);
let message = $state('');
@@ -93,6 +104,40 @@
return null;
});
// State for model props reactivity
let modelPropsVersion = $state(0);
// Fetch model props when the active model changes (works for both MODEL and ROUTER mode).
// Nothing is fetched when no model is active or when modelsStore already has props for it.
$effect(() => {
	if (!activeModelId) return;

	const cached = modelsStore.getModelProps(activeModelId);
	if (cached) return;

	modelsStore
		.fetchModelProps(activeModelId)
		.then(() => {
			// bump the counter so derivations that read modelPropsVersion re-run
			modelPropsVersion++;
		})
		.catch((error) => {
			// a failed fetch must not surface as an unhandled promise rejection;
			// the version counter stays untouched in that case
			console.error('Failed to fetch model props:', error);
		});
});
// Audio support of the active model (works for both MODEL and ROUTER mode); false when no model is active
let hasAudioModality = $derived.by(() => {
	if (!activeModelId) {
		return false;
	}

	// Read the counter so this derivation re-runs once the props fetch completes
	void modelPropsVersion;
	return modelsStore.modelSupportsAudio(activeModelId);
});
// Vision support of the active model; false when no model is active
let hasVisionModality = $derived.by(() => {
	if (!activeModelId) {
		return false;
	}

	// Read the counter so this derivation re-runs once the props fetch completes
	void modelPropsVersion;
	return modelsStore.modelSupportsVision(activeModelId);
});
function checkModelSelected(): boolean {
if (!hasModelSelected) {
// Open the model selector
@@ -103,12 +148,42 @@
return true;
}
/**
 * Builds the comma-separated accept string for one file-type category.
 *
 * IMAGE, AUDIO and PDF combine the category's file extensions with the values of the
 * matching MIME-type enum; TEXT combines the text extensions with MimeTypeText.PLAIN.
 * Any other category yields an empty string.
 */
function getAcceptStringForFileType(fileType: FileTypeCategory): string {
	if (fileType === FileTypeCategory.IMAGE) {
		const extensions = Object.values(FileExtensionImage);
		const mimeTypes = Object.values(MimeTypeImage);
		return [...extensions, ...mimeTypes].join(',');
	}

	if (fileType === FileTypeCategory.AUDIO) {
		const extensions = Object.values(FileExtensionAudio);
		const mimeTypes = Object.values(MimeTypeAudio);
		return [...extensions, ...mimeTypes].join(',');
	}

	if (fileType === FileTypeCategory.PDF) {
		const extensions = Object.values(FileExtensionPdf);
		const mimeTypes = Object.values(MimeTypeApplication);
		return [...extensions, ...mimeTypes].join(',');
	}

	if (fileType === FileTypeCategory.TEXT) {
		const extensions = Object.values(FileExtensionText);
		return [...extensions, MimeTypeText.PLAIN].join(',');
	}

	return '';
}
// Passes the selected files on to the optional onFileUpload callback
function handleFileSelect(files: File[]) {
	if (onFileUpload != null) {
		onFileUpload(files);
	}
}
function handleFileUpload() {
fileInputRef?.click();
function handleFileUpload(fileType?: FileTypeCategory) {
if (fileType) {
fileAcceptString = getAcceptStringForFileType(fileType);
} else {
fileAcceptString = undefined;
}
// Use setTimeout to ensure the accept attribute is applied before opening dialog
setTimeout(() => {
fileInputRef?.click();
}, 10);
}
async function handleKeydown(event: KeyboardEvent) {
@@ -268,7 +343,13 @@
});
</script>
<ChatFormFileInputInvisible bind:this={fileInputRef} onFileSelect={handleFileSelect} />
<ChatFormFileInputInvisible
bind:this={fileInputRef}
bind:accept={fileAcceptString}
{hasAudioModality}
{hasVisionModality}
onFileSelect={handleFileSelect}
/>
<form
onsubmit={handleSubmit}

View File

@@ -4,13 +4,14 @@
import * as DropdownMenu from '$lib/components/ui/dropdown-menu';
import * as Tooltip from '$lib/components/ui/tooltip';
import { FILE_TYPE_ICONS } from '$lib/constants/icons';
import { FileTypeCategory } from '$lib/enums';
interface Props {
class?: string;
disabled?: boolean;
hasAudioModality?: boolean;
hasVisionModality?: boolean;
onFileUpload?: () => void;
onFileUpload?: (fileType?: FileTypeCategory) => void;
}
let {
@@ -26,6 +27,10 @@
? 'Text files and PDFs supported. Images, audio, and video require vision models.'
: 'Attach files';
});
// Relays the chosen file-type category (if any) to the optional onFileUpload callback
function handleFileUpload(fileType?: FileTypeCategory) {
	if (onFileUpload != null) {
		onFileUpload(fileType);
	}
}
</script>
<div class="flex items-center gap-1 {className}">
@@ -56,7 +61,7 @@
<DropdownMenu.Item
class="images-button flex cursor-pointer items-center gap-2"
disabled={!hasVisionModality}
onclick={() => onFileUpload?.()}
onclick={() => handleFileUpload(FileTypeCategory.IMAGE)}
>
<FILE_TYPE_ICONS.image class="h-4 w-4" />
@@ -76,7 +81,7 @@
<DropdownMenu.Item
class="audio-button flex cursor-pointer items-center gap-2"
disabled={!hasAudioModality}
onclick={() => onFileUpload?.()}
onclick={() => handleFileUpload(FileTypeCategory.AUDIO)}
>
<FILE_TYPE_ICONS.audio class="h-4 w-4" />
@@ -93,7 +98,7 @@
<DropdownMenu.Item
class="flex cursor-pointer items-center gap-2"
onclick={() => onFileUpload?.()}
onclick={() => handleFileUpload(FileTypeCategory.TEXT)}
>
<FILE_TYPE_ICONS.text class="h-4 w-4" />
@@ -104,7 +109,7 @@
<Tooltip.Trigger class="w-full">
<DropdownMenu.Item
class="flex cursor-pointer items-center gap-2"
onclick={() => onFileUpload?.()}
onclick={() => handleFileUpload(FileTypeCategory.PDF)}
>
<FILE_TYPE_ICONS.pdf class="h-4 w-4" />

View File

@@ -24,7 +24,7 @@
isRecording?: boolean;
hasText?: boolean;
uploadedFiles?: ChatUploadedFile[];
onFileUpload?: () => void;
onFileUpload?: (fileType?: FileTypeCategory) => void;
onMicClick?: () => void;
onStop?: () => void;
}

View File

@@ -1,14 +1,35 @@
<script lang="ts">
import { generateModalityAwareAcceptString } from '$lib/utils';
// Props of the hidden file input component
interface Props {
// Explicit accept string; when set it is used instead of the modality-derived one
accept?: string;
// Extra class names appended after "hidden" on the input element
class?: string;
// Passed as hasAudio to generateModalityAwareAcceptString when accept is not set
hasAudioModality?: boolean;
// Passed as hasVision to generateModalityAwareAcceptString when accept is not set
hasVisionModality?: boolean;
// Forwarded to the input element's multiple attribute
multiple?: boolean;
// NOTE(review): presumably invoked with the files chosen in the dialog; the change handler is not visible here, confirm
onFileSelect?: (files: File[]) => void;
}
let { class: className = '', multiple = true, onFileSelect }: Props = $props();
let {
accept = $bindable(),
class: className = '',
hasAudioModality = false,
hasVisionModality = false,
multiple = true,
onFileSelect
}: Props = $props();
let fileInputElement: HTMLInputElement | undefined;
// An explicitly supplied accept string wins; otherwise one is generated from the modality flags
let finalAccept = $derived.by(() => {
	if (accept != null) {
		return accept;
	}

	return generateModalityAwareAcceptString({
		hasVision: hasVisionModality,
		hasAudio: hasAudioModality
	});
});
// Triggers a click on the underlying input element; does nothing while it is not bound
export function click() {
	if (fileInputElement != null) {
		fileInputElement.click();
	}
}
@@ -25,6 +46,7 @@
bind:this={fileInputElement}
type="file"
{multiple}
accept={finalAccept}
onchange={handleFileSelect}
class="hidden {className}"
/>

View File

@@ -89,7 +89,6 @@
const fallbackToolCalls = $derived(typeof toolCallContent === 'string' ? toolCallContent : null);
const processingState = useProcessingState();
let currentConfig = $derived(config());
let isRouter = $derived(isRouterMode());
let displayedModel = $derived((): string | null => {
@@ -117,12 +116,6 @@
}
});
$effect(() => {
if (isLoading() && !message?.content?.trim()) {
processingState.startMonitoring();
}
});
function formatToolCallBadge(toolCall: ApiChatCompletionToolCall, index: number) {
const callNumber = index + 1;
const functionName = toolCall.function?.name?.trim();
@@ -193,7 +186,7 @@
<div class="mt-6 w-full max-w-[48rem]" in:fade>
<div class="processing-container">
<span class="processing-text">
{processingState.getPromptProgressText() ?? processingState.getProcessingMessage()}
{processingState.getProcessingMessage()}
</span>
</div>
</div>
@@ -270,23 +263,6 @@
predictedTokens={message.timings.predicted_n}
predictedMs={message.timings.predicted_ms}
/>
{:else if isLoading() && currentConfig.showMessageStats}
{@const liveStats = processingState.getLiveProcessingStats()}
{@const genStats = processingState.getLiveGenerationStats()}
{@const promptProgress = processingState.processingState?.promptProgress}
{@const isStillProcessingPrompt =
promptProgress && promptProgress.processed < promptProgress.total}
{#if liveStats || genStats}
<ChatMessageStatistics
isLive={true}
isProcessingPrompt={!!isStillProcessingPrompt}
promptTokens={liveStats?.tokensProcessed}
promptMs={liveStats?.timeMs}
predictedTokens={genStats?.tokensGenerated}
predictedMs={genStats?.timeMs}
/>
{/if}
{/if}
</div>
{/if}

View File

@@ -5,64 +5,21 @@
import { ChatMessageStatsView } from '$lib/enums';
interface Props {
predictedTokens?: number;
predictedMs?: number;
predictedTokens: number;
predictedMs: number;
promptTokens?: number;
promptMs?: number;
// Live mode: when true, shows stats during streaming
isLive?: boolean;
// Whether prompt processing is still in progress
isProcessingPrompt?: boolean;
// Initial view to show (defaults to READING in live mode)
initialView?: ChatMessageStatsView;
}
let {
predictedTokens,
predictedMs,
promptTokens,
promptMs,
isLive = false,
isProcessingPrompt = false,
initialView = ChatMessageStatsView.GENERATION
}: Props = $props();
let { predictedTokens, predictedMs, promptTokens, promptMs }: Props = $props();
let activeView: ChatMessageStatsView = $state(initialView);
let hasAutoSwitchedToGeneration = $state(false);
let activeView: ChatMessageStatsView = $state(ChatMessageStatsView.GENERATION);
// In live mode: auto-switch to GENERATION tab when prompt processing completes
$effect(() => {
if (isLive) {
// Auto-switch to generation tab only when prompt processing is done (once)
if (
!hasAutoSwitchedToGeneration &&
!isProcessingPrompt &&
predictedTokens &&
predictedTokens > 0
) {
activeView = ChatMessageStatsView.GENERATION;
hasAutoSwitchedToGeneration = true;
} else if (!hasAutoSwitchedToGeneration) {
// Stay on READING while prompt is still being processed
activeView = ChatMessageStatsView.READING;
}
}
});
let hasGenerationStats = $derived(
predictedTokens !== undefined &&
predictedTokens > 0 &&
predictedMs !== undefined &&
predictedMs > 0
);
let tokensPerSecond = $derived(hasGenerationStats ? (predictedTokens! / predictedMs!) * 1000 : 0);
let timeInSeconds = $derived(
predictedMs !== undefined ? (predictedMs / 1000).toFixed(2) : '0.00'
);
let tokensPerSecond = $derived((predictedTokens / predictedMs) * 1000);
let timeInSeconds = $derived((predictedMs / 1000).toFixed(2));
let promptTokensPerSecond = $derived(
promptTokens !== undefined && promptMs !== undefined && promptMs > 0
promptTokens !== undefined && promptMs !== undefined
? (promptTokens / promptMs) * 1000
: undefined
);
@@ -77,14 +34,11 @@
promptTokensPerSecond !== undefined &&
promptTimeInSeconds !== undefined
);
// In live mode, generation tab is disabled until we have generation stats
let isGenerationDisabled = $derived(isLive && !hasGenerationStats);
</script>
<div class="inline-flex items-center text-xs text-muted-foreground">
<div class="inline-flex items-center rounded-sm bg-muted-foreground/15 p-0.5">
{#if hasPromptStats || isLive}
{#if hasPromptStats}
<Tooltip.Root>
<Tooltip.Trigger>
<button
@@ -111,32 +65,25 @@
class="inline-flex h-5 w-5 items-center justify-center rounded-sm transition-colors {activeView ===
ChatMessageStatsView.GENERATION
? 'bg-background text-foreground shadow-sm'
: isGenerationDisabled
? 'cursor-not-allowed opacity-40'
: 'hover:text-foreground'}"
onclick={() => !isGenerationDisabled && (activeView = ChatMessageStatsView.GENERATION)}
disabled={isGenerationDisabled}
: 'hover:text-foreground'}"
onclick={() => (activeView = ChatMessageStatsView.GENERATION)}
>
<Sparkles class="h-3 w-3" />
<span class="sr-only">Generation</span>
</button>
</Tooltip.Trigger>
<Tooltip.Content>
<p>
{isGenerationDisabled
? 'Generation (waiting for tokens...)'
: 'Generation (token output)'}
</p>
<p>Generation (token output)</p>
</Tooltip.Content>
</Tooltip.Root>
</div>
<div class="flex items-center gap-1 px-2">
{#if activeView === ChatMessageStatsView.GENERATION && hasGenerationStats}
{#if activeView === ChatMessageStatsView.GENERATION}
<BadgeChatStatistic
class="bg-transparent"
icon={WholeWord}
value="{predictedTokens?.toLocaleString()} tokens"
value="{predictedTokens} tokens"
tooltipLabel="Generated tokens"
/>
<BadgeChatStatistic

View File

@@ -185,11 +185,6 @@
key: 'samplers',
label: 'Samplers',
type: 'input'
},
{
key: 'backend_sampling',
label: 'Backend sampling',
type: 'checkbox'
}
]
},

View File

@@ -21,7 +21,6 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
autoMicOnEmpty: false,
// make sure these default values are in sync with `common.h`
samplers: 'top_k;typ_p;top_p;min_p;temperature',
backend_sampling: false,
temperature: 0.8,
dynatemp_range: 0.0,
dynatemp_exponent: 1.0,
@@ -58,8 +57,6 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
'When copying a message with text attachments, combine them into a single plain text string instead of a special format that can be pasted back as attachments.',
samplers:
'The order at which samplers are applied, in simplified way. Default is "top_k;typ_p;top_p;min_p;temperature": top_k->typ_p->top_p->min_p->temperature',
backend_sampling:
'Enable backend-based samplers. When enabled, supported samplers run on the accelerator backend for faster sampling.',
temperature:
'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
dynatemp_range:

View File

@@ -1,27 +1,10 @@
import { activeProcessingState } from '$lib/stores/chat.svelte';
import { config } from '$lib/stores/settings.svelte';
/**
 * Snapshot of prompt-processing throughput, as produced from
 * `processingState.promptProgress` by the processing-state hook.
 */
export interface LiveProcessingStats {
// Prompt tokens processed so far, with cached tokens subtracted (`processed - cache`)
tokensProcessed: number;
// Prompt tokens to process in total, with cached tokens subtracted (`total - cache`)
totalTokens: number;
// Elapsed time in milliseconds, copied from `promptProgress.time_ms`
timeMs: number;
// Throughput computed as `tokensProcessed / (timeMs / 1000)`
tokensPerSecond: number;
// Optional estimate of the remaining seconds
// NOTE(review): no producer in this file's visible code sets this field - confirm it is still needed
etaSecs?: number;
}
/**
 * Snapshot of token-generation throughput, as produced from the processing
 * state's `tokensDecoded` and `tokensPerSecond` values.
 */
export interface LiveGenerationStats {
// Number of tokens decoded so far (`tokensDecoded`)
tokensGenerated: number;
// Generation time in milliseconds, derived as `tokensGenerated / tokensPerSecond * 1000`;
// 0 when no positive speed is available
timeMs: number;
// Generation speed in tokens per second; 0 when the processing state reports none
tokensPerSecond: number;
}
export interface UseProcessingStateReturn {
readonly processingState: ApiProcessingState | null;
getProcessingDetails(): string[];
getProcessingMessage(): string;
getPromptProgressText(): string | null;
getLiveProcessingStats(): LiveProcessingStats | null;
getLiveGenerationStats(): LiveGenerationStats | null;
shouldShowDetails(): boolean;
startMonitoring(): void;
stopMonitoring(): void;
@@ -46,7 +29,6 @@ export interface UseProcessingStateReturn {
export function useProcessingState(): UseProcessingStateReturn {
let isMonitoring = $state(false);
let lastKnownState = $state<ApiProcessingState | null>(null);
let lastKnownProcessingStats = $state<LiveProcessingStats | null>(null);
// Derive processing state reactively from chatStore's direct state
const processingState = $derived.by(() => {
@@ -64,34 +46,6 @@ export function useProcessingState(): UseProcessingStateReturn {
}
});
// Remember the most recent prompt-processing stats so they stay available
// after promptProgress is no longer present on the processing state
$effect(() => {
	const progress = processingState?.promptProgress;
	if (!progress) return;

	// Cached tokens are subtracted from both counters
	const doneTokens = progress.processed - progress.cache;
	const remainingScope = progress.total - progress.cache;

	// Only record a snapshot once there is a positive token count and a positive duration
	const hasMeasurement = doneTokens > 0 && progress.time_ms > 0;
	if (!hasMeasurement) return;

	lastKnownProcessingStats = {
		tokensProcessed: doneTokens,
		totalTokens: remainingScope,
		timeMs: progress.time_ms,
		tokensPerSecond: doneTokens / (progress.time_ms / 1000)
	};
});
/**
 * Estimates the remaining time by extrapolating the progress made so far.
 *
 * @param done - amount of work completed
 * @param total - total amount of work
 * @param elapsedMs - time spent so far, in milliseconds
 * @returns the estimated remaining seconds, or undefined when nothing has been
 *          completed yet or less than half a second has elapsed
 */
function getETASecs(done: number, total: number, elapsedMs: number): number | undefined {
	const secondsSoFar = elapsedMs / 1000;

	// No estimate is produced for the 0% progress report or for very short samples
	if (done === 0 || secondsSoFar < 0.5) {
		return undefined;
	}

	// elapsed * (total / done) is the projected full duration; subtracting the
	// elapsed part leaves the remainder
	return secondsSoFar * (total / done - 1);
}
function startMonitoring(): void {
if (isMonitoring) return;
isMonitoring = true;
@@ -105,25 +59,28 @@ export function useProcessingState(): UseProcessingStateReturn {
const currentConfig = config();
if (!currentConfig.keepStatsVisible) {
lastKnownState = null;
lastKnownProcessingStats = null;
}
}
function getProcessingMessage(): string {
if (!processingState) {
const state = processingState;
if (!state) {
return 'Processing...';
}
switch (processingState.status) {
switch (state.status) {
case 'initializing':
return 'Initializing...';
case 'preparing':
if (processingState.progressPercent !== undefined) {
return `Processing (${processingState.progressPercent}%)`;
if (state.progressPercent !== undefined) {
return `Processing (${state.progressPercent}%)`;
}
return 'Preparing response...';
case 'generating':
return '';
if (state.tokensDecoded > 0) {
return `Generating... (${state.tokensDecoded} tokens)`;
}
return 'Generating...';
default:
return 'Processing...';
}
@@ -174,76 +131,8 @@ export function useProcessingState(): UseProcessingStateReturn {
}
function shouldShowDetails(): boolean {
return processingState !== null && processingState.status !== 'idle';
}
/**
 * Builds the short prompt-processing status line: `Processing N%`, followed by
 * an `(ETA: Ns)` suffix whenever getETASecs yields an estimate.
 *
 * Cached tokens are subtracted from both the processed and the total count
 * before the percentage is computed.
 *
 * NOTE(review): when `total` equals `cache` the division is 0 / 0 and the
 * percentage renders as NaN - confirm whether that state can be reported.
 *
 * @returns the status text, or null when the state carries no promptProgress
 */
function getPromptProgressText(): string | null {
	const progress = processingState?.promptProgress;
	if (!progress) return null;

	const doneTokens = progress.processed - progress.cache;
	const scopeTokens = progress.total - progress.cache;
	const percent = Math.round((doneTokens / scopeTokens) * 100);

	const remaining = getETASecs(doneTokens, scopeTokens, progress.time_ms);

	// The ETA is rounded up to whole seconds for display
	return remaining === undefined
		? `Processing ${percent}%`
		: `Processing ${percent}% (ETA: ${Math.ceil(remaining)}s)`;
}
/**
 * Provides prompt-processing statistics for display.
 *
 * A fresh snapshot is computed from promptProgress when it is present and
 * reports both a positive processed-token count (cache excluded) and a positive
 * duration; in every other case the last remembered snapshot is returned.
 *
 * @returns the current or last known stats; null when none were ever recorded
 */
function getLiveProcessingStats(): LiveProcessingStats | null {
	const progress = processingState?.promptProgress;
	const doneTokens = progress ? progress.processed - progress.cache : 0;

	// Fall back to the remembered snapshot when there is nothing measurable right now
	if (!progress || !(doneTokens > 0 && progress.time_ms > 0)) {
		return lastKnownProcessingStats;
	}

	return {
		tokensProcessed: doneTokens,
		totalTokens: progress.total - progress.cache,
		timeMs: progress.time_ms,
		tokensPerSecond: doneTokens / (progress.time_ms / 1000)
	};
}
/**
* Returns live generation statistics for display (token generation phase)
*/
function getLiveGenerationStats(): LiveGenerationStats | null {
if (!processingState) return null;
const { tokensDecoded, tokensPerSecond } = processingState;
if (tokensDecoded <= 0) return null;
// Calculate time from tokens and speed
const timeMs =
tokensPerSecond && tokensPerSecond > 0 ? (tokensDecoded / tokensPerSecond) * 1000 : 0;
return {
tokensGenerated: tokensDecoded,
timeMs,
tokensPerSecond: tokensPerSecond || 0
};
const state = processingState;
return state !== null && state.status !== 'idle';
}
return {
@@ -252,9 +141,6 @@ export function useProcessingState(): UseProcessingStateReturn {
},
getProcessingDetails,
getProcessingMessage,
getPromptProgressText,
getLiveProcessingStats,
getLiveGenerationStats,
shouldShowDetails,
startMonitoring,
stopMonitoring

View File

@@ -86,7 +86,6 @@ export class ChatService {
dry_penalty_last_n,
// Other parameters
samplers,
backend_sampling,
custom,
timings_per_token,
// Config options
@@ -118,8 +117,7 @@ export class ChatService {
role: msg.role,
content: msg.content
})),
stream,
return_progress: stream ? true : undefined
stream
};
// Include model in request if provided (required in ROUTER mode)
@@ -160,8 +158,6 @@ export class ChatService {
: samplers;
}
if (backend_sampling !== undefined) requestBody.backend_sampling = backend_sampling;
if (timings_per_token !== undefined) requestBody.timings_per_token = timings_per_token;
if (custom) {
@@ -275,7 +271,7 @@ export class ChatService {
onReasoningChunk?: (chunk: string) => void,
onToolCallChunk?: (chunk: string) => void,
onModel?: (model: string) => void,
onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void,
onTimings?: (timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void,
conversationId?: string,
abortSignal?: AbortSignal
): Promise<void> {
@@ -370,13 +366,11 @@ export class ChatService {
onModel?.(chunkModel);
}
if (promptProgress) {
ChatService.notifyTimings(undefined, promptProgress, onTimings);
}
if (timings) {
if (timings || promptProgress) {
ChatService.notifyTimings(timings, promptProgress, onTimings);
lastTimings = timings;
if (timings) {
lastTimings = timings;
}
}
if (content) {
@@ -774,11 +768,10 @@ export class ChatService {
timings: ChatMessageTimings | undefined,
promptProgress: ChatMessagePromptProgress | undefined,
onTimingsCallback:
| ((timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void)
| ((timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void)
| undefined
): void {
if (!onTimingsCallback || (!timings && !promptProgress)) return;
if (!timings || !onTimingsCallback) return;
onTimingsCallback(timings, promptProgress);
}
}

View File

@@ -303,17 +303,11 @@ class ChatStore {
const currentConfig = config();
const outputTokensMax = currentConfig.max_tokens || -1;
// Note: for timings data, the n_prompt does NOT include cache tokens
const contextUsed = promptTokens + cacheTokens + predictedTokens;
const outputTokensUsed = predictedTokens;
// Note: for prompt progress, the "processed" DOES include cache tokens
// we need to exclude them to get the real prompt tokens processed count
const progressCache = promptProgress?.cache || 0;
const progressActualDone = (promptProgress?.processed ?? 0) - progressCache;
const progressActualTotal = (promptProgress?.total ?? 0) - progressCache;
const progressPercent = promptProgress
? Math.round((progressActualDone / progressActualTotal) * 100)
? Math.round((promptProgress.processed / promptProgress.total) * 100)
: undefined;
return {
@@ -330,7 +324,6 @@ class ChatStore {
topP: currentConfig.top_p ?? 0.95,
speculative: false,
progressPercent,
promptProgress,
promptTokens,
promptMs,
cacheTokens
@@ -541,7 +534,7 @@ class ChatStore {
conversationsStore.updateMessageAtIndex(idx, { toolCalls: streamedToolCallContent });
},
onModel: (modelName: string) => recordModel(modelName),
onTimings: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
onTimings: (timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
const tokensPerSecond =
timings?.predicted_ms && timings?.predicted_n
? (timings.predicted_n / timings.predicted_ms) * 1000
@@ -1039,7 +1032,7 @@ class ChatStore {
});
},
onTimings: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
onTimings: (timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
const tokensPerSecond =
timings?.predicted_ms && timings?.predicted_n
? (timings.predicted_n / timings.predicted_ms) * 1000
@@ -1461,8 +1454,6 @@ class ChatStore {
if (hasValue(currentConfig.dry_penalty_last_n))
apiOptions.dry_penalty_last_n = Number(currentConfig.dry_penalty_last_n);
if (currentConfig.samplers) apiOptions.samplers = currentConfig.samplers;
if (currentConfig.backend_sampling)
apiOptions.backend_sampling = currentConfig.backend_sampling;
if (currentConfig.custom) apiOptions.custom = currentConfig.custom;
return apiOptions;

View File

@@ -294,14 +294,15 @@ class SettingsStore {
* This sets up the default values from /props endpoint
*/
syncWithServerDefaults(): void {
const propsDefaults = this.getServerDefaults();
if (Object.keys(propsDefaults).length === 0) {
console.warn('No server defaults available for initialization');
const serverParams = serverStore.defaultParams;
if (!serverParams) {
console.warn('No server parameters available for initialization');
return;
}
const propsDefaults = this.getServerDefaults();
for (const [key, propsValue] of Object.entries(propsDefaults)) {
const currentValue = getConfigValue(this.config, key);

View File

@@ -149,7 +149,6 @@ export interface ApiLlamaCppServerProps {
reasoning_in_content: boolean;
thinking_forced_open: boolean;
samplers: string[];
backend_sampling: boolean;
'speculative.n_max': number;
'speculative.n_min': number;
'speculative.p_min': number;
@@ -187,7 +186,6 @@ export interface ApiChatCompletionRequest {
}>;
stream?: boolean;
model?: string;
return_progress?: boolean;
// Reasoning parameters
reasoning_format?: string;
// Generation parameters
@@ -213,7 +211,6 @@ export interface ApiChatCompletionRequest {
dry_penalty_last_n?: number;
// Sampler configuration
samplers?: string[];
backend_sampling?: boolean;
// Custom parameters (JSON string)
custom?: Record<string, unknown>;
timings_per_token?: boolean;
@@ -314,7 +311,6 @@ export interface ApiSlotData {
reasoning_in_content: boolean;
thinking_forced_open: boolean;
samplers: string[];
backend_sampling: boolean;
'speculative.n_max': number;
'speculative.n_min': number;
'speculative.p_min': number;
@@ -345,7 +341,6 @@ export interface ApiProcessingState {
tokensPerSecond?: number;
// Progress information from prompt_progress
progressPercent?: number;
promptProgress?: ChatMessagePromptProgress;
promptTokens?: number;
promptMs?: number;
cacheTokens?: number;

View File

@@ -43,7 +43,6 @@ export interface SettingsChatServiceOptions {
dry_penalty_last_n?: number;
// Sampler configuration
samplers?: string | string[];
backend_sampling?: boolean;
// Custom parameters
custom?: string;
timings_per_token?: boolean;
@@ -52,7 +51,7 @@ export interface SettingsChatServiceOptions {
onReasoningChunk?: (chunk: string) => void;
onToolCallChunk?: (chunk: string) => void;
onModel?: (model: string) => void;
onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void;
onTimings?: (timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void;
onComplete?: (
response: string,
reasoningContent?: string,

View File

@@ -65,7 +65,10 @@ export async function copyCodeToClipboard(
successMessage = 'Code copied to clipboard',
errorMessage = 'Failed to copy code'
): Promise<boolean> {
return copyToClipboard(rawCode, successMessage, errorMessage);
const doc = new DOMParser().parseFromString(rawCode, 'text/html');
const decodedCode = doc.body.textContent ?? rawCode;
return copyToClipboard(decodedCode, successMessage, errorMessage);
}
/**

View File

@@ -195,28 +195,9 @@ export function getFileTypeByExtension(filename: string): string | null {
}
export function isFileTypeSupported(filename: string, mimeType?: string): boolean {
// Images are detected and handled separately for vision models
if (mimeType) {
const category = getFileTypeCategory(mimeType);
if (
category === FileTypeCategory.IMAGE ||
category === FileTypeCategory.AUDIO ||
category === FileTypeCategory.PDF
) {
return true;
}
}
// Check extension for known types (especially images without MIME)
const extCategory = getFileTypeCategoryByExtension(filename);
if (
extCategory === FileTypeCategory.IMAGE ||
extCategory === FileTypeCategory.AUDIO ||
extCategory === FileTypeCategory.PDF
) {
if (mimeType && getFileTypeCategory(mimeType)) {
return true;
}
// Fallback: treat everything else as text (inclusive by default)
return true;
return getFileTypeByExtension(filename) !== null;
}

View File

@@ -76,6 +76,7 @@ export {
isFileTypeSupportedByModel,
filterFilesByModalities,
generateModalityErrorMessage,
generateModalityAwareAcceptString,
type ModalityCapabilities
} from './modality-file-validation';

View File

@@ -4,7 +4,17 @@
*/
import { getFileTypeCategory } from '$lib/utils';
import { FileTypeCategory } from '$lib/enums';
import {
FileExtensionAudio,
FileExtensionImage,
FileExtensionPdf,
FileExtensionText,
MimeTypeAudio,
MimeTypeImage,
MimeTypeApplication,
MimeTypeText,
FileTypeCategory
} from '$lib/enums';
/** Modality capabilities for file validation */
export interface ModalityCapabilities {
@@ -160,3 +170,29 @@ export function generateModalityErrorMessage(
* @param capabilities - The modality capabilities to check against
* @returns Accept string for HTML file input element
*/
export function generateModalityAwareAcceptString(capabilities: ModalityCapabilities): string {
	// Text and PDF entries are offered regardless of the model's modalities
	const extensions: string[] = [
		...Object.values(FileExtensionText),
		...Object.values(FileExtensionPdf)
	];
	const mimeTypes: string[] = [
		...Object.values(MimeTypeText),
		...Object.values(MimeTypeApplication)
	];

	// Image entries are added only for vision-capable models
	if (capabilities.hasVision) {
		extensions.push(...Object.values(FileExtensionImage));
		mimeTypes.push(...Object.values(MimeTypeImage));
	}

	// Audio entries are added only for audio-capable models
	if (capabilities.hasAudio) {
		extensions.push(...Object.values(FileExtensionAudio));
		mimeTypes.push(...Object.values(MimeTypeAudio));
	}

	// The accept string lists every extension first, then every MIME type
	return [...extensions, ...mimeTypes].join(',');
}

View File

@@ -1,4 +1,5 @@
import { isSvgMimeType, svgBase64UrlToPngDataURL } from './svg-to-png';
import { isTextFileByName } from './text-files';
import { isWebpMimeType, webpBase64UrlToPngDataURL } from './webp-to-png';
import { FileTypeCategory } from '$lib/enums';
import { modelsStore } from '$lib/stores/models.svelte';
@@ -83,6 +84,17 @@ export async function processFilesToChatUploaded(
}
results.push({ ...base, preview });
} else if (
getFileTypeCategory(file.type) === FileTypeCategory.TEXT ||
isTextFileByName(file.name)
) {
try {
const textContent = await readFileAsUTF8(file);
results.push({ ...base, textContent });
} catch (err) {
console.warn('Failed to read text file, adding without content:', err);
results.push(base);
}
} else if (getFileTypeCategory(file.type) === FileTypeCategory.PDF) {
// Extract text content from PDF for preview
try {
@@ -117,14 +129,8 @@ export async function processFilesToChatUploaded(
const preview = await readFileAsDataURL(file);
results.push({ ...base, preview });
} else {
// Fallback: treat unknown files as text
try {
const textContent = await readFileAsUTF8(file);
results.push({ ...base, textContent });
} catch (err) {
console.warn('Failed to read file as text, adding without content:', err);
results.push(base);
}
// Other files: add as-is
results.push(base);
}
} catch (error) {
console.error('Error processing file', file.name, error);

View File

@@ -119,7 +119,7 @@
$effect(() => {
const serverProps = serverStore.props;
if (serverProps) {
if (serverProps?.default_generation_settings?.params) {
settingsStore.syncWithServerDefaults();
}
});

View File

@@ -65,7 +65,10 @@
await expect(textarea).toHaveValue(text);
const fileInput = document.querySelector('input[type="file"]');
await expect(fileInput).not.toHaveAttribute('accept');
const acceptAttr = fileInput?.getAttribute('accept');
await expect(fileInput).toHaveAttribute('accept');
await expect(acceptAttr).not.toContain('image/');
await expect(acceptAttr).not.toContain('audio/');
// Open file attachments dropdown
const fileUploadButton = canvas.getByText('Attach files');