sync from b7516

2026-01-16 11:16:14 +08:00
parent f4ae4cc7da
commit 6ee41dd9e3
380 changed files with 18435 additions and 38806 deletions
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -18,13 +18,14 @@ else()
    add_subdirectory(gguf-split)
    add_subdirectory(imatrix)
    add_subdirectory(llama-bench)
+    add_subdirectory(cli)
    add_subdirectory(completion)
    add_subdirectory(perplexity)
    add_subdirectory(quantize)
    if (LLAMA_BUILD_SERVER)
-        add_subdirectory(cli)
        add_subdirectory(server)
    endif()
+    add_subdirectory(run)
    add_subdirectory(tokenize)
    add_subdirectory(tts)
    add_subdirectory(mtmd)
--- a/tools/cli/README.md
+++ b/tools/cli/README.md
@@ -12,7 +12,6 @@
 | -------- | ----------- |
 | `-h, --help, --usage` | print usage and exit |
 | `--version` | show version and build info |
-| `--license` | show source code license and dependencies |
 | `-cl, --cache-list` | show list of models in cache |
 | `--completion-bash` | print source-able bash completion script for llama.cpp |
 | `--verbose-prompt` | print a verbose prompt before generation (default: false) |
@@ -57,23 +56,22 @@
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
-| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
-| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)<br/>(env: LLAMA_ARG_DIO) |
+| `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
 | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
 | `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
 | `--list-devices` | print list of available devices and exit |
-| `-ot, --override-tensor <tensor name pattern>=<buffer type>,...` | override tensor buffer type<br/>(env: LLAMA_ARG_OVERRIDE_TENSOR) |
+| `-ot, --override-tensor <tensor name pattern>=<buffer type>,...` | override tensor buffer type |
 | `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
 | `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
-| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
+| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM (default: -1)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
 | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
 | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
 | `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')<br/>(env: LLAMA_ARG_FIT) |
-| `-fitt, --fit-target MiB0,MiB1,MiB2,...` | target margin per device for --fit, comma-separated list of values, single value is broadcast across all devices, default: 1024<br/>(env: LLAMA_ARG_FIT_TARGET) |
+| `-fitt, --fit-target MiB` | target margin per device for --fit option, default: 1024<br/>(env: LLAMA_ARG_FIT_TARGET) |
 | `-fitc, --fit-ctx N` | minimum ctx size that can be set by --fit option, default: 4096<br/>(env: LLAMA_ARG_FIT_CTX) |
 | `--check-tensors` | check model tensor data for invalid values (default: false) |
-| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false |
+| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values or repeat this argument.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false |
 | `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) |
 | `--lora FNAME` | path to LoRA adapter (use comma-separated values to load multiple adapters) |
 | `--lora-scaled FNAME:SCALE,...` | path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)<br/>note: use comma-separated values |
@@ -113,8 +111,6 @@
 | `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
 | `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
 | `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
-| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) |
-| `--adaptive-decay N` | adaptive-p: EMA decay for adaptation; effective history length ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99) |
 | `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) |
 | `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
 | `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
@@ -138,7 +134,6 @@
 | `--grammar-file FNAME` | file to read grammar from |
 | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
 | `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
-| `-bs, --backend-sampling` | enable backend sampling (experimental) (default: disabled)<br/>(env: LLAMA_ARG_BACKEND_SAMPLING) |


 ### CLI-specific params
@@ -169,19 +164,19 @@
 | `-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
 | `-cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
 | `-ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
-| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
+| `--chat-template-kwargs STRING` | sets additional params for the json template parser<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
 | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
 | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
 | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
 | `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
 | `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
 | `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.8)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
 | `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
 | `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
-| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
+| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
 | `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
 | `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible |
 | `--gpt-oss-20b-default` | use gpt-oss-20b (note: can download weights from the internet) |
--- a/tools/completion/README.md
+++ b/tools/completion/README.md
@@ -95,7 +95,6 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | -------- | ----------- |
 | `-h, --help, --usage` | print usage and exit |
 | `--version` | show version and build info |
-| `--license` | show source code license and dependencies |
 | `-cl, --cache-list` | show list of models in cache |
 | `--completion-bash` | print source-able bash completion script for llama.cpp |
 | `--verbose-prompt` | print a verbose prompt before generation (default: false) |
@@ -140,23 +139,22 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
-| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
-| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)<br/>(env: LLAMA_ARG_DIO) |
+| `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
 | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
 | `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
 | `--list-devices` | print list of available devices and exit |
-| `-ot, --override-tensor <tensor name pattern>=<buffer type>,...` | override tensor buffer type<br/>(env: LLAMA_ARG_OVERRIDE_TENSOR) |
+| `-ot, --override-tensor <tensor name pattern>=<buffer type>,...` | override tensor buffer type |
 | `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
 | `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
-| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
+| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM (default: -1)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
 | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
 | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
 | `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')<br/>(env: LLAMA_ARG_FIT) |
-| `-fitt, --fit-target MiB0,MiB1,MiB2,...` | target margin per device for --fit, comma-separated list of values, single value is broadcast across all devices, default: 1024<br/>(env: LLAMA_ARG_FIT_TARGET) |
+| `-fitt, --fit-target MiB` | target margin per device for --fit option, default: 1024<br/>(env: LLAMA_ARG_FIT_TARGET) |
 | `-fitc, --fit-ctx N` | minimum ctx size that can be set by --fit option, default: 4096<br/>(env: LLAMA_ARG_FIT_CTX) |
 | `--check-tensors` | check model tensor data for invalid values (default: false) |
-| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false |
+| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values or repeat this argument.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false |
 | `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) |
 | `--lora FNAME` | path to LoRA adapter (use comma-separated values to load multiple adapters) |
 | `--lora-scaled FNAME:SCALE,...` | path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)<br/>note: use comma-separated values |
@@ -219,7 +217,6 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `--grammar-file FNAME` | file to read grammar from |
 | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
 | `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
-| `-bs, --backend-sampling` | enable backend sampling (experimental) (default: disabled)<br/>(env: LLAMA_ARG_BACKEND_SAMPLING) |


 ### Completion-specific params
@@ -251,8 +248,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
 | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
 | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |

 <!-- HELP_END -->
@@ -436,19 +433,6 @@ The Min-P sampling method was designed as an alternative to Top-P, and aims to e

 Example usage: `--min-p 0.05`

-### Adaptive-P Sampling
-
-   `--adaptive-target N`: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
-   `--adaptive-decay N`: EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99)
-
-Adaptive-P: Select tokens near a configurable target probability over time.
-
-The adaptive-p sampler transforms the token probability distribution to favor tokens that fall near a user-configurable probability target. Internally, the sampler maintains an exponential moving average of the *ORIGINAL* probabilities of selected tokens at each sampling step. It uses this EMA to compute an adapted target probability at each sampling step, thus maintaining the desired target probability over time. Only mild truncation before this sampler is recommended. It is suggested to apply min-p before adaptive-p as the only other active sampler.
-
-Recommended starting values: `--adaptive-target 0.55 --adaptive-decay 0.9`
-
-For more info, refer to: [llama.cpp#17927](https://github.com/ggml-org/llama.cpp/pull/17927)
-
 ### Locally Typical Sampling

 -   `--typical N`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).
--- a/tools/completion/completion.cpp
+++ b/tools/completion/completion.cpp
@@ -175,10 +175,7 @@ int main(int argc, char ** argv) {
    struct ggml_threadpool_params tpp =
            ggml_threadpool_params_from_cpu_params(params.cpuparams);

-    if (!set_process_priority(params.cpuparams.priority)) {
-        LOG_ERR("%s: error: failed to set process priority\n", __func__);
-        return 1;
-    }
+    set_process_priority(params.cpuparams.priority);

    struct ggml_threadpool * threadpool_batch = NULL;
    if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
--- a/tools/fit-params/fit-params.cpp
+++ b/tools/fit-params/fit-params.cpp
@@ -26,16 +26,16 @@ int main(int argc, char ** argv) {
    llama_numa_init(params.numa);
    auto mparams = common_model_params_to_llama(params);
    auto cparams = common_context_params_to_llama(params);
-    const llama_params_fit_status status = llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
-        params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
+    const bool success = llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
+        params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
        params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
-    if (status != LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
+    if (!success) {
        LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__);
        exit(1);
    }

    LOG_INF("%s: printing fitted CLI arguments to stdout...\n", __func__);
-    common_log_flush(common_log_main());
+    std::this_thread::sleep_for(10ms); // to avoid a race between stderr and stdout
    printf("-c %" PRIu32 " -ngl %" PRIu32, cparams.n_ctx, mparams.n_gpu_layers);

    size_t nd = llama_max_devices();
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -334,7 +334,6 @@ struct cmd_params {
    std::vector<std::vector<float>>  tensor_split;
    std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
    std::vector<bool>                use_mmap;
-    std::vector<bool>                use_direct_io;
    std::vector<bool>                embeddings;
    std::vector<bool>                no_op_offload;
    std::vector<bool>                no_host;
@@ -373,7 +372,6 @@ static const cmd_params cmd_params_defaults = {
    /* tensor_split         */ { std::vector<float>(llama_max_devices(), 0.0f) },
    /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
    /* use_mmap             */ { true },
-    /* use_direct_io        */ { true },
    /* embeddings           */ { false },
    /* no_op_offload        */ { false },
    /* no_host              */ { false },
@@ -451,8 +449,6 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -dev, --device <dev0/dev1/...>            (default: auto)\n");
    printf("  -mmp, --mmap <0|1>                        (default: %s)\n",
           join(cmd_params_defaults.use_mmap, ",").c_str());
-    printf("  -dio, --direct-io <0|1>                   (default: %s)\n",
-           join(cmd_params_defaults.use_direct_io, ",").c_str());
    printf("  -embd, --embeddings <0|1>                 (default: %s)\n",
           join(cmd_params_defaults.embeddings, ",").c_str());
    printf("  -ts, --tensor-split <ts0/ts1/..>          (default: 0)\n");
@@ -776,13 +772,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                }
                auto p = string_split<bool>(argv[i], split_delim);
                params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
-            } else if (arg == "-dio" || arg == "--direct-io") {
-                if (++i >= argc) {
-                    invalid_param = true;
-                    break;
-                }
-                auto p = string_split<bool>(argv[i], split_delim);
-                params.use_direct_io.insert(params.use_direct_io.end(), p.begin(), p.end());
            } else if (arg == "-embd" || arg == "--embeddings") {
                if (++i >= argc) {
                    invalid_param = true;
@@ -1019,9 +1008,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.use_mmap.empty()) {
        params.use_mmap = cmd_params_defaults.use_mmap;
    }
-    if (params.use_direct_io.empty()) {
-        params.use_direct_io = cmd_params_defaults.use_direct_io;
-    }
    if (params.embeddings.empty()) {
        params.embeddings = cmd_params_defaults.embeddings;
    }
@@ -1070,7 +1056,6 @@ struct cmd_params_instance {
    std::vector<float> tensor_split;
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
    bool               use_mmap;
-    bool               use_direct_io;
    bool               embeddings;
    bool               no_op_offload;
    bool               no_host;
@@ -1082,12 +1067,11 @@ struct cmd_params_instance {
        if (!devices.empty()) {
            mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());
        }
-        mparams.split_mode    = split_mode;
-        mparams.main_gpu      = main_gpu;
-        mparams.tensor_split  = tensor_split.data();
-        mparams.use_mmap      = use_mmap;
-        mparams.use_direct_io = use_direct_io;
-        mparams.no_host       = no_host;
+        mparams.split_mode   = split_mode;
+        mparams.main_gpu     = main_gpu;
+        mparams.tensor_split = tensor_split.data();
+        mparams.use_mmap     = use_mmap;
+        mparams.no_host      = no_host;

        if (n_cpu_moe <= 0) {
            if (tensor_buft_overrides.empty()) {
@@ -1131,8 +1115,7 @@ struct cmd_params_instance {
    bool equal_mparams(const cmd_params_instance & other) const {
        return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
               split_mode == other.split_mode &&
-               main_gpu == other.main_gpu && tensor_split == other.tensor_split &&
-               use_mmap == other.use_mmap && use_direct_io == other.use_direct_io &&
+               main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split &&
               devices == other.devices &&
               no_host == other.no_host &&
               vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
@@ -1170,7 +1153,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    for (const auto & ts : params.tensor_split)
    for (const auto & ot : params.tensor_buft_overrides)
    for (const auto & mmp : params.use_mmap)
-    for (const auto & dio : params.use_direct_io)
    for (const auto & noh : params.no_host)
    for (const auto & embd : params.embeddings)
    for (const auto & nopo : params.no_op_offload)
@@ -1212,7 +1194,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .tensor_split = */ ts,
                /* .tensor_buft_overrides = */ ot,
                /* .use_mmap     = */ mmp,
-                /* .use_direct_io= */ dio,
                /* .embeddings   = */ embd,
                /* .no_op_offload= */ nopo,
                /* .no_host      = */ noh,
@@ -1247,7 +1228,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .tensor_split = */ ts,
                /* .tensor_buft_overrides = */ ot,
                /* .use_mmap     = */ mmp,
-                /* .use_direct_io= */ dio,
                /* .embeddings   = */ embd,
                /* .no_op_offload= */ nopo,
                /* .no_host      = */ noh,
@@ -1282,7 +1262,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .tensor_split = */ ts,
                /* .tensor_buft_overrides = */ ot,
                /* .use_mmap     = */ mmp,
-                /* .use_direct_io= */ dio,
                /* .embeddings   = */ embd,
                /* .no_op_offload= */ nopo,
                /* .no_host      = */ noh,
@@ -1322,7 +1301,6 @@ struct test {
    std::vector<float>       tensor_split;
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
    bool                     use_mmap;
-    bool                     use_direct_io;
    bool                     embeddings;
    bool                     no_op_offload;
    bool                     no_host;
@@ -1360,7 +1338,6 @@ struct test {
        tensor_split   = inst.tensor_split;
        tensor_buft_overrides = inst.tensor_buft_overrides;
        use_mmap       = inst.use_mmap;
-        use_direct_io  = inst.use_direct_io;
        embeddings     = inst.embeddings;
        no_op_offload  = inst.no_op_offload;
        no_host        = inst.no_host;
@@ -1420,9 +1397,9 @@ struct test {
            "n_ubatch",       "n_threads",      "cpu_mask",      "cpu_strict",     "poll",
            "type_k",         "type_v",         "n_gpu_layers",  "n_cpu_moe",      "split_mode",
            "main_gpu",       "no_kv_offload",  "flash_attn",    "devices",        "tensor_split",
-            "tensor_buft_overrides",            "use_mmap",      "use_direct_io",  "embeddings",
-            "no_op_offload",  "no_host",        "n_prompt",      "n_gen",          "n_depth",
-            "test_time",      "avg_ns",         "stddev_ns",     "avg_ts",         "stddev_ts"
+            "tensor_buft_overrides",            "use_mmap",      "embeddings",     "no_op_offload",
+            "no_host",        "n_prompt",       "n_gen",          "n_depth",       "test_time",
+            "avg_ns",         "stddev_ns",      "avg_ts",         "stddev_ts"
        };
        return fields;
    }
@@ -1437,7 +1414,7 @@ struct test {
            return INT;
        }
        if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
-            field == "use_mmap" || field == "use_direct_io" || field == "embeddings" || field == "no_host") {
+            field == "use_mmap" || field == "embeddings" || field == "no_host") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {
@@ -1510,7 +1487,6 @@ struct test {
                                            tensor_split_str,
                                            tensor_buft_overrides_str,
                                            std::to_string(use_mmap),
-                                            std::to_string(use_direct_io),
                                            std::to_string(embeddings),
                                            std::to_string(no_op_offload),
                                            std::to_string(no_host),
@@ -1696,9 +1672,6 @@ struct markdown_printer : public printer {
        if (field == "use_mmap") {
            return 4;
        }
-        if (field == "use_direct_io") {
-            return 3;
-        }
        if (field == "test") {
            return 15;
        }
@@ -1736,9 +1709,6 @@ struct markdown_printer : public printer {
        if (field == "use_mmap") {
            return "mmap";
        }
-        if (field == "use_direct_io") {
-            return "dio";
-        }
        if (field == "embeddings") {
            return "embd";
        }
@@ -1823,9 +1793,6 @@ struct markdown_printer : public printer {
        if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
            fields.emplace_back("use_mmap");
        }
-        if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) {
-            fields.emplace_back("use_direct_io");
-        }
        if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
            fields.emplace_back("embeddings");
        }
@@ -2070,10 +2037,7 @@ int main(int argc, char ** argv) {
    llama_backend_init();
    llama_numa_init(params.numa);

-    if (!set_process_priority(params.prio)) {
-        fprintf(stderr, "%s: error: failed to set process priority\n", __func__);
-        return 1;
-    }
+    set_process_priority(params.prio);

    // initialize printer
    std::unique_ptr<printer> p     = create_printer(params.output_format);
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -27,8 +27,6 @@ add_library(mtmd
            models/qwen3vl.cpp
            models/siglip.cpp
            models/whisper-enc.cpp
-            models/mobilenetv5.cpp
-            models/youtuvl.cpp
            )

 set_target_properties(mtmd PROPERTIES
--- a/tools/mtmd/clip-graph.h
+++ b/tools/mtmd/clip-graph.h
@@ -32,6 +32,10 @@ struct clip_graph {
    const float kq_scale;
    const clip_flash_attn_type flash_attn_type;

+    // for debugging: debug_graph is copied from clip_ctx; debug_print_tensors refers to the vector owned by clip_ctx
+    const bool debug_graph;
+    std::vector<ggml_tensor *> & debug_print_tensors;
+
    ggml_context_ptr ctx0_ptr;
    ggml_context * ctx0;
    ggml_cgraph * gf;
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -45,14 +45,13 @@
 #define KEY_SPATIAL_MERGE_SIZE  "clip.vision.spatial_merge_size"
 #define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"

-#define KEY_MM_PATCH_MERGE_TYPE    "clip.vision.mm_patch_merge_type"
-#define KEY_IMAGE_GRID_PINPOINTS   "clip.vision.image_grid_pinpoints"
-#define KEY_IMAGE_CROP_RESOLUTION  "clip.vision.image_crop_resolution"
-#define KEY_WIN_ATTN_PATTERN       "clip.vision.n_wa_pattern"
-#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
-#define KEY_ATTN_WINDOW_SIZE       "clip.vision.window_size"
-#define KEY_MINICPMV_VERSION       "clip.minicpmv_version"
-#define KEY_MINICPMV_QUERY_NUM     "clip.minicpmv_query_num"
+#define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type"
+#define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints"
+#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
+#define KEY_WIN_ATTN_PATTERN      "clip.vision.n_wa_pattern"
+#define KEY_ATTN_WINDOW_SIZE      "clip.vision.window_size"
+#define KEY_MINICPMV_VERSION      "clip.minicpmv_version"
+#define KEY_MINICPMV_QUERY_NUM    "clip.minicpmv_query_num"

 // audio-specific
 #define KEY_AUDIO_PROJ_TYPE     "clip.audio.projector_type" // for models with mixed modalities
@@ -154,47 +153,6 @@
 #define TN_CONV_PW1        "%s.blk.%d.conv_pw1.%s"
 #define TN_CONV_PW2        "%s.blk.%d.conv_pw2.%s"

-// mobilenetv5 (gemma3n) definitions
-#define TN_MNV5_STEM_CONV        "v.conv_stem.conv.weight"
-#define TN_MNV5_STEM_BIAS        "v.conv_stem.conv.bias"
-#define TN_MNV5_STEM_BN          "v.conv_stem.bn.weight"
-
-// Stage 0 Block (Edge Residual)
-#define TN_MNV5_BLK_S0_EXP_W     "v.blk.%d.%d.conv_exp.weight"
-#define TN_MNV5_BLK_S0_BN1_W     "v.blk.%d.%d.bn1.weight"
-#define TN_MNV5_BLK_S0_PWL_W     "v.blk.%d.%d.conv_pwl.weight"
-#define TN_MNV5_BLK_S0_BN2_W     "v.blk.%d.%d.bn2.weight"
-
-// Stage 1+ Block (Universal Inverted Residual)
-#define TN_MNV5_BLK_DW_START_W   "v.blk.%d.%d.dw_start.conv.weight"
-#define TN_MNV5_BLK_DW_START_BN  "v.blk.%d.%d.dw_start.bn.weight"
-#define TN_MNV5_BLK_DW_MID_W     "v.blk.%d.%d.dw_mid.conv.weight"
-#define TN_MNV5_BLK_DW_MID_BN    "v.blk.%d.%d.dw_mid.bn.weight"
-#define TN_MNV5_BLK_PW_EXP_W     "v.blk.%d.%d.pw_exp.conv.weight"
-#define TN_MNV5_BLK_PW_EXP_BN    "v.blk.%d.%d.pw_exp.bn.weight"
-#define TN_MNV5_BLK_PW_PROJ_W    "v.blk.%d.%d.pw_proj.conv.weight"
-#define TN_MNV5_BLK_PW_PROJ_BN   "v.blk.%d.%d.pw_proj.bn.weight"
-#define TN_MNV5_BLK_LAYER_SCALE  "v.blk.%d.%d.layer_scale.gamma"
-
-// Attention Components
-#define TN_MNV5_ATTN_Q_W         "v.blk.%d.%d.attn.query.proj.weight"
-#define TN_MNV5_ATTN_K_W         "v.blk.%d.%d.attn.key.proj.weight"
-#define TN_MNV5_ATTN_V_W         "v.blk.%d.%d.attn.value.proj.weight"
-#define TN_MNV5_ATTN_O_W         "v.blk.%d.%d.attn.output.proj.weight"
-#define TN_MNV5_ATTN_K_DW        "v.blk.%d.%d.attn.key.down_conv.weight"
-#define TN_MNV5_ATTN_K_NORM      "v.blk.%d.%d.attn.key.norm.weight"
-#define TN_MNV5_ATTN_V_DW        "v.blk.%d.%d.attn.value.down_conv.weight"
-#define TN_MNV5_ATTN_V_NORM      "v.blk.%d.%d.attn.value.norm.weight"
-#define TN_MNV5_ATTN_NORM        "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks
-
-// MSFA
-#define TN_MNV5_MSFA_FFN_EXP_W   "v.msfa.ffn.pw_exp.conv.weight"
-#define TN_MNV5_MSFA_FFN_EXP_BN  "v.msfa.ffn.pw_exp.bn.weight"
-#define TN_MNV5_MSFA_FFN_PROJ_W  "v.msfa.ffn.pw_proj.conv.weight"
-#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
-#define TN_MNV5_MSFA_NORM        "v.msfa.norm.weight"
-
-
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))

@@ -212,8 +170,6 @@ enum projector_type {
    PROJECTOR_TYPE_QWEN2VL,
    PROJECTOR_TYPE_QWEN3VL,
    PROJECTOR_TYPE_GEMMA3,
-    PROJECTOR_TYPE_GEMMA3NV,
-    PROJECTOR_TYPE_GEMMA3NA,
    PROJECTOR_TYPE_IDEFICS3,
    PROJECTOR_TYPE_PIXTRAL,
    PROJECTOR_TYPE_QWEN25VL,
@@ -224,7 +180,6 @@ enum projector_type {
    PROJECTOR_TYPE_GLMA,
    PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
    PROJECTOR_TYPE_VOXTRAL,
-    PROJECTOR_TYPE_MUSIC_FLAMINGO,
    PROJECTOR_TYPE_LFM2,
    PROJECTOR_TYPE_KIMIVL,
    PROJECTOR_TYPE_LIGHTONOCR,
@@ -232,7 +187,6 @@ enum projector_type {
    PROJECTOR_TYPE_JANUS_PRO,
    PROJECTOR_TYPE_LFM2A,
    PROJECTOR_TYPE_GLM4V,
-    PROJECTOR_TYPE_YOUTUVL,
    PROJECTOR_TYPE_UNKNOWN,
 };

@@ -246,8 +200,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_QWEN25VL,  "qwen2.5vl_merger"},
    { PROJECTOR_TYPE_QWEN3VL,   "qwen3vl_merger"},
    { PROJECTOR_TYPE_GEMMA3,    "gemma3"},
-    { PROJECTOR_TYPE_GEMMA3NV,  "gemma3nv"},
-    { PROJECTOR_TYPE_GEMMA3NA,  "gemma3na"},
    { PROJECTOR_TYPE_IDEFICS3,  "idefics3"},
    { PROJECTOR_TYPE_PIXTRAL,   "pixtral"},
    { PROJECTOR_TYPE_ULTRAVOX,  "ultravox"},
@@ -257,7 +209,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_GLMA,      "glma"},
    { PROJECTOR_TYPE_QWEN25O,   "qwen2.5o"},
    { PROJECTOR_TYPE_VOXTRAL,   "voxtral"},
-    { PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
    { PROJECTOR_TYPE_LFM2,      "lfm2"},
    { PROJECTOR_TYPE_KIMIVL,    "kimivl"},
    { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
@@ -265,7 +216,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
    { PROJECTOR_TYPE_LFM2A,     "lfm2a"},
    { PROJECTOR_TYPE_GLM4V,     "glm4v"},
-    { PROJECTOR_TYPE_YOUTUVL,   "youtuvl"},
 };

 static projector_type clip_projector_type_from_string(const std::string & str) {
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -61,7 +61,6 @@ struct clip_hparams {
    std::unordered_set<int32_t> vision_feature_layer;
    int32_t attn_window_size = 0;
    int32_t n_wa_pattern = 0;
-    std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)

    // audio
    int32_t n_mel_bins = 0; // whisper preprocessor
@@ -173,45 +172,6 @@ struct clip_layer {
    }
 };

-// Expanded MobileNetV5 block structure for Gemma3n vision encoder
-struct mobilenetv5_block {
-    // Stage 0 (Edge Residual)
-    ggml_tensor * s0_conv_exp_w = nullptr;
-    ggml_tensor * s0_bn1_w      = nullptr;
-    ggml_tensor * s0_conv_pwl_w = nullptr;
-    ggml_tensor * s0_bn2_w      = nullptr;
-
-    // Stage 1+ (Universal Inverted Residual)
-    ggml_tensor * dw_start_w    = nullptr;
-    ggml_tensor * dw_start_bn_w = nullptr;
-
-    ggml_tensor * pw_exp_w      = nullptr;
-    ggml_tensor * pw_exp_bn_w   = nullptr;
-
-    ggml_tensor * dw_mid_w      = nullptr;
-    ggml_tensor * dw_mid_bn_w   = nullptr;
-
-    ggml_tensor * pw_proj_w     = nullptr;
-    ggml_tensor * pw_proj_bn_w  = nullptr;
-
-    ggml_tensor * layer_scale_w = nullptr;
-
-    // Attention (MQA) components
-    ggml_tensor * attn_q_w = nullptr;
-    ggml_tensor * attn_k_w = nullptr;
-    ggml_tensor * attn_v_w = nullptr;
-    ggml_tensor * attn_o_w = nullptr;
-
-    // Optional downsampling/norm in attention
-    ggml_tensor * attn_k_dw_w   = nullptr;
-    ggml_tensor * attn_k_norm_w = nullptr;
-    ggml_tensor * attn_v_dw_w   = nullptr;
-    ggml_tensor * attn_v_norm_w = nullptr;
-
-    // Block norm (often present in attention blocks)
-    ggml_tensor * attn_norm_w   = nullptr;
-};
-
 struct clip_model {
    clip_modality modality = CLIP_MODALITY_VISION;
    projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -328,23 +288,6 @@ struct clip_model {
    ggml_tensor * mm_input_proj_w = nullptr;
    ggml_tensor * mm_soft_emb_norm_w = nullptr;

-    // mobilenetv5 for gemma3n
-    std::vector<mobilenetv5_block> mobilenet_blocks;
-    std::vector<int> mobilenet_stage_ends;
-    ggml_tensor * mobilenet_stem_conv_w = nullptr;
-    ggml_tensor * mobilenet_stem_conv_b = nullptr;
-    ggml_tensor * mobilenet_stem_norm_w = nullptr;
-    ggml_tensor * mm_post_proj_norm_w = nullptr;
-
-    // Multi-Scale Fusion Adapter (MSFA) components
-    ggml_tensor * msfa_concat_conv_w = nullptr;
-    ggml_tensor * msfa_concat_norm_w = nullptr;
-    ggml_tensor * msfa_ffn_expand_w = nullptr;
-    ggml_tensor * msfa_ffn_project_w = nullptr;
-    ggml_tensor * msfa_ffn_expand_bn = nullptr;
-    ggml_tensor * msfa_ffn_project_bn = nullptr;
-
-
    // pixtral, glm4v
    ggml_tensor * token_embd_img_break = nullptr;
    ggml_tensor * mm_patch_merger_w = nullptr;
@@ -376,8 +319,7 @@ struct clip_model {

    bool audio_has_avgpool() const {
        return proj_type == PROJECTOR_TYPE_QWEN2A
-            || proj_type == PROJECTOR_TYPE_VOXTRAL
-            || proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO;
+            || proj_type == PROJECTOR_TYPE_VOXTRAL; // true only for the QWEN2A and VOXTRAL projector types
    }

    bool audio_has_stack_frames() const {
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -152,14 +152,18 @@ struct clip_ctx {
    ggml_backend_t backend_cpu = nullptr;
    ggml_backend_buffer_ptr buf;

-
    int max_nodes = 8192;
    ggml_backend_sched_ptr sched;
    clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
    bool is_allocated = false;

+    // for debugging: debug_graph is turned on when the MTMD_DEBUG_GRAPH environment variable is set (any value)
+    bool debug_graph = false;
+    std::vector<ggml_tensor *> debug_print_tensors;
+
    clip_ctx(clip_context_params & ctx_params) {
        flash_attn_type = ctx_params.flash_attn_type;
+        debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
        backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
        if (!backend_cpu) {
            throw std::runtime_error("failed to initialize CPU backend");
@@ -200,10 +204,6 @@ struct clip_ctx {
        sched.reset(
            ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
        );
-
-        if (ctx_params.cb_eval != nullptr) {
-            ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data);
-        }
    }

    ~clip_ctx() {
@@ -239,7 +239,9 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
        n_mmproj_embd(clip_n_mmproj_embd(ctx)),
        eps(hparams.eps),
        kq_scale(1.0f / sqrtf((float)d_head)),
-        flash_attn_type(ctx->flash_attn_type) {
+        flash_attn_type(ctx->flash_attn_type),
+        debug_graph(ctx->debug_graph),
+        debug_print_tensors(ctx->debug_print_tensors) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ctx->buf_compute_meta.size(),
        /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
@@ -250,11 +252,14 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
    gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
 }

-void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
-    if (il >= 0) {
-        ggml_format_name(cur, "%s-%d", name, il);
-    } else {
-        ggml_set_name(cur, name);
+void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const { // debug hook: does nothing (cur0 is not even named) unless debug_graph is set
+    if (debug_graph) {
+        ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0)); // copy cur0 into a tensor created by ggml_dup_tensor(ctx0, cur0)
+        std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name; // "<name>_<il>" when il >= 0, otherwise just name
+        ggml_set_name(cur, cur_name.c_str());
+        ggml_set_output(cur); // NOTE(review): presumably keeps the copy's data readable after compute — confirm against ggml.h
+        ggml_build_forward_expand(gf, cur); // add the copy to the graph gf being built
+        debug_print_tensors.push_back(cur); // reference member bound to ctx->debug_print_tensors, hence mutable from this const method
    }
 }

@@ -783,10 +788,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            {
                builder = std::make_unique<clip_graph_siglip>(ctx, img);
            } break;
-        case PROJECTOR_TYPE_GEMMA3NV:
-            {
-                builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
-            } break;
        case PROJECTOR_TYPE_PIXTRAL:
        case PROJECTOR_TYPE_LIGHTONOCR:
            {
@@ -817,7 +818,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_QWEN2A:
        case PROJECTOR_TYPE_GLMA:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            {
                builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
            } break;
@@ -845,10 +845,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            {
                builder = std::make_unique<clip_graph_glm4v>(ctx, img);
            } break;
-        case PROJECTOR_TYPE_YOUTUVL:
-            {
-                builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
-            } break;
        default:
            GGML_ABORT("missing cgraph builder");
    }
@@ -1145,14 +1141,6 @@ struct clip_model_loader {
                        // test model (tinygemma3) has a different value, we optionally read it
                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                    } break;
-
-                case PROJECTOR_TYPE_GEMMA3NV:
-                    {
-                        // Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
-                        // Similar configuration to Gemma3
-                        hparams.n_merge = 1;  // MobileNetV5 handles resizing internally
-                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
-                    } break;
                case PROJECTOR_TYPE_QWEN2VL:
                case PROJECTOR_TYPE_QWEN25VL:
                case PROJECTOR_TYPE_QWEN3VL:
@@ -1170,20 +1158,6 @@ struct clip_model_loader {
                            LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
                        }
                    } break;
-                case PROJECTOR_TYPE_YOUTUVL:
-                    {
-                        hparams.n_merge = 2;
-                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
-                        get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
-                        std::vector<int> wa_layer_indexes_vec;
-                        get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
-                        for (auto & layer : wa_layer_indexes_vec) {
-                            hparams.wa_layer_indexes.insert(layer);
-                        }
-                        // support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens
-                        hparams.set_limit_image_tokens(1, 62500);
-                        hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
-                    } break;
                case PROJECTOR_TYPE_GLM4V:
                    {
                        hparams.rope_theta = 10000.0f;
@@ -1202,7 +1176,6 @@ struct clip_model_loader {
                case PROJECTOR_TYPE_QWEN2A:
                case PROJECTOR_TYPE_GLMA:
                case PROJECTOR_TYPE_VOXTRAL:
-                case PROJECTOR_TYPE_MUSIC_FLAMINGO:
                    {
                        bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
                                             model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
@@ -1252,14 +1225,7 @@ struct clip_model_loader {
                LOG_INF("%s: has_llava_proj:     %d\n", __func__, hparams.has_llava_projector);
                LOG_INF("%s: minicpmv_version:   %d\n", __func__, hparams.minicpmv_version);
                LOG_INF("%s: n_merge:            %d\n", __func__, hparams.n_merge);
-                LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
-                if (!hparams.wa_layer_indexes.empty()) {
-                    LOG_INF("%s: wa_layer_indexes:  ", __func__);
-                    for (auto & layer : hparams.wa_layer_indexes) {
-                        LOG_INF("%d ", layer);
-                    }
-                    LOG_INF("\n");
-                }
+                LOG_INF("%s: n_wa_pattern:       %d\n", __func__, hparams.n_wa_pattern);
                if (hparams.image_min_pixels > 0) {
                    LOG_INF("%s: image_min_pixels:   %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
                }
@@ -1341,10 +1307,6 @@ struct clip_model_loader {

        model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);

-        if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
-            hparams.n_layer = 0; // gemma3n does not use normal layer structure
-        }
-
        // layers
        model.layers.resize(hparams.n_layer);
        for (int il = 0; il < hparams.n_layer; ++il) {
@@ -1419,7 +1381,6 @@ struct clip_model_loader {
            }
        }

-
        switch (model.proj_type) {
            case PROJECTOR_TYPE_MLP:
            case PROJECTOR_TYPE_MLP_NORM:
@@ -1514,8 +1475,8 @@ struct clip_model_loader {
                    model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
                    model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
                    model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
-                    model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI));
-                    model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI));
+                    model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
+                    model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
                } break;
            case PROJECTOR_TYPE_QWEN2VL:
            case PROJECTOR_TYPE_QWEN25VL:
@@ -1532,14 +1493,6 @@ struct clip_model_loader {
                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
                } break;
-            case PROJECTOR_TYPE_YOUTUVL:
-                {
-                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);        // merger.ln_q (RMS norm)
-                    model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));  // merger.mlp.0
-                    model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
-                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));  // merger.mlp.2
-                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
-                } break;
            case PROJECTOR_TYPE_GLM4V:
                {
                    model.projection     = get_tensor(TN_MM_PROJECTOR);
@@ -1559,112 +1512,11 @@ struct clip_model_loader {
                    model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
                    model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
                } break;
-            case PROJECTOR_TYPE_GEMMA3NV:
-                {
-                    model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
-                    model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false);
-                    model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false);
-
-                    model.msfa_ffn_expand_w  = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false);
-                    model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded
-                    model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false);
-                    model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false);
-
-                    model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false);
-
-                    // Dynamically load blocks stage by stage
-                    for (int stage = 0; stage < 4; ++stage) {
-                        int blocks_found_in_stage = 0;
-
-                        for (int blk_idx = 0; ; ++blk_idx) {
-                            bool found_block = false;
-                            mobilenetv5_block block;
-
-                            // 1. Check for Edge Residual (S0)
-                            block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false);
-                            if (block.s0_conv_exp_w) {
-                                found_block = true;
-                                block.s0_bn1_w      = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false);
-                                block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false);
-                                block.s0_bn2_w      = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false);
-                            }
-                            // 2. Check for UIR (Universal Inverted Residual)
-                            else {
-                                // Check for dw_start OR pw_exp (some UIR blocks skip dw_start)
-                                block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false);
-                                block.pw_exp_w   = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false);
-
-                                if (block.dw_start_w || block.pw_exp_w) {
-                                    found_block = true;
-                                    if (block.dw_start_w) {
-                                        block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false);
-                                    }
-                                    if (block.pw_exp_w) {
-                                        block.pw_exp_bn_w   = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false);
-                                    }
-                                    block.dw_mid_w      = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false);
-                                    if (block.dw_mid_w) {
-                                        block.dw_mid_bn_w   = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false);
-                                    }
-                                    block.pw_proj_w     = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false);
-                                    if (block.pw_proj_w) {
-                                        block.pw_proj_bn_w  = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false);
-                                    }
-                                    block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
-                                }
-                            }
-
-                            // 3. Check for Attention (MQA)
-                            // Even if UIR/Edge check failed, this might be a pure attention block
-                            ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false);
-                            if (attn_q_check) {
-                                found_block = true;
-                                block.attn_q_w = attn_q_check;
-                                block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false);
-                                block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false);
-                                block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false);
-                                block.attn_k_dw_w   = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false);
-                                block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false);
-                                block.attn_v_dw_w   = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false);
-                                block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false);
-                                block.attn_norm_w   = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false);
-                                // Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check
-                                if (!block.layer_scale_w) {
-                                    block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
-                                }
-                            }
-
-                            if (found_block) {
-                                model.mobilenet_blocks.push_back(block);
-                                blocks_found_in_stage++;
-                            } else {
-                                // End of blocks for this stage
-                                break;
-                            }
-                        }
-
-                        // Track where this stage ends in the flat vector
-                        if (blocks_found_in_stage > 0) {
-                            model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1);
-                            LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1);
-                        }
-                    }
-                    model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
-                    model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
-                } break;
            case PROJECTOR_TYPE_IDEFICS3:
                {
                    model.projection = get_tensor(TN_MM_PROJECTOR);
                } break;
            case PROJECTOR_TYPE_LFM2:
-                {
-                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
-                    model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false);
-                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
-                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
-                    model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
-                    model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
-                } break;
            case PROJECTOR_TYPE_KIMIVL:
                {
                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
@@ -1724,17 +1576,6 @@ struct clip_model_loader {
                    model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
                    model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
                } break;
-            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
-                {
-                    model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
-                    model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
-                    model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
-                    model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
-                    model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
-                    model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
-                    model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
-                    model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
-                } break;
            case PROJECTOR_TYPE_INTERNVL:
                {
                    model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -1756,8 +1597,8 @@ struct clip_model_loader {
                    model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
                    model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
                    model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
-                    model.mm_boi = get_tensor(string_format(TN_TOK_BOI));
-                    model.mm_eoi = get_tensor(string_format(TN_TOK_EOI));
+                    model.mm_boi = get_tensor(string_format(TN_TOK_BOI, "weight"));
+                    model.mm_eoi = get_tensor(string_format(TN_TOK_EOI, "weight"));
                } break;
            case PROJECTOR_TYPE_LLAMA4:
                {
@@ -2107,7 +1948,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params

    try {
        clip_model_loader loader(fname);
-        bool skip_audio = false;

        if (loader.has_vision) {
            ctx_vision = new clip_ctx(ctx_params);
@@ -2117,14 +1957,10 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
                loader.warmup(*ctx_vision);
            }

-            // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
-            // we can remove this check when we implement audio support for Gemma 3N
-            skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
-
            // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
        }

-        if (loader.has_audio && !skip_audio) {
+        if (loader.has_audio) {
            ctx_audio = new clip_ctx(ctx_params);
            loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
            loader.load_tensors(*ctx_audio);
@@ -2848,57 +2684,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                // res_imgs->data[0] = *res;
                res_imgs->entries.push_back(std::move(img_f32));
            } break;
-        case PROJECTOR_TYPE_YOUTUVL:
-            {
-                const int patch_size = params.patch_size;  // typically 16
-                const int merge_size = params.n_merge;      // typically 2
-                const int align_size = patch_size * merge_size;  // 32
-
-                const int max_num_patches = params.image_max_pixels > 0 ?
-                    params.image_max_pixels / (patch_size * patch_size) : 256;
-
-                // Linear search for optimal scale to fit within max_num_patches
-                float scale = 1.0f;
-                int target_height = original_size.height;
-                int target_width = original_size.width;
-
-                auto get_scaled_image_size = [align_size](float scale, int size) -> int {
-                    float scaled_size = size * scale;
-                    // Round up to nearest multiple of align_size
-                    int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
-                    // Ensure at least one patch
-                    return std::max(align_size, aligned);
-                };
-
-                // Linear search with 0.02 step size
-                while (scale > 0.0f) {
-                    target_height = get_scaled_image_size(scale, original_size.height);
-                    target_width = get_scaled_image_size(scale, original_size.width);
-
-                    int num_patches_h = target_height / patch_size;
-                    int num_patches_w = target_width / patch_size;
-                    int num_patches = num_patches_h * num_patches_w;
-
-                    if (num_patches > max_num_patches) {
-                        scale -= 0.02f;
-                    } else {
-                        break;
-                    }
-                }
-
-                clip_image_size new_size = {target_width, target_height};
-
-                // Resize the image
-                clip_image_u8 resized;
-                img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
-
-                // Normalize to float32
-                clip_image_f32_ptr img_f32(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
-
-                // Add to results
-                res_imgs->entries.push_back(std::move(img_f32));
-            } break;

        case PROJECTOR_TYPE_IDEFICS3:
            {
@@ -2962,16 +2747,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                res_imgs->entries.push_back(std::move(img_f32));
            } break;

-        case PROJECTOR_TYPE_GEMMA3NV:
-            {
-                clip_image_u8 resized_image;
-                int sz = params.image_size;
-                img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
-                clip_image_f32_ptr img_f32(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
-                res_imgs->entries.push_back(std::move(img_f32));
-            } break;
-
        case PROJECTOR_TYPE_JANUS_PRO:
            {
                // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
@@ -3141,7 +2916,6 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_YOUTUVL:
            return (img->nx / params.patch_size) / 2;
        default:
            break;
@@ -3157,7 +2931,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_YOUTUVL:
            return (img->ny / params.patch_size) / 2;
        default:
            break;
@@ -3218,7 +2991,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_YOUTUVL:
            {
                // dynamic size (2 conv, so double patch size)
                int x_patch = img->nx / (params.patch_size * 2);
@@ -3234,12 +3006,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                int scale_factor = ctx->model.hparams.n_merge;
                n_patches /= (scale_factor * scale_factor);
            } break;
-        case PROJECTOR_TYPE_GEMMA3NV:
-            {
-                // MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution
-                // regardless of input size (see architecture description)
-                n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size;
-            } break;
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_KIMIVL:
            {
@@ -3265,7 +3031,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_QWEN2A:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            {
                n_patches = img->nx;

@@ -3334,6 +3099,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    }

    // build the inference graph
+    ctx->debug_print_tensors.clear();
    ggml_backend_sched_reset(ctx->sched.get());
    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
@@ -3351,6 +3117,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    const int pos_w = image_size_width  / patch_size;
    const int pos_h = image_size_height / patch_size;

+    const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl

    auto get_inp_tensor = [&gf](const char * name) {
        ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
@@ -3499,11 +3266,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                set_input_i32("positions", positions);
            } break;
        case PROJECTOR_TYPE_QWEN25VL:
-        case PROJECTOR_TYPE_YOUTUVL:
            {
                // pw * ph = number of tokens output by ViT after apply patch merger
                // ipw * ipw = number of vision token been processed inside ViT
-                const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
                const int merge_ratio = 2;
                const int pw  = image_size_width  / patch_size / merge_ratio;
                const int ph  = image_size_height / patch_size / merge_ratio;
@@ -3514,7 +3279,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                std::vector<int> inv_idx(ph * pw);

                if (use_window_attn) {
-                    const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
+                    const int attn_window_size = 112;
                    const int grid_window = attn_window_size / patch_size / merge_ratio;
                    int dst = 0;
                    // [num_vision_tokens, num_vision_tokens] attention mask tensor
@@ -3631,7 +3396,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                set_input_i32("patches", patches);
            } break;
        case PROJECTOR_TYPE_GEMMA3:
-        case PROJECTOR_TYPE_GEMMA3NV:
        case PROJECTOR_TYPE_IDEFICS3:
        case PROJECTOR_TYPE_INTERNVL:
        case PROJECTOR_TYPE_QWEN2A:
@@ -3639,7 +3403,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_VOXTRAL:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
        case PROJECTOR_TYPE_JANUS_PRO:
        case PROJECTOR_TYPE_COGVLM:
            {
@@ -3703,6 +3466,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        return false;
    }

+    // print debug nodes
+    if (ctx->debug_graph) {
+        LOG_INF("\n\n---\n\n");
+        LOG_INF("\n\nDebug graph:\n\n");
+        for (ggml_tensor * t : ctx->debug_print_tensors) {
+            std::vector<uint8_t> data(ggml_nbytes(t));
+            ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
+            print_tensor_shape(t);
+            print_tensor_data(t, data.data(), 3);
+        }
+    }
+
    // the last node is the embedding tensor
    ggml_tensor * embeddings = ggml_graph_node(gf, -1);

@@ -3741,19 +3516,16 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_JANUS_PRO:
-        case PROJECTOR_TYPE_YOUTUVL:
            return ctx->model.mm_1_b->ne[0];
        case PROJECTOR_TYPE_QWEN3VL:
            // main path + deepstack paths
            return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
        case PROJECTOR_TYPE_GEMMA3:
-        case PROJECTOR_TYPE_GEMMA3NV:
            return ctx->model.mm_input_proj_w->ne[0];
        case PROJECTOR_TYPE_IDEFICS3:
            return ctx->model.projection->ne[1];
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_VOXTRAL:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            return ctx->model.mm_2_w->ne[1];
        case PROJECTOR_TYPE_INTERNVL:
            return ctx->model.mm_3_w->ne[1];
@@ -3778,7 +3550,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
 }

 int clip_is_minicpmv(const struct clip_ctx * ctx) {
-    // TODO: remove this function
    if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
        return ctx->model.hparams.minicpmv_version;
    }
@@ -3786,14 +3557,24 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
 }

 bool clip_is_glm(const struct clip_ctx * ctx) {
-    // TODO: remove this function
    return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
 }

+bool clip_is_mrope(const struct clip_ctx * ctx) {
+    return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
+        || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
+        || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
+        || ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
+}
+
 bool clip_is_llava(const struct clip_ctx * ctx) {
    return ctx->model.hparams.has_llava_projector;
 }

+bool clip_is_gemma3(const struct clip_ctx * ctx) {
+    return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
+}
+
 bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
    return ctx->model.modality == CLIP_MODALITY_VISION;
 }
@@ -3803,16 +3584,10 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
 }

 bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
-    switch (ctx->proj_type()) {
-        case PROJECTOR_TYPE_ULTRAVOX:
-        case PROJECTOR_TYPE_QWEN2A:
-        case PROJECTOR_TYPE_GLMA:
-        case PROJECTOR_TYPE_VOXTRAL:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
-            return true;
-        default:
-            return false;
-    }
+    return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
+        || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
+        || ctx->proj_type() == PROJECTOR_TYPE_GLMA
+        || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
 }

 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
@@ -3854,6 +3629,7 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
 //
 // API for debugging
 //
+
 void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
    clip_image_f32 img;
    img.nx = w;
@@ -3862,6 +3638,9 @@ void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
    for (int i = 0; i < h * w * 3; i++) {
        img.buf[i] = static_cast<float>(fill_value);
    }
+    bool cur_debug_graph = ctx->debug_graph;
+    ctx->debug_graph = true;
    clip_image_encode(ctx, 1, &img, nullptr);
+    ctx->debug_graph = cur_debug_graph;
    GGML_ASSERT(img.buf.empty() && "expected, always stop here");
 }
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -1,7 +1,6 @@
 #pragma once

 #include "ggml.h"
-#include "mtmd.h"

 #include <stddef.h>
 #include <stdint.h>
@@ -38,8 +37,6 @@ struct clip_context_params {
    int image_min_tokens;
    int image_max_tokens;
    bool warmup;
-    ggml_backend_sched_eval_callback cb_eval;
-    void * cb_eval_user_data;
 };

 struct clip_init_result {
@@ -107,9 +104,9 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct

 int clip_is_minicpmv(const struct clip_ctx * ctx);
 bool clip_is_glm(const struct clip_ctx * ctx);
+bool clip_is_mrope(const struct clip_ctx * ctx);
 bool clip_is_llava(const struct clip_ctx * ctx);
-// note for contributor: this clip_is_(model) pattern is deprecated
-//                       do NOT add new functions like this
+bool clip_is_gemma3(const struct clip_ctx * ctx);

 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

--- a/tools/mtmd/models/mobilenetv5.cpp
+++ b/tools/mtmd/models/mobilenetv5.cpp
@@ -1,451 +0,0 @@
-#include "models.h"
-
-// Helpers for MobileNetV5 Blocks
-// RMS Norm 2D - normalizes over channels for each spatial position
-ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) {
-    // inp: [W, H, C, B]
-
-    ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3);
-    cur = ggml_cont(ctx0, cur);
-    cur = ggml_rms_norm(ctx0, cur, eps);
-
-    if (weight) {
-        cur = ggml_mul(ctx0, cur, weight);
-    }
-
-    cur = ggml_permute(ctx0, cur, 2, 1, 0, 3);
-    cur = ggml_cont(ctx0, cur);
-
-    return cur;
-}
-
-// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF
-ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) {
-    const int64_t ih = inp->ne[1];  // height
-    const int64_t iw = inp->ne[0];  // width
-
-    // Calculate output size (ceil division)
-    const int64_t oh = (ih + stride_h - 1) / stride_h;
-    const int64_t ow = (iw + stride_w - 1) / stride_w;
-
-    // Calculate padding needed
-    const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih);
-    const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw);
-
-    // Split padding asymmetrically
-    const int pad_h_top = pad_h / 2;
-    const int pad_h_bottom = pad_h - pad_h_top;
-    const int pad_w_left = pad_w / 2;
-    const int pad_w_right = pad_w - pad_w_left;
-
-    // Apply padding if needed
-    // ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3)
-    // For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch
-    if (pad_h > 0 || pad_w > 0) {
-        inp = ggml_pad_ext(ctx0, inp,
-            pad_w_left, pad_w_right,     // width padding (dim 0)
-            pad_h_top, pad_h_bottom,      // height padding (dim 1)
-            0, 0,                         // no channel padding (dim 2)
-            0, 0);                        // no batch padding (dim 3)
-    }
-
-    return inp;
-}
-
-
-// Edge Residual Block (Stage 0)
-ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
-    ggml_tensor * cur = inp;
-
-    // 1. Expansion Conv (3x3)
-    if (stride == 2) {
-        // Case: Downsampling (Block 0)
-        // Replicates Conv2dSame(kernel=3, stride=2)
-        cur = pad_same_2d(cur, 3, 3, stride, stride);
-        cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1);
-    } else {
-        // Case: Normal 3x3 Block (Block 1, 2)
-        // Replicates Conv2d(kernel=3, stride=1, padding=1)
-        cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1);
-    }
-
-    // BN + Activation
-    if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w);
-    cur = ggml_gelu(ctx0, cur);
-
-    // 2. Pointwise Linear Conv (1x1)
-    // 1x1 Convs usually have padding=0 and stride=1
-    cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1);
-    if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w);
-
-    // 3. Residual Connection
-    // Only apply residual if spatial dimensions and channels match (stride 1)
-    if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) {
-        cur = ggml_add(ctx0, cur, inp);
-    }
-
-    return cur;
-}
-
-// Universal Inverted Residual Block (Stage 1+)
-ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
-    ggml_tensor * cur = inp;
-
-    // 1. Depthwise Start (Optional)
-    // NOTE: dw_start always has stride=1 (no downsampling here)
-    if (block.dw_start_w) {
-        int k = block.dw_start_w->ne[0]; // 3 or 5
-        int p = k / 2;
-        cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1);
-        if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w);
-    }
-
-    // 2. Pointwise Expansion (1x1)
-    if (block.pw_exp_w) {
-        // Standard 1x1 conv, pad=0, stride=1
-        cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1);
-        if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w);
-        cur = ggml_gelu(ctx0, cur);
-    }
-
-    // 3. Depthwise Mid (Optional)
-    // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage)
-    if (block.dw_mid_w) {
-        int k = block.dw_mid_w->ne[0]; // 3 or 5
-
-        if (stride > 1) {
-            // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding
-            cur = pad_same_2d(cur, k, k, stride, stride);
-            cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0
-        } else {
-            // Case: Stride 1 -> Use Standard Symmetric Padding
-            int p = k / 2;
-            cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1);
-        }
-
-        if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w);
-        cur = ggml_gelu(ctx0, cur);
-    }
-
-    // 4. Pointwise Projection (1x1)
-    if (block.pw_proj_w) {
-        cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1);
-        if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w);
-    }
-
-    // Apply Layer Scaling if present
-    if (block.layer_scale_w) {
-        cur = ggml_mul(ctx0, cur, block.layer_scale_w);
-    }
-
-    // 5. Residual Connection
-    bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]);
-    bool same_channel = (inp->ne[2] == cur->ne[2]);
-    if (same_spatial && same_channel) {
-        cur = ggml_add(ctx0, cur, inp);
-    }
-
-    return cur;
-}
-
-// Attention Block (MQA)
-ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) {
-    ggml_tensor * cur = inp;
-
-    // Norm
-    if (block.attn_norm_w) {
-        cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f);
-    }
-
-    // 1. Q Calculation
-    ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1);
-
-    // 2. K Calculation (Downsampled)
-    // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
-    ggml_tensor * k_inp = cur;
-    if (block.attn_k_dw_w) {
-        int k_size = block.attn_k_dw_w->ne[0];  // Usually 3
-        k_inp = pad_same_2d(cur, k_size, k_size, 2, 2);  // Apply SAME padding
-        k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1);  // padding=0
-        if (block.attn_k_norm_w) {
-            k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f);
-        }
-    }
-    ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1);
-
-    // 3. V Calculation (Downsampled)
-    // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
-    ggml_tensor * v_inp = cur;
-    if (block.attn_v_dw_w) {
-        int v_size = block.attn_v_dw_w->ne[0];  // Usually 3
-        v_inp = pad_same_2d(cur, v_size, v_size, 2, 2);  // Apply SAME padding
-        v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1);  // padding=0
-        if (block.attn_v_norm_w) {
-            v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f);
-        }
-    }
-    ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1);
-
-    const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3];
-    const int D = k->ne[2]; // Head dimension
-    const int n_head = q->ne[2] / D;
-    const int N = W * H;
-
-    // Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B]
-    q = ggml_reshape_3d(ctx0, q, N, D*n_head, B);
-    q = ggml_reshape_4d(ctx0, q, N, D, n_head, B);
-    q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B]
-    q = ggml_cont(ctx0, q);
-
-    const int Wk = k->ne[0]; const int Hk = k->ne[1];
-    const int M = Wk * Hk;
-
-    // Process K: [Wk, Hk, D, B] -> [D, M, 1, B]
-    k = ggml_reshape_3d(ctx0, k, M, D, B);
-    k = ggml_reshape_4d(ctx0, k, M, D, 1, B);
-    k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B]
-    k = ggml_cont(ctx0, k);
-
-    // Process V: [Wk, Hk, D, B] -> [M, D, 1, B]
-    v = ggml_reshape_3d(ctx0, v, M, D, B);
-    v = ggml_reshape_4d(ctx0, v, M, D, 1, B);
-    v = ggml_cont(ctx0, v); // [M, D, 1, B]
-
-    // Multi-Query Attention
-    float scale = 1.0f / sqrtf((float)D);
-
-    // Step 1: Compute Q @ K.T
-    ggml_tensor * scores = ggml_mul_mat(ctx0, k, q);
-
-    scores = ggml_scale(ctx0, scores, scale);
-
-    scores = ggml_soft_max(ctx0, scores);
-
-    ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores);
-
-    kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3);
-    kqv = ggml_cont(ctx0, kqv);
-
-
-    kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B);
-    kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B);
-    kqv = ggml_cont(ctx0, kqv);
-
-    // Output projection
-    cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1);
-
-    // Residual & Layer Scale
-    if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) {
-        if (block.layer_scale_w) {
-            cur = ggml_mul(ctx0, cur, block.layer_scale_w);
-        }
-        cur = ggml_add(ctx0, cur, inp);
-    }
-
-    return cur;
-}
-
-ggml_cgraph * clip_graph_mobilenetv5::build() {
-    ggml_tensor * inp = build_inp_raw();
-
-    // 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2))
-    ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2);  // Apply SAME padding
-
-    cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1);  // padding=0
-    if (model.mobilenet_stem_conv_b) {
-        cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b);
-    }
-    if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w);
-    cur = ggml_gelu(ctx0, cur);
-
-
-    // 2. Blocks
-    std::vector<ggml_tensor*> intermediate_features;
-    const int total_blocks = model.mobilenet_blocks.size();
-
-    auto is_stage_start = [&](int i) {
-        if (i == 0) return true;
-        for (int end_idx : model.mobilenet_stage_ends) {
-            if (i == end_idx + 1) return true;
-        }
-        return false;
-    };
-
-    auto is_fusion_point = [&](int i) {
-        if (model.mobilenet_stage_ends.size() >= 4) {
-                if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2
-                if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3
-        } else {
-            if (i == total_blocks - 1) return true;
-        }
-        return false;
-    };
-
-    for (int i = 0; i < total_blocks; i++) {
-        const auto & block = model.mobilenet_blocks[i];
-        int stride = is_stage_start(i) ? 2 : 1;
-
-        if (block.s0_conv_exp_w)      cur = build_edge_residual(cur, block, stride);
-        else if (block.attn_q_w)      cur = build_mobilenet_attn(cur, block);
-        else                          cur = build_inverted_residual(cur, block, stride);
-
-        if (is_fusion_point(i)) {
-
-            intermediate_features.push_back(cur);
-        }
-    }
-
-    // 3. Multi-Scale Fusion Adapter (MSFA)
-    if (!intermediate_features.empty()) {
-
-        // A. Reference Resolution: PyTorch implementation uses inputs[0]
-        // We assume intermediate_features[0] is the "High Resolution" target.
-        // In MobileNet designs, this is typically the feature map with the smallest stride (e.g. 32x32).
-        ggml_tensor* target_feat = intermediate_features[0];
-        int high_res_w = target_feat->ne[0];
-        int high_res_h = target_feat->ne[1];
-
-        std::vector<ggml_tensor*> resized_feats;
-
-        // B. Resize inputs to match inputs[0] (High Resolution)
-        for (auto feat : intermediate_features) {
-            int feat_w = feat->ne[0];
-            int feat_h = feat->ne[1];
-
-            // PyTorch: if feat_size < high_resolution: interpolate
-            if (feat_w < high_res_w || feat_h < high_res_h) {
-                // Calculate scale factor.
-                // Note: PyTorch 'nearest' works on arbitrary float scales.
-                // ggml_upscale generally takes integer factors or target sizes depending on helper.
-                // Assuming standard power-of-2 scaling (e.g. 16 -> 32 means scale=2).
-                int scale_w = high_res_w / feat_w;
-                // int scale_h = high_res_h / feat_h;
-
-                // Safety check for non-integer scaling if strictly replicating
-                GGML_ASSERT(high_res_w % feat_w == 0);
-
-                // Upsample (Nearest Neighbor)
-                // 2 is the scale factor
-                feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);
-            }
-            resized_feats.push_back(feat);
-        }
-
-        // C. Concatenate at High Resolution (Channel Dim = 2 in ggml)
-        cur = resized_feats[0];
-        for (size_t k = 1; k < resized_feats.size(); ++k) {
-            cur = ggml_concat(ctx0, cur, resized_feats[k], 2);
-        }
-
-        // D. FFN (UniversalInvertedResidual)
-        // Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm
-
-        // 1. Expansion
-        if (model.msfa_ffn_expand_w) {
-            // 1x1 Conv
-            cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1);
-
-            if (model.msfa_ffn_expand_bn) {
-                cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn);
-            }
-
-            cur = ggml_gelu(ctx0, cur);
-
-        }
-
-        // 2. Projection (No DW because kernel_size=0)
-        if (model.msfa_ffn_project_w) {
-            // 1x1 Conv
-            cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1);
-
-            // UniversalInvertedResidual typically has a norm after projection
-            if (model.msfa_ffn_project_bn) {
-                cur = rms_norm_2d(cur, model.msfa_ffn_project_bn);
-            }
-
-        }
-
-        // E. Final Downsample to Target Resolution (Output Resolution)
-        // PyTorch: matches self.output_resolution (e.g. 16x16)
-        const int target_out_res = 16;
-        int current_w = cur->ne[0];
-
-        if (current_w > target_out_res) {
-            int s = current_w / target_out_res;
-
-            GGML_ASSERT(current_w % target_out_res == 0);
-
-            // Avg Pool: Kernel=s, Stride=s
-            cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0);
-
-        }
-
-        // F. Final Norm
-        if (model.msfa_concat_norm_w) {
-            cur = rms_norm_2d(cur, model.msfa_concat_norm_w);
-
-        }
-    }
-
-    // 4. Gemma 3n Multimodal Projection (Embedder)
-    // Input: 'cur' is [Width, Height, Channels, Batch]
-    int W = cur->ne[0];
-    int H = cur->ne[1];
-    int C = cur->ne[2];
-    int B = cur->ne[3];
-
-    GGML_ASSERT(C == hparams.n_embd);
-
-    // 1. Permute and Flatten to [Channels, Tokens, Batch]
-    // PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch)
-    cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B]
-    cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B]
-    cur = ggml_cont(ctx0, cur);
-    cur = ggml_reshape_3d(ctx0, cur, C, W*H, B);
-    cur = ggml_cont(ctx0, cur);
-
-
-    // 2. FEATURE SCALING
-    // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5
-    const float scale_factor = sqrtf((float)C);
-    cur = ggml_scale(ctx0, cur, scale_factor);
-
-
-    // 3. SOFT EMBEDDING NORM
-    // PyTorch: self._norm(x) * self.weight
-    // We must normalize regardless, then multiply if weight exists.
-    {
-        const float eps = 1e-6f; // Gemma3n uses 1e-6
-        cur = ggml_rms_norm(ctx0, cur, eps);
-
-        if (model.mm_soft_emb_norm_w) {
-            // Weight shape is (2048,) -> Element-wise broadcast multiply
-            cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
-        }
-
-    }
-
-    // 4. PROJECTION
-    // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False)
-    // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size]
-    if (model.mm_input_proj_w) {
-        cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
-    }
-
-    // 5. POST PROJECTION NORM
-    // PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False)
-    // with_scale=False means weight is registered as buffer with value 1.0
-    // So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1
-    {
-        const float eps = 1e-6f;
-        cur = ggml_rms_norm(ctx0, cur, eps);
-
-        if (model.mm_post_proj_norm_w) {
-            // If weight is loaded, multiply (should be ~1.0 anyway)
-            cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w);
-        }
-    }
-
-    ggml_build_forward_expand(gf, cur);
-    return gf;
-}
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -2,11 +2,6 @@

 #include "../clip-graph.h"

-/*
- * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
- * We encourage human contributors to ensure the quality and reliability of the codebase.
- */
-
 struct clip_graph_siglip : clip_graph {
    clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
@@ -27,11 +22,6 @@ struct clip_graph_qwen3vl : clip_graph {
    ggml_cgraph * build() override;
 };

-struct clip_graph_youtuvl : clip_graph {
-    clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
-    ggml_cgraph * build() override;
-};
-
 struct clip_graph_minicpmv : clip_graph {
    clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
@@ -76,36 +66,3 @@ struct clip_graph_glm4v : clip_graph {
    clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
 };
-
-struct clip_graph_mobilenetv5 : clip_graph {
-    clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
-    ggml_cgraph * build() override;
-
-    ggml_tensor * rms_norm_2d(
-        ggml_tensor * inp,
-        ggml_tensor * weight,
-        float eps = 1e-6f);
-
-    ggml_tensor* pad_same_2d(
-        ggml_tensor* inp,
-        int kernel_h,
-        int kernel_w,
-        int stride_h,
-        int stride_w,
-        int dilation_h = 1,
-        int dilation_w = 1);
-
-    ggml_tensor * build_edge_residual(
-        ggml_tensor * inp,
-        const mobilenetv5_block & block,
-        int stride);
-
-    ggml_tensor * build_inverted_residual(
-        ggml_tensor * inp,
-        const mobilenetv5_block & block,
-        int stride);
-
-    ggml_tensor * build_mobilenet_attn(
-        ggml_tensor * inp,
-        const mobilenetv5_block & block);
-};
--- a/tools/mtmd/models/siglip.cpp
+++ b/tools/mtmd/models/siglip.cpp
@@ -50,15 +50,10 @@ ggml_cgraph * clip_graph_siglip::build() {
        const int scale_factor = model.hparams.n_merge;
        cur = build_patch_merge_permute(cur, scale_factor);

-        // projection, in LFM2-VL input norm is optional
-        if (model.mm_input_norm_w) {
-            cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
-            cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
-        }
-
-        if (model.mm_input_norm_b) {
-            cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
-        }
+        // projection
+        cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
+        cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+        cur = ggml_add(ctx0, cur, model.mm_input_norm_b);

        cur = build_ffn(cur,
            model.mm_1_w, model.mm_1_b,
--- a/tools/mtmd/models/whisper-enc.cpp
+++ b/tools/mtmd/models/whisper-enc.cpp
@@ -86,15 +86,6 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
            FFN_GELU_ERF,
            -1);

-    } else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
-        // projector
-        cur = build_ffn(cur,
-            model.mm_1_w, model.mm_1_b,
-            nullptr, nullptr,
-            model.mm_2_w, model.mm_2_b,
-            FFN_GELU_ERF,
-            -1);
-
    } else if (proj_type == PROJECTOR_TYPE_GLMA) {
            cur = ggml_norm(ctx0, cur, hparams.eps);
            cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
--- a/tools/mtmd/models/youtuvl.cpp
+++ b/tools/mtmd/models/youtuvl.cpp
@@ -1,179 +0,0 @@
-#include "models.h"
-
-ggml_cgraph * clip_graph_youtuvl::build() {
-    GGML_ASSERT(model.class_embedding == nullptr);
-    const int batch_size       = 1;
-    const bool use_window_attn = !hparams.wa_layer_indexes.empty();
-    const int n_pos            = n_patches;
-    const int num_position_ids = n_pos * 4;
-    const int m = 2;
-    const int Wp = n_patches_x;
-    const int Hp = n_patches_y;
-    const int Hm = Hp / m;
-    const int Wm = Wp / m;
-    norm_type norm_t = NORM_TYPE_NORMAL;
-
-    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
-
-    ggml_tensor * inp = build_inp_raw();
-
-    // change conv3d to linear
-    // reshape and permute to get patches, permute from (patch_size, m, Wm, patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm)
-    {
-        inp = ggml_reshape_4d(
-            ctx0, inp,
-            Wm * m * patch_size, m * patch_size, Hm, 3);
-        inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
-        inp = ggml_cont_4d(
-            ctx0, inp,
-            m * patch_size * 3, Wm, m * patch_size, Hm);
-
-        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
-        inp = ggml_cont_4d(
-            ctx0, inp,
-            m * patch_size * 3, patch_size, m, Hm * Wm);
-
-        inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
-        inp = ggml_cont_4d(
-            ctx0, inp,
-            patch_size, 3, patch_size, Hm * Wm * m * m);
-
-        inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
-        inp = ggml_cont_3d(
-            ctx0, inp,
-            3*patch_size* patch_size,  Hm * Wm * m * m, 1);
-    }
-    inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
-
-    if (model.patch_bias) {
-        inp = ggml_add(ctx0, inp, model.patch_bias);
-    }
-
-    inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
-
-    ggml_tensor * inpL           = inp;
-    ggml_tensor * window_mask    = nullptr;
-    ggml_tensor * window_idx     = nullptr;
-    ggml_tensor * inv_window_idx = nullptr;
-
-    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
-    ggml_set_name(positions, "positions");
-    ggml_set_input(positions);
-
-    // pre-layernorm
-    if (model.pre_ln_w) {
-        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
-    }
-    if (use_window_attn) {
-        inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
-        ggml_set_name(inv_window_idx, "inv_window_idx");
-        ggml_set_input(inv_window_idx);
-        // mask for window attention
-        window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
-        ggml_set_name(window_mask, "window_mask");
-        ggml_set_input(window_mask);
-
-        // if flash attn is used, we need to pad the mask and cast to f16
-        if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
-            window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
-        }
-
-        // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
-        GGML_ASSERT(batch_size == 1);
-        inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
-        inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
-        inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
-    }
-
-    // loop over layers
-    for (int il = 0; il < n_layer; il++) {
-        const auto & layer = model.layers[il];
-        const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true;
-
-        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
-
-        // layernorm1
-        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
-        // self-attention
-        {
-            ggml_tensor * Qcur = ggml_add(ctx0,
-                ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
-            ggml_tensor * Kcur = ggml_add(ctx0,
-                ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
-            ggml_tensor * Vcur = ggml_add(ctx0,
-                ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
-
-            Qcur = ggml_rope_multi(
-                ctx0, Qcur, positions, nullptr,
-                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
-            Kcur = ggml_rope_multi(
-                ctx0, Kcur, positions, nullptr,
-                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
-
-            ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
-
-            cur = build_attn(layer.o_w, layer.o_b,
-                Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
-        }
-        // re-add the layer input, e.g., residual
-        cur = ggml_add(ctx0, cur, inpL);
-
-        inpL = cur; // inpL = residual, cur = hidden_states
-
-        // layernorm2
-        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
-
-        // ffn
-        cur = build_ffn(cur,
-            layer.ff_up_w, layer.ff_up_b,
-            nullptr, nullptr,
-            layer.ff_down_w, layer.ff_down_b,
-            hparams.ffn_op, il);
-
-        // residual 2
-        cur = ggml_add(ctx0, inpL, cur);
-
-        inpL = cur;
-    }
-
-    ggml_tensor * embeddings = inpL;
-    if (use_window_attn) {
-        const int spatial_merge_unit = 4;
-        window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit);
-        ggml_set_name(window_idx, "window_idx");
-        ggml_set_input(window_idx);
-        GGML_ASSERT(batch_size == 1);
-        embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit);
-        embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
-        embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size);
-        cb(embeddings, "window_order_restored", -1);
-    }
-
-    // post-layernorm (part of Siglip2VisionTransformer, applied after encoder)
-    if (model.post_ln_w) {
-        embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
-    }
-
-    // Now apply merger (VLPatchMerger):
-    // 1. Apply RMS norm (ln_q in VLPatchMerger)
-    embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
-    cb(embeddings, "merger_normed", -1);
-
-    // 2. First reshape for spatial merge (merge 2x2 patches)
-    embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
-    cb(embeddings, "merger_reshaped", -1);
-
-    embeddings = build_ffn(embeddings,
-                    model.mm_0_w, model.mm_0_b,
-                    nullptr, nullptr,
-                    model.mm_1_w, model.mm_1_b,
-                    FFN_GELU,
-                    -1);
-    ggml_build_forward_expand(gf, embeddings);
-
-    return gf;
-}
--- a/tools/mtmd/mtmd-audio.cpp
+++ b/tools/mtmd/mtmd-audio.cpp
@@ -9,250 +9,207 @@
 #include <fstream>
 #include <algorithm>

-// some of the code here is copied from whisper.cpp
+// most of the code here is copied from whisper.cpp

 constexpr bool DEBUG = false;

-void mtmd_audio_cache::fill_sin_cos_table(int n) {
-    sin_vals.resize(n);
-    cos_vals.resize(n);
-    for (int i = 0; i < n; i++) {
-        double theta = (2 * M_PI * i) / n;
-        sin_vals[i]  = sinf(theta);
-        cos_vals[i]  = cosf(theta);
-    }
-}
+struct mtmd_audio_mel_filters {
+    int32_t n_mel;
+    int32_t n_fft;

-void mtmd_audio_cache::fill_hann_window(int length, bool periodic) {
-    hann_window.resize(length);
-    int offset = -1;
-    if (periodic) {
-        offset = 0;
-    }
-    for (int i = 0; i < length; i++) {
-        hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
-    }
-}
+    std::vector<float> data;
+};

-void mtmd_audio_cache::fill_mel_filterbank_matrix(int   n_mel,
-                                                  int   n_fft,
-                                                  int   sample_rate,
-                                                  float fmin,
-                                                  float fmax,
-                                                  bool  slaney_area_norm,
-                                                  float scale) {
-    GGML_ASSERT(n_mel > 0 && n_fft > 1);
-    if (fmax <= 0.0f) {
-        fmax = 0.5f * sample_rate;
-    }
+// note: this global cache is shared among all preprocessors
+//       if we want to use multiple preprocessors at the same time,
+//       we will need to enclose it in the preprocessor class in the future
+static struct mtmd_audio_global_cache {
+    // precomputed sin/cos table for FFT
+    std::vector<float> sin_vals;
+    std::vector<float> cos_vals;

-    // Slaney scale (matches librosa default)
-    const double min_log_hz  = 1000.0;
-    const double lin_slope   = 3 / 200.;
-    const double min_log_mel = min_log_hz * lin_slope;
-    const double log_step    = log(6.4) / 27.0;
-    auto         hz_to_mel   = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
-        return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
-    };
-    auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
-        return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
-    };
+    // hann window
+    std::vector<float> hann_window;

-    // infer N_fft from n_fft_bins
-    const double bin_hz_step = double(sample_rate) / double(n_fft);
+    // mel filter bank
+    mtmd_audio_mel_filters filters;

-    // mel grid: n_mel + 2 edges
-    const double        m_lo = hz_to_mel(fmin);
-    const double        m_hi = hz_to_mel(fmax);
-    std::vector<double> mel_pts(n_mel + 2);
-    for (int i = 0; i < n_mel + 2; ++i) {
-        mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1));
-    }
-
-    // convert to Hz
-    std::vector<double> hz_pts(n_mel + 2);
-    for (int i = 0; i < n_mel + 2; ++i) {
-        hz_pts[i] = mel_to_hz(mel_pts[i]);
-    }
-
-    const int n_fft_bins = n_fft / 2 + 1;
-
-    // filterbank
-    std::vector<float> out(n_mel * n_fft_bins, 0);
-    for (int m = 0; m < n_mel; ++m) {
-        const double f_left   = hz_pts[m];
-        const double f_center = hz_pts[m + 1];
-        const double f_right  = hz_pts[m + 2];
-
-        const double denom_l = std::max(1e-30, f_center - f_left);
-        const double denom_r = std::max(1e-30, f_right - f_center);
-        const double enorm   = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
-
-        for (int k = 0; k < n_fft_bins; ++k) {
-            const double f = k * bin_hz_step;
-            double       w = 0.0;
-            if (f >= f_left && f <= f_center) {
-                w = (f - f_left) / denom_l;
-            } else if (f > f_center && f <= f_right) {
-                w = (f_right - f) / denom_r;
-            }
-            out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
+    void fill_sin_cos_table(int n) { // precompute sinf/cosf of 2*pi*i/n for every i in [0, n)
+        sin_vals.resize(n);
+        cos_vals.resize(n);
+        for (int i = 0; i < n; i++) {
+            double theta = (2 * M_PI * i) / n; // angle of the i-th of n equal steps around the circle
+            sin_vals[i] = sinf(theta);
+            cos_vals[i] = cosf(theta);
        }
    }

-    filters.n_mel = n_mel;
-    filters.n_fft = n_fft;
-    filters.data  = std::move(out);
+    void fill_hann_window(int length, bool periodic) { // fills hann_window with `length` coefficients
+        hann_window.resize(length);
+        int offset = -1; // symmetric window: the cosine period spans (length - 1) samples
+        if (periodic) {
+            offset = 0; // periodic window: the cosine period spans `length` samples
+        }
+        for (int i = 0; i < length; i++) { // NOTE(review): length == 1 with periodic == false makes the divisor below 0
+            hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
+        }
+    }

-    if (DEBUG) {  // debug
-        for (size_t i = 0; i < filters.data.size(); ++i) {
-            if (filters.data[i] != 0.0f) {
-                printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f);
+    // Build the mel filterbank matrix [n_mel × n_fft_bins] at runtime and store it in `filters`.
+    // n_fft_bins is derived below as (n_fft / 2 + 1). Example: n_fft=512 -> n_fft_bins=257.
+    void fill_mel_filterbank_matrix(
+        int n_mel,
+        int n_fft,
+        int sample_rate,            // e.g. 16000
+        float fmin = 0.0f,          // e.g. 0.0
+        float fmax = -1.0f,         // e.g. sr/2; any value <= 0 selects sr/2 automatically
+        bool slaney_area_norm = true, // when true, each filter is scaled by 2 / (f_right - f_left)
+        float scale = 1.0f          // optional extra scaling multiplied into every filter weight
+    ) {
+        GGML_ASSERT(n_mel > 0 && n_fft > 1);
+        if (fmax <= 0.0f) {
+            fmax = 0.5f * sample_rate;
+        }
+
+        // Slaney scale (matches librosa default)
+        const double min_log_hz = 1000.0;
+        const double lin_slope = 3 / 200.;
+        const double min_log_mel = min_log_hz * lin_slope;
+        const double log_step = log(6.4) / 27.0;
+        auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
+            return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
+        };
+        auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
+            return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
+        };
+
+        // width of one FFT bin in Hz
+        const double bin_hz_step = double(sample_rate) / double(n_fft);
+
+        // mel grid: n_mel + 2 edges
+        const double m_lo = hz_to_mel(fmin);
+        const double m_hi = hz_to_mel(fmax);
+        std::vector<double> mel_pts(n_mel + 2);
+        for (int i = 0; i < n_mel + 2; ++i) {
+            mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1));
+        }
+
+        // convert to Hz
+        std::vector<double> hz_pts(n_mel + 2);
+        for (int i = 0; i < n_mel + 2; ++i) {
+            hz_pts[i] = mel_to_hz(mel_pts[i]);
+        }
+
+        const int n_fft_bins = n_fft / 2 + 1;
+
+        // filterbank: one triangular filter per mel band, stored row-major by band
+        std::vector<float> out(n_mel * n_fft_bins, 0);
+        for (int m = 0; m < n_mel; ++m) {
+            const double f_left   = hz_pts[m];
+            const double f_center = hz_pts[m + 1];
+            const double f_right  = hz_pts[m + 2];
+
+            const double denom_l = std::max(1e-30, f_center - f_left);
+            const double denom_r = std::max(1e-30, f_right  - f_center);
+            const double enorm   = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
+
+            for (int k = 0; k < n_fft_bins; ++k) {
+                const double f = k * bin_hz_step;
+                double w = 0.0;
+                if (f >= f_left && f <= f_center) {
+                    w = (f - f_left) / denom_l;
+                } else if (f > f_center && f <= f_right) {
+                    w = (f_right - f) / denom_r;
+                }
+                out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
+            }
+        }
+
+        filters.n_mel = n_mel;
+        filters.n_fft = n_fft;
+        filters.data  = std::move(out);
+
+        if (DEBUG) { // debug
+            for (size_t i = 0; i < filters.data.size(); ++i) {
+                if (filters.data[i] != 0.0f) {
+                    printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f);
+                }
            }
        }
    }
-}
+} g_cache;

-// Unified DFT implementation for both forward and inverse transforms
-// Template parameters:
-//   Inverse: false = DFT with exp(-2πi·k·n/N), no scaling
-//            true  = IDFT with exp(+2πi·k·n/N), scales by 1/N
-//   RealInput: true = input is real-valued (stride 1), avoids imaginary computations
-//              false = input is complex-valued (interleaved real/imag, stride 2)
-template <bool Inverse, bool RealInput>
-static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, float * out) {
-    const int n_sin_cos_vals = cache.sin_vals.size();
-    const int sin_cos_step   = n_sin_cos_vals / N;
-
-    constexpr float sign  = Inverse ? 1.0f : -1.0f;
-    const float     scale = Inverse ? (1.0f / N) : 1.0f;
+// naive Discrete Fourier Transform
+// input is real-valued
+// output is complex-valued
+static void dft(const float * in, int N, float * out) {
+    const int n_sin_cos_vals = g_cache.sin_vals.size();
+    const int sin_cos_step = n_sin_cos_vals / N;

    for (int k = 0; k < N; k++) {
        float re = 0;
        float im = 0;

        for (int n = 0; n < N; n++) {
-            int   idx     = (k * n * sin_cos_step) % n_sin_cos_vals;
-            float cos_val = cache.cos_vals[idx];
-            float sin_val = cache.sin_vals[idx];
-
-            if constexpr (RealInput) {
-                // Real input: in_im = 0, simplifies to:
-                // re += in_re * cos_val
-                // im += sign * in_re * sin_val
-                float in_re = in[n];
-                re += in_re * cos_val;
-                im += sign * in_re * sin_val;
-            } else {
-                float in_re = in[n * 2 + 0];
-                float in_im = in[n * 2 + 1];
-                // (a + bi) * (cos + sign*i*sin) = (a*cos - sign*b*sin) + (sign*a*sin + b*cos)i
-                re += in_re * cos_val - sign * in_im * sin_val;
-                im += sign * in_re * sin_val + in_im * cos_val;
-            }
+            int idx = (k * n * sin_cos_step) % (n_sin_cos_vals); // t = 2*M_PI*k*n/N
+            re += in[n] * g_cache.cos_vals[idx]; // cos(t)
+            im -= in[n] * g_cache.sin_vals[idx]; // sin(t)
        }

-        out[k * 2 + 0] = re * scale;
-        out[k * 2 + 1] = im * scale;
+        out[k*2 + 0] = re;
+        out[k*2 + 1] = im;
    }
 }

-// Cooley-Tukey FFT/IFFT unified implementation
-// Template parameters:
-//   Inverse: false = FFT with exp(-2πi·k/N), no scaling
-//            true  = IFFT with exp(+2πi·k/N), scales by 0.5 at each level
-//   RealInput: true = input is real-valued (stride 1)
-//              false = input is complex-valued (interleaved real/imag, stride 2)
-template <bool Inverse, bool RealInput>
-static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) {
-    const int n_sin_cos_vals = cache.sin_vals.size();
-
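+// Cooley-Tukey FFT (recursive; falls back to dft() when N is odd)
+// poor man's implementation - use something better
+// input is real-valued; `in` is also used as scratch space starting at in[N]
+// output is complex-valued (interleaved re/im); `out` is also used as scratch starting at out[2*N]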
+static void fft(float * in, int N, float * out) {
+    const int n_sin_cos_vals = g_cache.sin_vals.size();
    if (N == 1) {
        out[0] = in[0];
-        if constexpr (RealInput) {
-            out[1] = 0.0f;
-        } else {
-            out[1] = in[1];
-        }
+        out[1] = 0;
        return;
    }

    const int half_N = N / 2;
-    if (N - half_N * 2 == 1) {
-        // Odd N: fall back to DFT
-        dft_impl<Inverse, RealInput>(cache, in, N, out);
+    if (N - half_N*2 == 1) {
+        dft(in, N, out);
        return;
    }

-    // Split into even and odd
-    if constexpr (RealInput) {
-        // Real input: stride is 1, copy only real values
-        float * even = in + N;
-        for (int i = 0; i < half_N; ++i) {
-            even[i] = in[2 * i];
-        }
-        float * even_fft = out + 2 * N;
-        fft_impl<Inverse, true>(cache, even, half_N, even_fft);
-
-        float * odd = even;
-        for (int i = 0; i < half_N; ++i) {
-            odd[i] = in[2 * i + 1];
-        }
-        float * odd_fft = even_fft + N;
-        fft_impl<Inverse, true>(cache, odd, half_N, odd_fft);
-    } else {
-        // Complex input: stride is 2, copy complex pairs
-        float * even = in + N * 2;
-        for (int i = 0; i < half_N; ++i) {
-            even[i * 2 + 0] = in[2 * i * 2 + 0];
-            even[i * 2 + 1] = in[2 * i * 2 + 1];
-        }
-        float * even_fft = out + 2 * N;
-        fft_impl<Inverse, false>(cache, even, half_N, even_fft);
-
-        float * odd = even;
-        for (int i = 0; i < half_N; ++i) {
-            odd[i * 2 + 0] = in[(2 * i + 1) * 2 + 0];
-            odd[i * 2 + 1] = in[(2 * i + 1) * 2 + 1];
-        }
-        float * odd_fft = even_fft + N;
-        fft_impl<Inverse, false>(cache, odd, half_N, odd_fft);
+    float* even = in + N;
+    for (int i = 0; i < half_N; ++i) {
+        even[i]= in[2*i];
    }
+    float* even_fft = out + 2 * N;
+    fft(even, half_N, even_fft);

-    float * even_fft = out + 2 * N;
-    float * odd_fft  = even_fft + N;
+    float* odd = even;
+    for (int i = 0; i < half_N; ++i) {
+        odd[i] = in[2*i + 1];
+    }
+    float* odd_fft = even_fft + N;
+    fft(odd, half_N, odd_fft);

    const int sin_cos_step = n_sin_cos_vals / N;
-
-    constexpr float sign  = Inverse ? 1.0f : -1.0f;
-    constexpr float scale = Inverse ? 0.5f : 1.0f;
-
    for (int k = 0; k < half_N; k++) {
-        int   idx = k * sin_cos_step;  // t = 2*M_PI*k/N
-        float re  = cache.cos_vals[idx];
-        float im  = sign * cache.sin_vals[idx];
+        int idx = k * sin_cos_step; // t = 2*M_PI*k/N
+        float re =  g_cache.cos_vals[idx]; // cos(t)
+        float im = -g_cache.sin_vals[idx]; // sin(t)

-        float re_odd = odd_fft[2 * k + 0];
-        float im_odd = odd_fft[2 * k + 1];
+        float re_odd = odd_fft[2*k + 0];
+        float im_odd = odd_fft[2*k + 1];

-        out[2 * k + 0] = scale * (even_fft[2 * k + 0] + re * re_odd - im * im_odd);
-        out[2 * k + 1] = scale * (even_fft[2 * k + 1] + re * im_odd + im * re_odd);
+        out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd;
+        out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd;

-        out[2 * (k + half_N) + 0] = scale * (even_fft[2 * k + 0] - re * re_odd + im * im_odd);
-        out[2 * (k + half_N) + 1] = scale * (even_fft[2 * k + 1] - re * im_odd - im * re_odd);
+        out[2*(k + half_N) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd;
+        out[2*(k + half_N) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd;
    }
 }

-// Forward FFT for real input (used by mel spectrogram)
-static void fft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
-    fft_impl<false, true>(cache, in, N, out);
-}
-
-// Inverse FFT for complex input
-static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
-    fft_impl<true, false>(cache, in, N, out);
-}
-
 struct filter_params {
    int32_t n_mel;
    int32_t n_fft_bins;
@@ -265,27 +222,20 @@ struct filter_params {
    bool    norm_per_feature = false;
 };

-static void log_mel_spectrogram_worker_thread(int                        ith,
-                                              const float *              hann,
-                                              const std::vector<float> & samples,
-                                              int                        n_samples,
-                                              int                        frame_size,
-                                              int                        frame_step,
-                                              int                        n_threads,
-                                              const filter_params &      params,
-                                              const mtmd_audio_cache &   cache,
-                                              mtmd_audio_mel &           out) {
+static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
+                                              int n_samples, int frame_size, int frame_step, int n_threads,
+                                              const filter_params & params, mtmd_audio_mel & out) {
    std::vector<float> fft_in(frame_size * 2, 0.0);
    std::vector<float> fft_out(frame_size * 2 * 2 * 2);

    int n_fft_bins = params.n_fft_bins;
    int i = ith;

-    const auto & filters = cache.filters;
+    const auto & filters = g_cache.filters;

    // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist
    GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2));
-    GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size());
+    GGML_ASSERT(g_cache.sin_vals.size() == g_cache.cos_vals.size());
    // calculate FFT only when fft_in are not all zero
    for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) {
        const int offset = i * frame_step;
@@ -301,7 +251,7 @@ static void log_mel_spectrogram_worker_thread(int                        ith,
        }

        // FFT
-        fft(cache, fft_in.data(), frame_size, fft_out.data());
+        fft(fft_in.data(), frame_size, fft_out.data());

        // Calculate modulus^2 of complex numbers
        // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting.
@@ -348,7 +298,6 @@ static bool log_mel_spectrogram(
        const int     n_samples_in,
        const int     n_threads,
        const filter_params & params,
-        const mtmd_audio_cache & cache,
        mtmd_audio_mel & out) {
    //const int64_t t_start_us = ggml_time_us();

@@ -356,9 +305,9 @@ static bool log_mel_spectrogram(
    int n_samples = n_samples_in;

    // Hann window
-    const float * hann       = cache.hann_window.data();
-    const int     frame_size = (params.n_fft_bins - 1) * 2;
-    const int     frame_step = params.hop_length;
+    const float * hann = g_cache.hann_window.data();
+    const int frame_size = (params.n_fft_bins - 1) * 2;
+    const int frame_step = params.hop_length;

    // Padding
    std::vector<float> samples_padded;
@@ -386,9 +335,9 @@ static bool log_mel_spectrogram(

    // preemphasis
    if (params.preemph) {
-        const int   pad_amount = frame_size / 2;
+        const int pad_amount = frame_size / 2;
        const float preemph = 0.97f;
-        float       prev = samples_padded[pad_amount];
+        float prev = samples_padded[pad_amount];
        for (int i = pad_amount + 1; i + pad_amount < n_samples; ++i) {
            float cur = samples_padded[i];
            samples_padded[i] = cur - preemph * prev;
@@ -423,14 +372,14 @@ static bool log_mel_spectrogram(
    {
        std::vector<std::thread> workers(n_threads - 1);
        for (int iw = 0; iw < n_threads - 1; ++iw) {
-            workers[iw] =
-                std::thread(log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded), n_samples,
-                            frame_size, frame_step, n_threads, std::cref(params), std::cref(cache), std::ref(out));
+            workers[iw] = std::thread(
+                    log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded),
+                    n_samples, frame_size, frame_step, n_threads,
+                    std::cref(params), std::ref(out));
        }

        // main thread
-        log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params,
-                                          cache, out);
+        log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params, out);
        for (int iw = 0; iw < n_threads - 1; ++iw) {
            workers[iw].join();
        }
@@ -455,7 +404,7 @@ static bool log_mel_spectrogram(

            for (int j = 0; j < effective_n_len; ++j) {
                auto &value = out.data[i * out.n_len + j];
-                value        = (value - mean) / mstd;
+                value = (value - mean) / mstd;
            }

            // pad the rest with zeros
@@ -501,14 +450,18 @@ static bool log_mel_spectrogram(
 //

 void mtmd_audio_preprocessor_whisper::initialize() {
-    cache.fill_sin_cos_table(hparams.audio_n_fft);
-    cache.fill_hann_window(hparams.audio_window_len, true);
-    cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
+    g_cache.fill_sin_cos_table(hparams.audio_n_fft);
+    g_cache.fill_hann_window(hparams.audio_window_len, true);
+    g_cache.fill_mel_filterbank_matrix(
+        hparams.n_mel_bins,
+        hparams.audio_n_fft,
+        hparams.audio_sample_rate);
 }

-bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 samples,
-                                                 size_t                        n_samples,
-                                                 std::vector<mtmd_audio_mel> & output) {
+bool mtmd_audio_preprocessor_whisper::preprocess(
+        const float * samples,
+        size_t n_samples,
+        std::vector<mtmd_audio_mel> & output) {
    if (n_samples == 0) {
        // empty audio
        return false;
@@ -518,7 +471,7 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 s
    // if input is too short, pad with zeros
    // this is to avoid potential issues with stage1/2 padding in log_mel_spectrogram
    // TODO: maybe handle this better
-    size_t min_samples = (size_t) hparams.audio_sample_rate * (hparams.audio_chunk_len + 1);  // +1 second margin
+    size_t min_samples = (size_t)hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin
    if (n_samples < min_samples) {
        smpl.resize(min_samples, 0.0f);
        std::memcpy(smpl.data(), samples, n_samples * sizeof(float));
@@ -533,19 +486,22 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 s
    params.hop_length       = hparams.audio_hop_len;
    params.sample_rate      = hparams.audio_sample_rate;
    params.center_padding   = false;
-    params.preemph          = 0.0f;  // disabled
+    params.preemph          = 0.0f; // disabled
    params.use_natural_log  = false;
    params.norm_per_feature = false;

-    // make sure the cache is initialized
-    GGML_ASSERT(!cache.sin_vals.empty());
-    GGML_ASSERT(!cache.cos_vals.empty());
-    GGML_ASSERT(!cache.filters.data.empty());
+    // make sure the global cache is initialized
+    GGML_ASSERT(!g_cache.sin_vals.empty());
+    GGML_ASSERT(!g_cache.cos_vals.empty());
+    GGML_ASSERT(!g_cache.filters.data.empty());

    mtmd_audio_mel out_full;
-    bool           ok = log_mel_spectrogram(samples, n_samples,
-                                            4,  // n_threads
-                                            params, cache, out_full);
+    bool ok = log_mel_spectrogram(
+                samples,
+                n_samples,
+                4, // n_threads
+                params,
+                out_full);
    if (!ok) {
        return false;
    }
@@ -556,21 +512,21 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 s
        printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len);
    }
    const size_t frames_per_chunk = 3000;
-    GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk);
-    for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) {
-        int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off);
-        if ((size_t) n_len < frames_per_chunk) {
-            break;  // last uncomplete chunk will always be a padded chunk, safe to ignore
+    GGML_ASSERT((size_t)out_full.n_len > frames_per_chunk);
+    for (size_t off = 0; off < (size_t)out_full.n_len; off += frames_per_chunk) {
+        int n_len = std::min(frames_per_chunk, (size_t)out_full.n_len - off);
+        if ((size_t)n_len < frames_per_chunk) {
+            break; // last incomplete chunk will always be a padded chunk, safe to ignore
        }

        mtmd_audio_mel out_chunk;
        out_chunk.n_len     = n_len;
        out_chunk.n_mel     = out_full.n_mel;
-        out_chunk.n_len_org = out_full.n_mel;  // unused
+        out_chunk.n_len_org = out_full.n_mel; // unused
        out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len);

        for (int i = 0; i < out_full.n_mel; i++) {
-            auto src = out_full.data.begin() + i * out_full.n_len + off;
+            auto src = out_full.data.begin() + i*out_full.n_len + off;
            out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk);
        }

@@ -585,14 +541,18 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 s
 //

 void mtmd_audio_preprocessor_conformer::initialize() {
-    cache.fill_sin_cos_table(hparams.audio_n_fft);
-    cache.fill_hann_window(hparams.audio_window_len, true);
-    cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
+    g_cache.fill_sin_cos_table(hparams.audio_n_fft);
+    g_cache.fill_hann_window(hparams.audio_window_len, true);
+    g_cache.fill_mel_filterbank_matrix(
+        hparams.n_mel_bins,
+        hparams.audio_n_fft,
+        hparams.audio_sample_rate);
 }

-bool mtmd_audio_preprocessor_conformer::preprocess(const float *                 samples,
-                                                   size_t                        n_samples,
-                                                   std::vector<mtmd_audio_mel> & output) {
+bool mtmd_audio_preprocessor_conformer::preprocess(
+        const float * samples,
+        size_t n_samples,
+        std::vector<mtmd_audio_mel> & output) {
    // empty audio
    if (n_samples == 0) {
        return false;
@@ -609,15 +569,18 @@ bool mtmd_audio_preprocessor_conformer::preprocess(const float *
    params.use_natural_log  = true;
    params.norm_per_feature = true;

-    // make sure the cache is initialized
-    GGML_ASSERT(!cache.sin_vals.empty());
-    GGML_ASSERT(!cache.cos_vals.empty());
-    GGML_ASSERT(!cache.filters.data.empty());
+    // make sure the global cache is initialized
+    GGML_ASSERT(!g_cache.sin_vals.empty());
+    GGML_ASSERT(!g_cache.cos_vals.empty());
+    GGML_ASSERT(!g_cache.filters.data.empty());

    mtmd_audio_mel out_full;
-    bool           ok = log_mel_spectrogram(samples, n_samples,
-                                            4,  // n_threads
-                                            params, cache, out_full);
+    bool ok = log_mel_spectrogram(
+                samples,
+                n_samples,
+                4, // n_threads
+                params,
+                out_full);
    if (!ok) {
        return false;
    }
@@ -625,106 +588,3 @@ bool mtmd_audio_preprocessor_conformer::preprocess(const float *
    output.push_back(std::move(out_full));
    return true;
 }
-
-//
-// mtmd_audio_streaming_istft implementation
-//
-
-mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length) :
-    n_fft(n_fft),
-    hop_length(hop_length),
-    n_fft_bins(n_fft / 2 + 1),
-    overlap_buffer(n_fft, 0.0f),
-    window_sum_buffer(n_fft, 0.0f),
-    padding_to_remove((n_fft - hop_length) / 2),
-    ifft_in(n_fft * 2 * 4, 0.0f),  // extra space for recursive IFFT
-    ifft_out(n_fft * 2 * 4, 0.0f) {
-    cache.fill_sin_cos_table(n_fft);
-    cache.fill_hann_window(n_fft, true);
-}
-
-void mtmd_audio_streaming_istft::reset() {
-    std::fill(overlap_buffer.begin(), overlap_buffer.end(), 0.0f);
-    std::fill(window_sum_buffer.begin(), window_sum_buffer.end(), 0.0f);
-    padding_to_remove = (n_fft - hop_length) / 2;
-}
-
-std::vector<float> mtmd_audio_streaming_istft::process_frame(const float * frame_spectrum) {
-    std::vector<float> output(hop_length);
-
-    // copy frequencies
-    for (int j = 0; j < n_fft_bins; j++) {
-        ifft_in[j * 2 + 0] = frame_spectrum[j * 2 + 0];
-        ifft_in[j * 2 + 1] = frame_spectrum[j * 2 + 1];
-    }
-
-    // mirror negative frequencies
-    for (int j = 1; j < n_fft_bins - 1; j++) {
-        int mirror_idx              = n_fft - j;
-        ifft_in[mirror_idx * 2 + 0] = ifft_in[j * 2 + 0];
-        ifft_in[mirror_idx * 2 + 1] = -ifft_in[j * 2 + 1];  // conjugate
-    }
-
-    ifft(cache, ifft_in.data(), n_fft, ifft_out.data());
-
-    // update window sum and overlap buffer
-    for (int j = 0; j < n_fft; j++) {
-        window_sum_buffer[j] += cache.hann_window[j] * cache.hann_window[j];
-        overlap_buffer[j] += ifft_out[j * 2] * cache.hann_window[j];
-    }
-
-    // extract hop_length samples with normalization
-    for (int i = 0; i < hop_length; i++) {
-        if (window_sum_buffer[i] > 1e-8f) {
-            output[i] = overlap_buffer[i] / window_sum_buffer[i];
-        } else {
-            output[i] = overlap_buffer[i];
-        }
-    }
-
-    // shift buffers left by hop_length
-    std::copy(overlap_buffer.begin() + hop_length, overlap_buffer.end(), overlap_buffer.begin());
-    std::fill(overlap_buffer.end() - hop_length, overlap_buffer.end(), 0.0f);
-
-    std::copy(window_sum_buffer.begin() + hop_length, window_sum_buffer.end(), window_sum_buffer.begin());
-    std::fill(window_sum_buffer.end() - hop_length, window_sum_buffer.end(), 0.0f);
-
-    // Remove padding if needed
-    int to_remove = std::min(padding_to_remove, (int) output.size());
-    padding_to_remove -= to_remove;
-    output.erase(output.begin(), output.begin() + to_remove);
-
-    return output;
-}
-
-std::vector<float> mtmd_audio_streaming_istft::flush() {
-    std::vector<float> output;
-
-    // Extract remaining samples from overlap buffer
-    // Continue until we've extracted all meaningful samples
-    int remaining = n_fft - hop_length;
-    while (remaining > 0) {
-        int chunk_size = std::min(remaining, hop_length);
-
-        for (int i = 0; i < chunk_size; i++) {
-            float sample;
-            if (window_sum_buffer[i] > 1e-8f) {
-                sample = overlap_buffer[i] / window_sum_buffer[i];
-            } else {
-                sample = overlap_buffer[i];
-            }
-            output.push_back(sample);
-        }
-
-        // Shift buffers
-        std::copy(overlap_buffer.begin() + chunk_size, overlap_buffer.end(), overlap_buffer.begin());
-        std::fill(overlap_buffer.end() - chunk_size, overlap_buffer.end(), 0.0f);
-
-        std::copy(window_sum_buffer.begin() + chunk_size, window_sum_buffer.end(), window_sum_buffer.begin());
-        std::fill(window_sum_buffer.end() - chunk_size, window_sum_buffer.end(), 0.0f);
-
-        remaining -= chunk_size;
-    }
-
-    return output;
-}
--- a/tools/mtmd/mtmd-audio.h
+++ b/tools/mtmd/mtmd-audio.h
@@ -17,38 +17,6 @@ struct mtmd_audio_mel {
    std::vector<float> data;
 };

-struct mtmd_audio_mel_filters {
-    int32_t n_mel;
-    int32_t n_fft;
-
-    std::vector<float> data;
-};
-
-// cache for audio processing, each processor instance owns its own cache
-struct mtmd_audio_cache {
-    std::vector<float> sin_vals;
-    std::vector<float> cos_vals;
-
-    std::vector<float> hann_window;
-
-    mtmd_audio_mel_filters filters;
-
-    void fill_sin_cos_table(int n);
-
-    void fill_hann_window(int length, bool periodic);
-
-    // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
-    // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
-    void fill_mel_filterbank_matrix(int   n_mel,
-                                    int   n_fft,
-                                    int   sample_rate,               // e.g. 16000
-                                    float fmin             = 0.0f,   // e.g. 0.0
-                                    float fmax             = -1.0f,  // e.g. sr/2; pass -1 for auto
-                                    bool  slaney_area_norm = true,
-                                    float scale = 1.0f  // optional extra scaling
-    );
-};
-
 struct mtmd_audio_preprocessor {
    const clip_hparams & hparams;

@@ -63,51 +31,10 @@ struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
    mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
    void initialize() override;
    bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
-
-  private:
-    mtmd_audio_cache cache;
 };

 struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
    mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
    void initialize() override;
    bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
-
-  private:
-    mtmd_audio_cache cache;
-};
-
-//
-// streaming ISTFT - converts spectrogram frames back to audio one frame at a time
-//
-struct mtmd_audio_streaming_istft {
-    mtmd_audio_streaming_istft(int n_fft, int hop_length);
-
-    // reset streaming state
-    void reset();
-
-    // process a single STFT frame (streaming)
-    // frame_spectrum: [n_fft_bins x 2] interleaved real/imag
-    // returns: up to hop_length samples
-    std::vector<float> process_frame(const float * frame_spectrum);
-
-    // flush remaining samples at end of stream
-    std::vector<float> flush();
-
-  private:
-    int n_fft;
-    int hop_length;
-    int n_fft_bins;
-
-    // Own cache for output processing
-    mtmd_audio_cache cache;
-
-    // Streaming state
-    std::vector<float> overlap_buffer;
-    std::vector<float> window_sum_buffer;
-    int                padding_to_remove;
-
-    // Working buffers for IFFT
-    std::vector<float> ifft_in;
-    std::vector<float> ifft_out;
 };
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -1,5 +1,4 @@
 #include "arg.h"
-#include "debug.h"
 #include "log.h"
 #include "common.h"
 #include "sampling.h"
@@ -89,8 +88,6 @@ struct mtmd_cli_context {
    int n_threads    = 1;
    llama_pos n_past = 0;

-    base_callback_data cb_data;
-
    mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
        model = llama_init->model();
        lctx = llama_init->context();
@@ -142,10 +139,6 @@ struct mtmd_cli_context {
        mparams.warmup           = params.warmup;
        mparams.image_min_tokens = params.image_min_tokens;
        mparams.image_max_tokens = params.image_max_tokens;
-        if (std::getenv("MTMD_DEBUG_GRAPH") != nullptr) {
-            mparams.cb_eval_user_data = &cb_data;
-            mparams.cb_eval = common_debug_cb_eval<false>;
-        }
        ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
        if (!ctx_vision.get()) {
            LOG_ERR("Failed to load vision model from %s\n", clip_path);
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -111,8 +111,6 @@ mtmd_context_params mtmd_context_params_default() {
        /* warmup            */ true,
        /* image_min_tokens  */ -1,
        /* image_max_tokens  */ -1,
-        /* cb_eval           */ nullptr,
-        /* cb_eval_user_data */ nullptr,
    };
    return params;
 }
@@ -148,6 +146,8 @@ struct mtmd_context {
    bool        tok_row_end_trail = false;
    bool        ov_img_first      = false;

+    bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
+
    // string template for slice image delimiters with row/col (idefics3)
    std::string sli_img_start_tmpl;

@@ -178,8 +178,6 @@ struct mtmd_context {
            /* image_min_tokens  */ ctx_params.image_min_tokens,
            /* image_max_tokens  */ ctx_params.image_max_tokens,
            /* warmup            */ ctx_params.warmup,
-            /* cb_eval           */ ctx_params.cb_eval,
-            /* cb_eval_user_data */ ctx_params.cb_eval_user_data,
        };

        auto res = clip_init(mmproj_fname, ctx_clip_params);
@@ -219,6 +217,7 @@ struct mtmd_context {

    void init_vision() {
        GGML_ASSERT(ctx_v != nullptr);
+        use_mrope = clip_is_mrope(ctx_v);

        projector_type proj = clip_get_projector_type(ctx_v);
        int minicpmv_version = clip_is_minicpmv(ctx_v);
@@ -267,7 +266,7 @@ struct mtmd_context {
        }

        // set boi/eoi
-        if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
+        if (proj == PROJECTOR_TYPE_GEMMA3) {
            // <start_of_image> ... (image embeddings) ... <end_of_image>
            img_beg = "<start_of_image>";
            img_end = "<end_of_image>";
@@ -284,7 +283,7 @@ struct mtmd_context {
            // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
            img_end = "[IMG_END]";

-        } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
+        } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) {
            // <|vision_start|> ... (image embeddings) ... <|vision_end|>
            img_beg = "<|vision_start|>";
            img_end = "<|vision_end|>";
@@ -331,7 +330,6 @@ struct mtmd_context {
            case PROJECTOR_TYPE_ULTRAVOX:
            case PROJECTOR_TYPE_VOXTRAL:
            case PROJECTOR_TYPE_GLMA:
-            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
                audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
                break;
            case PROJECTOR_TYPE_LFM2A:
@@ -354,9 +352,6 @@ struct mtmd_context {
            // [BEGIN_AUDIO] ... (embeddings) ...
            aud_beg = "[BEGIN_AUDIO]";

-        } else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
-            // <sound> ... (embeddings) ...
-            aud_beg = "<sound>";
        }
    }

@@ -628,7 +623,7 @@ struct mtmd_tokenizer {
                }

                mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-                if (mtmd_decode_use_mrope(ctx)) {
+                if (ctx->use_mrope) {
                    // for Qwen2VL, we need this information for M-RoPE decoding positions
                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
@@ -863,24 +858,14 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
 }

 bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
-    switch (ctx->proj_type_v()) {
-        case PROJECTOR_TYPE_GEMMA3:
-            return true;
-        default:
-            return false;
+    if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
+        return true;
    }
+    return false;
 }

 bool mtmd_decode_use_mrope(mtmd_context * ctx) {
-    switch (ctx->proj_type_v()) {
-        case PROJECTOR_TYPE_QWEN2VL:
-        case PROJECTOR_TYPE_QWEN25VL:
-        case PROJECTOR_TYPE_QWEN3VL:
-        case PROJECTOR_TYPE_GLM4V:
-            return true;
-        default:
-            return false;
-    }
+    return ctx->use_mrope;
 }

 bool mtmd_support_vision(mtmd_context * ctx) {
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -27,9 +27,6 @@
 * - Make sure the C API is aligned with the libllama C API (as in llama.h)
 * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
 * - Keep the API minimal, do not expose internal details unless necessary
- *
- * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
- * We encourage human contributors to ensure the quality and reliability of the codebase.
 */

 #ifdef LLAMA_SHARED
@@ -95,10 +92,6 @@ struct mtmd_context_params {
    // limit number of image tokens, only for vision models with dynamic resolution
    int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
    int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
-
-    // callback function passed over to mtmd proper
-    ggml_backend_sched_eval_callback cb_eval;
-    void * cb_eval_user_data;
 };

 MTMD_API const char * mtmd_default_marker(void);
@@ -277,12 +270,12 @@ struct bitmap {
        ptr.reset(mtmd_bitmap_init(nx, ny, data));
    }
    ~bitmap() = default;
-    uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
-    uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
-    const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); }
-    size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); }
-    std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }
-    void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); }
+    uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
+    uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
+    const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
+    size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
+    std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
+    void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
 };

 struct bitmaps {
@@ -306,8 +299,8 @@ struct input_chunks {
    input_chunks() = default;
    input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
    ~input_chunks() = default;
-    size_t size() const { return mtmd_input_chunks_size(ptr.get()); }
-    const mtmd_input_chunk * operator[](size_t idx) const {
+    size_t size() { return mtmd_input_chunks_size(ptr.get()); }
+    const mtmd_input_chunk * operator[](size_t idx) {
        return mtmd_input_chunks_get(ptr.get(), idx);
    }
 };
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -12,7 +12,6 @@
 #include <cmath>
 #include <cctype>
 #include <algorithm>
-#include <filesystem>

 struct quant_option {
    std::string name;
@@ -644,11 +643,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) {
-        fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str());
-        return 1;
-    }
-
    print_build_info();

    fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
--- a/tools/run/CMakeLists.txt
+++ b/tools/run/CMakeLists.txt
@@ -0,0 +1,23 @@
+set(TARGET llama-run)
+add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp)
+
+# TODO: avoid copying this code block from common/CMakeLists.txt
+set(LLAMA_RUN_EXTRA_LIBS "")
+if (LLAMA_CURL)
+    find_package(CURL REQUIRED)
+    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
+    include_directories(${CURL_INCLUDE_DIRS})
+    set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARIES})
+endif ()
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
+
+if (CMAKE_SYSTEM_NAME MATCHES "AIX")
+    # AIX's flock() function comes from libbsd.a
+    target_link_libraries(${TARGET} PRIVATE -lbsd)
+endif()
+
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/tools/run/README.md
+++ b/tools/run/README.md
@@ -0,0 +1,52 @@
+# llama.cpp/tools/run
+
+The purpose of this tool is to demonstrate minimal usage of llama.cpp for running models.
+
+```bash
+llama-run granite3-moe
+```
+
+```bash
+Description:
+  Runs a llm
+
+Usage:
+  llama-run [options] model [prompt]
+
+Options:
+  -c, --context-size <value>
+      Context size (default: 2048)
+  -n, -ngl, --ngl <value>
+      Number of GPU layers (default: 0)
+  --temp <value>
+      Temperature (default: 0.8)
+  -v, --verbose, --log-verbose
+      Set verbosity level to infinity (i.e. log all messages, useful for debugging)
+  -h, --help
+      Show help message
+
+Commands:
+  model
+      Model is a string with an optional prefix of
+      huggingface:// (hf://), ollama://, https:// or file://.
+      If no protocol is specified and a file exists in the specified
+      path, file:// is assumed, otherwise if a file does not exist in
+      the specified path, ollama:// is assumed. Models that are being
+      pulled are downloaded with .partial extension while being
+      downloaded and then renamed as the file without the .partial
+      extension when complete.
+
+Examples:
+  llama-run llama3
+  llama-run ollama://granite-code
+  llama-run ollama://smollm:135m
+  llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf
+  llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf
+  llama-run ms://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf
+  llama-run modelscope://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf
+  llama-run https://example.com/some-file1.gguf
+  llama-run some-file2.gguf
+  llama-run file://some-file3.gguf
+  llama-run --ngl 999 some-file4.gguf
+  llama-run --ngl 999 some-file5.gguf Hello World
+```
--- a/tools/run/linenoise.cpp/linenoise.cpp
+++ b/tools/run/linenoise.cpp/linenoise.cpp
--- a/tools/run/linenoise.cpp/linenoise.h
+++ b/tools/run/linenoise.cpp/linenoise.h
@@ -0,0 +1,137 @@
+/* linenoise.h -- VERSION 1.0
+ *
+ * Guerrilla line editing library against the idea that a line editing lib
+ * needs to be 20,000 lines of C++ code.
+ *
+ * See linenoise.cpp for more information.
+ *
+ * ------------------------------------------------------------------------
+ *
+ * Copyright (c) 2010-2023, Salvatore Sanfilippo <antirez at gmail dot com>
+ * Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
+ * Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *  *  Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  *  Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __LINENOISE_H
+#define __LINENOISE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stddef.h> /* For size_t. */
+#include <stdlib.h>
+
+extern const char * linenoiseEditMore;
+
+/* The linenoiseState structure represents the state during line editing.
+ * We pass this state to functions implementing specific editing
+ * functionalities. */
+struct linenoiseState {
+    int          in_completion;  /* The user pressed TAB and we are now in completion
+                                  * mode, so input is handled by completeLine(). */
+    size_t       completion_idx; /* Index of next completion to propose. */
+    int          ifd;            /* Terminal stdin file descriptor. */
+    int          ofd;            /* Terminal stdout file descriptor. */
+    char *       buf;            /* Edited line buffer. */
+    size_t       buflen;         /* Edited line buffer size. */
+    const char * prompt;         /* Prompt to display. */
+    size_t       plen;           /* Prompt length. */
+    size_t       pos;            /* Current cursor position. */
+    size_t       oldcolpos;      /* Previous refresh cursor column position. */
+    size_t       len;            /* Current edited line length. */
+    size_t       cols;           /* Number of columns in terminal. */
+    size_t       oldrows;        /* Rows used by last refreshed line (multiline mode) */
+    int          history_index;  /* The history index we are currently editing. */
+};
+
+struct linenoiseCompletions {
+    size_t  len     = 0;       /* Number of entries in cvec that the destructor frees. */
+    char ** cvec    = nullptr; /* Array of C strings; each entry and the array are released with free(). */
+    bool    to_free = true;    /* When false, the destructor returns without freeing anything. */
+
+    ~linenoiseCompletions() {
+        if (!to_free) {
+            return;
+        }
+
+        for (size_t i = 0; i < len; ++i) {
+            free(cvec[i]);
+        }
+
+        free(cvec);
+    }
+};
+
+/* Non blocking API. */
+int          linenoiseEditStart(struct linenoiseState * l, int stdin_fd, int stdout_fd, char * buf, size_t buflen,
+                                const char * prompt);
+const char * linenoiseEditFeed(struct linenoiseState * l);
+void         linenoiseEditStop(struct linenoiseState * l);
+void         linenoiseHide(struct linenoiseState * l);
+void         linenoiseShow(struct linenoiseState * l);
+
+/* Blocking API. */
+const char * linenoise(const char * prompt);
+void         linenoiseFree(void * ptr);
+
+/* Completion API. */
+typedef void(linenoiseCompletionCallback)(const char *, linenoiseCompletions *);
+typedef const char *(linenoiseHintsCallback) (const char *, int * color, int * bold);
+typedef void(linenoiseFreeHintsCallback)(const char *);
+void linenoiseSetCompletionCallback(linenoiseCompletionCallback *);
+void linenoiseSetHintsCallback(linenoiseHintsCallback *);
+void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *);
+void linenoiseAddCompletion(linenoiseCompletions *, const char *);
+
+/* History API. */
+int linenoiseHistoryAdd(const char * line);
+int linenoiseHistorySetMaxLen(int len);
+int linenoiseHistorySave(const char * filename);
+int linenoiseHistoryLoad(const char * filename);
+
+/* Other utilities. */
+void linenoiseClearScreen(void);
+void linenoiseSetMultiLine(int ml);
+void linenoisePrintKeyCodes(void);
+void linenoiseMaskModeEnable(void);
+void linenoiseMaskModeDisable(void);
+
+/* Encoding functions. */
+typedef size_t(linenoisePrevCharLen)(const char * buf, size_t buf_len, size_t pos, size_t * col_len);
+typedef size_t(linenoiseNextCharLen)(const char * buf, size_t buf_len, size_t pos, size_t * col_len);
+typedef size_t(linenoiseReadCode)(int fd, char * buf, size_t buf_len, int * c);
+
+void linenoiseSetEncodingFunctions(linenoisePrevCharLen * prevCharLenFunc, linenoiseNextCharLen * nextCharLenFunc,
+                                   linenoiseReadCode * readCodeFunc);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __LINENOISE_H */
--- a/tools/run/run.cpp
+++ b/tools/run/run.cpp
--- a/tools/server/CMakeLists.txt
+++ b/tools/server/CMakeLists.txt
@@ -38,6 +38,14 @@ set(TARGET_SRCS
    server-http.h
    server-models.cpp
    server-models.h
+    server-task.cpp
+    server-task.h
+    server-queue.cpp
+    server-queue.h
+    server-common.cpp
+    server-common.h
+    server-context.cpp
+    server-context.h
 )
 set(PUBLIC_ASSETS
    index.html.gz
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -33,7 +33,6 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | -------- | ----------- |
 | `-h, --help, --usage` | print usage and exit |
 | `--version` | show version and build info |
-| `--license` | show source code license and dependencies |
 | `-cl, --cache-list` | show list of models in cache |
 | `--completion-bash` | print source-able bash completion script for llama.cpp |
 | `--verbose-prompt` | print a verbose prompt before generation (default: false) |
@@ -74,23 +73,22 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
-| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
-| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)<br/>(env: LLAMA_ARG_DIO) |
+| `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
 | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
 | `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
 | `--list-devices` | print list of available devices and exit |
-| `-ot, --override-tensor <tensor name pattern>=<buffer type>,...` | override tensor buffer type<br/>(env: LLAMA_ARG_OVERRIDE_TENSOR) |
+| `-ot, --override-tensor <tensor name pattern>=<buffer type>,...` | override tensor buffer type |
 | `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
 | `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
-| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
+| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM (default: -1)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
 | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
 | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
 | `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')<br/>(env: LLAMA_ARG_FIT) |
-| `-fitt, --fit-target MiB0,MiB1,MiB2,...` | target margin per device for --fit, comma-separated list of values, single value is broadcast across all devices, default: 1024<br/>(env: LLAMA_ARG_FIT_TARGET) |
+| `-fitt, --fit-target MiB` | target margin per device for --fit option, default: 1024<br/>(env: LLAMA_ARG_FIT_TARGET) |
 | `-fitc, --fit-ctx N` | minimum ctx size that can be set by --fit option, default: 4096<br/>(env: LLAMA_ARG_FIT_CTX) |
 | `--check-tensors` | check model tensor data for invalid values (default: false) |
-| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false |
+| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values or repeat this argument.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false |
 | `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) |
 | `--lora FNAME` | path to LoRA adapter (use comma-separated values to load multiple adapters) |
 | `--lora-scaled FNAME:SCALE,...` | path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)<br/>note: use comma-separated values |
@@ -130,8 +128,6 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
 | `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
 | `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
-| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) |
-| `--adaptive-decay N` | adaptive-p: EMA decay for adaptation; effective history length ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99) |
 | `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) |
 | `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
 | `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
@@ -155,7 +151,6 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `--grammar-file FNAME` | file to read grammar from |
 | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
 | `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
-| `-bs, --backend-sampling` | enable backend sampling (experimental) (default: disabled)<br/>(env: LLAMA_ARG_BACKEND_SAMPLING) |


 ### Server-specific params
@@ -192,11 +187,11 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `--webui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_WEBUI) |
 | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
 | `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
-| `--api-key KEY` | API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)<br/>(env: LLAMA_API_KEY) |
+| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
 | `--api-key-file FNAME` | path to file containing API keys (default: none) |
 | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
-| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
+| `--chat-template-kwargs STRING` | sets additional params for the json template parser<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
 | `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
@@ -212,8 +207,8 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
 | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
 | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
@@ -225,7 +220,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.8)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
 | `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
 | `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
-| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
+| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
 | `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
 | `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible |
 | `-mv, --model-vocoder FNAME` | vocoder model for audio generation (default: unused) |
@@ -784,8 +779,7 @@ By default, it is read-only. To make POST request to change global properties, y
  "modalities": {
    "vision": false
  },
-  "build_info": "b(build number)-(build commit hash)",
-  "is_sleeping": false
+  "build_info": "b(build number)-(build commit hash)"
 }
 ```

@@ -794,7 +788,6 @@ By default, it is read-only. To make POST request to change global properties, y
 - `model_path` - the path to model file (same with `-m` argument)
 - `chat_template` - the model's original Jinja2 prompt template
 - `modalities` - the list of supported modalities
- `is_sleeping` - sleeping status, see [Sleeping on idle](#sleeping-on-idle)

 ### POST `/props`: Change server global properties.

@@ -1493,7 +1486,6 @@ The precedence rule for preset options is as follows:

 We also offer additional options that are exclusive to presets (these aren't treated as command-line arguments):
 - `load-on-startup` (boolean): Controls whether the model loads automatically when the server starts
- `stop-timeout` (int, seconds): After requested unload, wait for this many seconds before forcing termination (default: 10)

 ### Routing requests

@@ -1582,7 +1574,8 @@ Payload:

 ```json
 {
-  "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
+  "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
+  "extra_args": ["-n", "128", "--top-k", "4"]
 }
 ```

@@ -1637,12 +1630,9 @@ The server supports an automatic sleep mode that activates after a specified per

 When the server enters sleep mode, the model and its associated memory (including the KV cache) are unloaded from RAM to conserve resources. Any new incoming task will automatically trigger the model to reload.

-The sleeping status can be retrieved from the `GET /props` endpoint (or `/props?model=(model_name)` in router mode).
-
 Note that the following endpoints are exempt from being considered as incoming tasks. They do not trigger model reloading and do not reset the idle timer:
 - `GET /health`
 - `GET /props`
- `GET /models`

 ## More examples

--- a/tools/server/public/index.html.gz
+++ b/tools/server/public/index.html.gz
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -1,10 +1,10 @@
 #include "common.h"
-#include "download.h"
 #include "log.h"
 #include "llama.h"
 #include "mtmd.h"
 #include "mtmd-helper.h"
 #include "chat.h"
+#include "arg.h" // for common_remote_get_content; TODO: use download.h only
 #include "base64.hpp"

 #include "server-common.h"
@@ -779,7 +779,7 @@ static void handle_media(
        // download remote image
        // TODO @ngxson : maybe make these params configurable
        common_remote_params params;
-        params.headers.push_back({"User-Agent", "llama.cpp/" + build_info});
+        params.headers.push_back("User-Agent: llama.cpp/" + build_info);
        params.max_size = 1024 * 1024 * 10; // 10MB
        params.timeout  = 10; // seconds
        SRV_INF("downloading image from '%s'\n", url.c_str());
@@ -1385,21 +1385,16 @@ json format_response_rerank(

 std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
    std::vector<llama_token_data> cur;
-
    const auto * logits = llama_get_logits_ith(ctx, idx);
-    const llama_token * sampled_ids = llama_get_sampled_candidates_ith(ctx, idx);

-    const int n_logits = llama_get_sampled_logits_count_ith(ctx, idx);
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);

-    cur.resize(n_logits);
-    if (sampled_ids) {
-        for (int i = 0; i < n_logits; i++) {
-            cur[i] = llama_token_data{sampled_ids[i], logits[i], 0.0f};
-        }
-    } else {
-        for (llama_token token_id = 0; token_id < n_logits; token_id++) {
-            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
-        }
+    const int n_vocab = llama_vocab_n_tokens(vocab);
+
+    cur.resize(n_vocab);
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
    }

    // sort tokens by logits
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -4,6 +4,7 @@
 #include "server-task.h"
 #include "server-queue.h"

+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 #include "log.h"
@@ -15,6 +16,7 @@
 #include <cstddef>
 #include <cinttypes>
 #include <memory>
+#include <unordered_set>
 #include <filesystem>

 // fix problem with std::min and std::max
@@ -45,6 +47,26 @@ enum server_state {
    SERVER_STATE_READY,          // Server is ready and model is loaded
 };

+static bool server_task_type_need_embd(server_task_type task_type) { // classifies a task type; backs server_slot::need_embd()
+    switch (task_type) {
+        case SERVER_TASK_TYPE_EMBEDDING:
+        case SERVER_TASK_TYPE_RERANK:
+            return true; // only EMBEDDING and RERANK tasks are reported as needing embeddings
+        default:
+            return false; // every other task type
+    }
+}
+
+static bool server_task_type_need_logits(server_task_type task_type) { // classifies a task type; backs server_slot::need_logits()
+    switch (task_type) {
+        case SERVER_TASK_TYPE_COMPLETION:
+        case SERVER_TASK_TYPE_INFILL:
+            return true; // only COMPLETION and INFILL tasks are reported as needing logits
+        default:
+            return false; // every other task type
+    }
+}
+
 struct server_slot {
    int id;

@@ -59,8 +81,6 @@ struct server_slot {

    common_speculative * spec = nullptr;

-    // TODO: move members that belong to the task (such as `generated_text`, `has_new_line`) to task_results_state
-    //       see https://github.com/ggml-org/llama.cpp/pull/18283#issuecomment-3710175837
    std::unique_ptr<const server_task> task;
    std::unique_ptr<const server_task> task_prev; // used for debugging

@@ -127,17 +147,6 @@ struct server_slot {
        return res;
    }

-    void prompt_clear(bool allow_processing) {
-        if (!allow_processing) {
-            GGML_ASSERT(!is_processing());
-        }
-
-        SLT_INF(*this, "clearing prompt with %zu tokens\n", prompt.tokens.size());
-
-        llama_memory_seq_rm(llama_get_memory(ctx), id, -1, -1);
-        prompt.tokens.clear();
-    }
-
    std::vector<common_adapter_lora_info> lora;
    int32_t alora_invocation_start = -1;

@@ -146,7 +155,7 @@ struct server_slot {

    common_sampler_ptr smpl;

-    llama_token  sampled; // in speculative mode, this is the last accepted token
+    llama_token sampled; // in speculative mode, this is the last accepted token
    llama_tokens drafted;

    // stats
@@ -158,7 +167,7 @@ struct server_slot {
    double t_prompt_processing; // ms
    double t_token_generation;  // ms

-    std::function<void(int /* slot_id */)> callback_on_release;
+    std::function<void(int)> callback_on_release;

    // Speculative decoding stats
    int32_t n_draft_total = 0;      // Total draft tokens generated
@@ -187,46 +196,30 @@ struct server_slot {
        n_draft_total = 0;
        n_draft_accepted = 0;

-        task_prev = std::move(task);
        task.reset();
-
-        llama_set_sampler(ctx, id, nullptr);
+        task_prev.reset();

        // clear alora start
        alora_invocation_start = -1;
    }

-    void init_sampler() const {
-        common_sampler_reset(smpl.get());
+    bool need_embd() const {
+        GGML_ASSERT(task);

-        if (!task->need_sampling()) {
-            return;
-        }
+        return server_task_type_need_embd(task->type);
+    }

-        const int64_t t_start = ggml_time_us();
+    bool need_logits() const {
+        GGML_ASSERT(task);

-        int n_text = 0;
-
-        for (int i = 0; i < (int) prompt.tokens.size(); i++) {
-            const llama_token id = prompt.tokens[i];
-
-            if (id != LLAMA_TOKEN_NULL) {
-                common_sampler_accept(smpl.get(), id, false);
-                n_text++;
-            }
-        }
-
-        SLT_INF(*this, "init sampler, took %0.2f ms, tokens: text = %d, total = %d\n",
-                (ggml_time_us() - t_start) / 1000.0, n_text, (int) prompt.tokens.size());
+        return server_task_type_need_logits(task->type);
    }

    // if the context does not have a memory module then all embeddings have to be computed within a single ubatch
    // also we cannot split if the pooling would require any past tokens
    bool can_split() const {
-        GGML_ASSERT(task);
-
        return
-            !task->need_embd() ||
+            !need_embd() ||
            (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
    }

@@ -267,13 +260,10 @@ struct server_slot {
            SLT_WRN(*this, "%s", "slot is not processing\n");
            return;
        }
-
        generated_token_probs.push_back(token);
    }

    int get_n_draft_max() const {
-        GGML_ASSERT(task);
-
        if (!can_speculate()) {
            return 0;
        }
@@ -298,23 +288,27 @@ struct server_slot {
        return n_draft_max;
    }

+    // note: a slot can also be either a parent or a child
+    bool is_parent() const { // true when the slot is processing a task that has at least one child
+        return is_processing() && task->n_children > 0; // is_processing() is checked first; presumably `task` is only set while processing
+    }
+
+    bool is_child() const { // true when the slot is processing a task that has a parent id (id_parent >= 0)
+        return is_processing() && task->id_parent >= 0; // an idle slot is never reported as a child
+    }
+
    void release() {
        if (is_processing()) {
            GGML_ASSERT(task);

            SLT_INF(*this, "stop processing: n_tokens = %d, truncated = %d\n", prompt.n_tokens(), truncated);

-            t_last_used        =  ggml_time_us();
+            t_last_used = ggml_time_us();
            t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
-
            state = SLOT_STATE_IDLE;

-            // do not keep context of the child slots - the parent's context is enough
-            if (task->is_child()) {
-                prompt_clear(false);
-            }
-
-            reset();
+            task_prev = std::move(task);
+            task.reset();

            callback_on_release(id);
        }
@@ -433,22 +427,14 @@ struct server_slot {
    }

    void copy_state_to(server_slot & other) const {
-        GGML_ASSERT(state == SLOT_STATE_DONE_PROMPT);
-
-        llama_memory_seq_rm(llama_get_memory(ctx), other.id,     -1, -1);
-        llama_memory_seq_cp(llama_get_memory(ctx), id, other.id, -1, -1);
-
+        llama_memory_seq_rm(llama_get_memory(ctx), other.id, 0, -1);
+        llama_memory_seq_cp(llama_get_memory(ctx), id, other.id, 0, -1);
        other.n_decoded   = n_decoded;
        other.n_remaining = n_remaining;
        other.i_batch     = i_batch;
-
-        other.t_start_process_prompt    = t_start_process_prompt;
-        other.t_prompt_processing       = t_prompt_processing;
        other.n_prompt_tokens_cache     = n_prompt_tokens_cache;
        other.n_prompt_tokens_processed = n_prompt_tokens_processed;
-
        other.prompt = prompt.clone();
-        other.init_sampler();
    }
 };

@@ -761,8 +747,6 @@ private:
        }

        slots.clear();
-
-        // initialize slots
        for (int i = 0; i < params_base.n_parallel; i++) {
            server_slot slot;

@@ -794,8 +778,8 @@ private:

            SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx);

-            slot.callback_on_release = [this](int slot_id) {
-                queue_tasks.pop_deferred_task(slot_id);
+            slot.callback_on_release = [this](int) {
+                queue_tasks.pop_deferred_task();
            };

            slot.reset();
@@ -909,9 +893,9 @@ private:
        return true;
    }

-    server_slot * get_slot_by_id(int id_slot) {
+    server_slot * get_slot_by_id(int id) {
        for (server_slot & slot : slots) {
-            if (slot.id == id_slot) {
+            if (slot.id == id) {
                return &slot;
            }
        }
@@ -1011,7 +995,7 @@ private:
                ret->prompt_save(*prompt_cache);

                if (!ret->prompt_load(*prompt_cache, task.tokens)) {
-                    ret->prompt_clear(false);
+                    clear_slot(*ret);
                }

                prompt_cache->update();
@@ -1023,6 +1007,15 @@ private:
        return ret;
    }

+    void clear_slot(server_slot & slot) const { // drops the slot's cached prompt tokens and its sequence in the context memory
+        GGML_ASSERT(!slot.is_processing()); // only a slot that is not processing may be cleared
+
+        SLT_WRN(slot, "clearing slot with %zu tokens\n", slot.prompt.tokens.size());
+
+        llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1); // NOTE(review): (-1, -1) presumably selects the whole position range - confirm against llama.h
+        slot.prompt.tokens.clear(); // keep the slot's token list in sync with the memory just removed
+    }
+
    // return true if at least one slot has been cleared
    // TODO: improve logic
    //       - smarter decision which slot to clear (LRU or longest prompt?)
@@ -1043,7 +1036,7 @@ private:
            if (slot.prompt.n_tokens() > 0) {
                SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size());

-                slot.prompt_clear(false);
+                clear_slot(slot);

                res = true;

@@ -1069,6 +1062,8 @@ private:
    }

    bool launch_slot_with_task(server_slot & slot, server_task && task) {
+        slot.reset();
+
        // process per-request lora adapters
        if (!task.params.lora.empty()) {
            auto task_loras = construct_lora_list(task.params.lora);
@@ -1142,7 +1137,7 @@ private:
        SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());

        // initialize samplers
-        if (task.need_sampling()) {
+        {
            slot.smpl.reset(common_sampler_init(model, task.params.sampling));

            if (slot.smpl == nullptr) {
@@ -1151,28 +1146,7 @@ private:
                return false;
            }

-            const bool need_logits = task.params.sampling.n_probs > 0;
-
-            bool backend_sampling = true;
-
-            backend_sampling &= task.params.sampling.backend_sampling;
-
-            // TODO: speculative decoding requires multiple samples per batch - not supported yet
-            backend_sampling &= !(slot.ctx_dft && task.params.speculative.n_max > 0);
-
-            // TODO: getting post/pre sampling logits is not yet supported with backend sampling
-            backend_sampling &= !need_logits;
-
-            // TODO: tmp until backend sampling is fully implemented
-            if (backend_sampling) {
-                llama_set_sampler(ctx, slot.id, common_sampler_get(slot.smpl.get()));
-            } else {
-                llama_set_sampler(ctx, slot.id, nullptr);
-            }
-
            SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str());
-        } else {
-            slot.smpl.reset();
        }

        // initialize draft batch
@@ -1185,11 +1159,12 @@ private:

        slot.task = std::make_unique<const server_task>(std::move(task));

-        slot.state = slot.task->is_child()
+        slot.state = slot.is_child()
            ? SLOT_STATE_WAIT_OTHER // wait for the parent to process prompt
            : SLOT_STATE_STARTED;

-        SLT_INF(slot, "processing task, is_child = %d\n", slot.task->is_child());
+        SLT_INF(slot, "%s", "processing task\n");
+
        return true;
    }

@@ -1509,9 +1484,9 @@ private:
        res->n_tokens  = slot.task->n_tokens();
        res->res_type  = slot.task->params.res_type;

-        const int n_embd_out = llama_model_n_embd_out(model);
+        const int n_embd = llama_model_n_embd(model);

-        std::vector<float> embd_res(n_embd_out, 0.0f);
+        std::vector<float> embd_res(n_embd, 0.0f);

        for (int i = 0; i < batch.n_tokens; ++i) {
            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
@@ -1528,18 +1503,18 @@ private:
            if (embd == nullptr) {
                SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);

-                res->embedding.push_back(std::vector<float>(n_embd_out, 0.0f));
+                res->embedding.push_back(std::vector<float>(n_embd, 0.0f));
                continue;
            }

            // normalize only when there is pooling
            if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) {
-                common_embd_normalize(embd, embd_res.data(), n_embd_out, slot.task->params.embd_normalize);
+                common_embd_normalize(embd, embd_res.data(), n_embd, slot.task->params.embd_normalize);
                res->embedding.push_back(embd_res);
                break;
            }

-            res->embedding.emplace_back(embd, embd + n_embd_out);
+            res->embedding.emplace_back(embd, embd + n_embd);
        }

        SLT_DBG(slot, "%s", "sending embeddings\n");
@@ -1584,7 +1559,9 @@ private:

    // tokenize the input if it's set by CLI, return false on error
    bool tokenize_cli_input(server_task & task) {
-        GGML_ASSERT(task.cli_input != nullptr);
+        if (task.cli_input == nullptr) {
+            return true; // nothing to do
+        }
        try {
            auto & opt = oai_parser_opt;
            common_chat_templates_inputs inputs;
@@ -1618,64 +1595,6 @@ private:
        return true;
    }

-    std::vector<server_slot *> get_free_slots(size_t n_slots_needed, int exclude_id_slot) {
-        std::vector<server_slot *> free_slots;
-        for (auto & slot : slots) {
-            if (!slot.is_processing() && slot.id != exclude_id_slot) {
-                free_slots.push_back(&slot);
-            }
-            if (free_slots.size() >= n_slots_needed) {
-                break;
-            }
-        }
-        return free_slots;
-    }
-
-    // launch multiple slots for parent + child tasks
-    bool launch_slots_with_parent_task(server_slot & parent_slot, std::vector<server_slot *> & child_slots, server_task && parent_task) {
-        GGML_ASSERT(!parent_slot.is_processing());
-        GGML_ASSERT(parent_task.is_parent());
-        GGML_ASSERT(child_slots.size() == parent_task.child_tasks.size());
-
-        int id_parent = parent_task.id;
-
-        SRV_INF("launching slots for parent task id_task = %d with %zu child tasks\n", id_parent, parent_task.child_tasks.size());
-
-        // to be called in case of failure to release all launched slots
-        auto release_slots = [this, id_parent]() {
-            for (auto & slot : slots) {
-                if (slot.is_processing() && (
-                        slot.task->id == id_parent ||
-                        slot.task->id_parent == id_parent
-                )) {
-                    slot.release();
-                }
-            }
-        };
-
-        // launch all child tasks first
-        size_t idx = 0;
-        GGML_ASSERT(child_slots.size() == parent_task.child_tasks.size());
-        for (auto * slot : child_slots) {
-            int id_child = parent_task.child_tasks[idx].id;
-            if (!launch_slot_with_task(*slot, std::move(parent_task.child_tasks[idx]))) {
-                SRV_ERR("failed to launch slot with child task, id_task = %d\n", id_child);
-                release_slots();
-                return false;
-            }
-            idx++;
-        }
-
-        // finally, launch the parent task
-        if (!launch_slot_with_task(parent_slot, std::move(parent_task))) {
-            SRV_ERR("failed to launch slot with task, id_task = %d\n", id_parent);
-            release_slots();
-            return false;
-        }
-
-        return true;
-    }
-
    void process_single_task(server_task && task) {
        switch (task.type) {
            case SERVER_TASK_TYPE_COMPLETION:
@@ -1683,55 +1602,31 @@ private:
            case SERVER_TASK_TYPE_EMBEDDING:
            case SERVER_TASK_TYPE_RERANK:
                {
-                    // special case: if input is provided via CLI, tokenize it first
-                    // otherwise, no need to tokenize as it's already done inside the HTTP thread
-                    if (task.cli_input != nullptr) {
-                        if (!tokenize_cli_input(task)) {
-                            break;
-                        }
+                    if (!tokenize_cli_input(task)) {
+                        break;
                    }

                    const int id_slot = task.id_slot;
-                    const int id_task = task.id;

-                    server_slot * slot = id_slot != -1
-                                            ? get_slot_by_id(id_slot)
-                                            : get_available_slot(task);
-
-                    //
-                    // slot scheduling logic
-                    //
+                    server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);

                    if (slot == nullptr) {
                        // if no slot is available, we defer this task for processing later
-                        SRV_DBG("no slot is available, defer task, id_task = %d\n", id_task);
+                        SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
                        queue_tasks.defer(std::move(task));
                        break;
                    }

                    if (slot->is_processing()) {
                        // if requested slot is unavailable, we defer this task for processing later
-                        SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", id_task);
+                        SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
                        queue_tasks.defer(std::move(task));
                        break;
                    }

-                    if (task.is_parent()) {
-                        // try getting free slots for all child tasks
-                        size_t n_child_tasks = task.child_tasks.size();
-                        std::vector<server_slot *> child_slots = get_free_slots(n_child_tasks, slot->id);
-                        if (child_slots.size() < n_child_tasks) {
-                            SRV_DBG("not enough free slots for child tasks, n_free = %zu, n_children = %zu, defer task, id_task = %d\n", child_slots.size(), n_child_tasks, id_task);
-                            queue_tasks.defer(std::move(task));
-                            break;
-                        }
-                        if (!launch_slots_with_parent_task(*slot, child_slots, std::move(task))) {
-                            SRV_ERR("failed to launch slot with parent task, id_task = %d\n", id_task);
-                            break; // drop the task
-                        }
-                    } else if (!launch_slot_with_task(*slot, std::move(task))) {
-                        SRV_ERR("failed to launch slot with task, id_task = %d\n", id_task);
-                        break; // drop the task
+                    if (!launch_slot_with_task(*slot, std::move(task))) {
+                        SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
+                        break;
                    }
                } break;
            case SERVER_TASK_TYPE_CANCEL:
@@ -1905,7 +1800,7 @@ private:
                    // Erase token cache
                    const size_t n_erased = slot->prompt.tokens.size();

-                    slot->prompt_clear(false);
+                    clear_slot(*slot);

                    auto res = std::make_unique<server_task_result_slot_erase>();
                    res->id       = task.id;
@@ -2000,7 +1895,7 @@ private:
                    GGML_ABORT("not supported by multimodal");
                }

-                if (slot.task->is_parent() || slot.task->is_child()) {
+                if (slot.is_parent() || slot.is_child()) {
                    send_error(slot, "context shift cannot be used for shared prompt", ERROR_TYPE_SERVER);
                    slot.release();
                    continue;
@@ -2139,12 +2034,6 @@ private:
                    continue;
                }

-                // check if this is a child slot
-                if (slot.state == SLOT_STATE_WAIT_OTHER) {
-                    SLT_DBG(slot, "%s", "waiting for parent slot to complete\n");
-                    continue;
-                }
-
                // this slot still has a prompt to be processed
                if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
                    const auto & input_tokens = slot.task->tokens;
@@ -2187,7 +2076,7 @@ private:
                        }

                        // TODO: support memory-less logits computation
-                        if (slot.task->need_logits() && !llama_get_memory(ctx)) {
+                        if (slot.need_logits() && !llama_get_memory(ctx)) {
                            send_error(slot, "the current context does not logits computation. skipping", ERROR_TYPE_SERVER);
                            slot.release();
                            continue;
@@ -2424,12 +2313,6 @@ private:
                        slot.n_prompt_tokens_processed = 0;

                        slot.prompt.tokens.keep_first(n_past);
-
-                        // send initial 0% progress update if needed
-                        // this is to signal the client that the request has started processing
-                        if (slot.task->params.stream && slot.task->params.return_progress) {
-                            send_partial_response(slot, {}, true);
-                        }
                    }

                    if (!slot.can_split()) {
@@ -2447,7 +2330,7 @@ private:
                    if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
                        SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);

-                        slot.prompt_clear(true);
+                        clear_slot(slot);

                        // there is no common part left
                        slot.n_prompt_tokens_cache = 0;
@@ -2526,7 +2409,7 @@ private:
                            cur_tok,
                            slot.prompt.tokens.pos_next(),
                            { slot.id },
-                            slot.task->need_embd());
+                            slot.need_embd());
                        slot.prompt.tokens.push_back(cur_tok);

                        slot.n_prompt_tokens_processed++;
@@ -2547,6 +2430,16 @@ private:

                        GGML_ASSERT(batch.n_tokens > 0);

+                        common_sampler_reset(slot.smpl.get());
+
+                        // Process all prompt tokens through sampler system
+                        for (int i = 0; i < slot.task->n_tokens(); ++i) {
+                            llama_token id = input_tokens[i];
+                            if (id != LLAMA_TOKEN_NULL) {
+                                common_sampler_accept(slot.smpl.get(), id, false);
+                            }
+                        }
+
                        // extract the logits only for the last token
                        batch.logits[batch.n_tokens - 1] = true;

@@ -2555,8 +2448,6 @@ private:

                        SLT_INF(slot, "prompt done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens);

-                        slot.init_sampler();
-
                        const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
                        const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id);

@@ -2603,6 +2494,11 @@ private:
            }
        }

+        if (batch.n_tokens == 0) {
+            SRV_WRN("%s", "no tokens to decode\n");
+            return;
+        }
+
        SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);

        if (slot_batched) {
@@ -2616,11 +2512,7 @@ private:
                slot_batched->lora[alora_disabled_id].scale = alora_scale;
            }

-            llama_set_embeddings(ctx, slot_batched->task->need_embd());
-        }
-
-        if (batch.n_tokens == 0) {
-            SRV_WRN("%s", "no tokens to decode\n");
+            llama_set_embeddings(ctx, slot_batched->need_embd());
        }

        int32_t i_next = 0;
@@ -2674,7 +2566,7 @@ private:

                                // note: it's complicated to keep track of how much of the current batch has been
                                //       processed before the error occurred, so we simply clear the entire context
-                                slot.prompt_clear(false);
+                                clear_slot(slot);
                            }
                        }

@@ -2698,30 +2590,31 @@ private:
            // on successful decode, restore the original batch size
            n_batch = llama_n_batch(ctx);

-            // handle `n_cmpl > 1` tasks - when the main prompt is processed, activate all child tasks too
+            // technically, measuring the time here excludes the sampling time for the last batch
+            // but on the other hand, we don't want to do too many system calls to measure the time, so it's ok
+            const int64_t t_current = ggml_time_us();
+
            for (auto & slot : slots) {
-                if (slot.state == SLOT_STATE_DONE_PROMPT && slot.task->is_parent()) {
-                    std::vector<server_slot *> children;
+                // may need to copy state to other slots
+                if (slot.state == SLOT_STATE_DONE_PROMPT && slot.is_parent()) {
+                    std::vector<server_slot *> child_slots;
                    for (auto & other : slots) {
                        if (other.state == SLOT_STATE_WAIT_OTHER && slot.task->id == other.task->id_parent) {
-                            children.push_back(&other);
+                            child_slots.push_back(&other);
                        }
                    }

-                    // all children slots should already launched by launch_slots_with_parent_task()
-                    // copy state to the child slots
-                    for (auto & child : children) {
-                        SLT_INF(slot, " - copying state to child %d\n", child->id);
-
-                        GGML_ASSERT(child->state == SLOT_STATE_WAIT_OTHER);
-
-                        slot.copy_state_to(*child);
-                        child->state = SLOT_STATE_DONE_PROMPT;
+                    // we can only proceed once every child slot holds its expected task
+                    if (child_slots.size() == slot.task->n_children) {
+                        // copy state to the child slots
+                        for (auto & child : child_slots) {
+                            SLT_INF(slot, "copying state to child %d\n", child->id);
+                            slot.copy_state_to(*child);
+                            child->state = SLOT_STATE_DONE_PROMPT;
+                        }
                    }
                }
-            }

-            for (auto & slot : slots) {
                // optionally send prompt processing progress
                if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_DONE_PROMPT) {
                    if (slot.task->params.stream && slot.task->params.return_progress) {
@@ -2749,8 +2642,6 @@ private:
                        continue; // continue loop of slots
                    }

-                    GGML_ASSERT(slot.task->need_sampling());
-
                    // prompt evaluated for next-token prediction
                    slot.state = SLOT_STATE_GENERATING;
                } else if (slot.state != SLOT_STATE_GENERATING) {
@@ -2769,9 +2660,6 @@ private:

                common_sampler_accept(slot.smpl.get(), id, true);

-                // here we have synchronized the llama_context (due to the sampling above), so we can do time measurement
-                const int64_t t_current = ggml_time_us();
-
                slot.n_decoded += 1;

                if (slot.n_decoded == 1) {
@@ -2808,15 +2696,13 @@ private:
                    continue;
                }

-                const size_t n_draft = slot.drafted.size();
+                size_t n_draft = slot.drafted.size();

                // the accepted tokens from the speculation
                const auto ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted);
                slot.i_batch_dft.clear();
                slot.drafted.clear();

-                const int64_t t_current = ggml_time_us();
-
                slot.n_decoded += ids.size();

                slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
@@ -2898,12 +2784,6 @@ server_response_reader server_context::get_response_reader() {

 server_context_meta server_context::get_meta() const {
    auto tool_use_src = common_chat_templates_source(impl->chat_templates.get(), "tool_use");
-
-    auto bos_id = llama_vocab_bos(impl->vocab);
-    auto eos_id = llama_vocab_eos(impl->vocab);
-    auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : "";
-    auto eos_token_str = eos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, eos_id, true) : "";
-
    return server_context_meta {
        /* build_info             */ build_info,
        /* model_name             */ impl->model_name,
@@ -2918,8 +2798,8 @@ server_context_meta server_context::get_meta() const {
        /* chat_template          */ common_chat_templates_source(impl->chat_templates.get()),
        /* chat_template_tool_use */ tool_use_src ? tool_use_src : "",

-        /* bos_token_str          */ bos_token_str,
-        /* eos_token_str          */ eos_token_str,
+        /* bos_token_str          */ common_token_to_piece(impl->ctx, llama_vocab_bos(impl->vocab), true),
+        /* eos_token_str          */ common_token_to_piece(impl->ctx, llama_vocab_eos(impl->vocab), true),
        /* fim_pre_token          */ llama_vocab_fim_pre(impl->vocab),
        /* fim_sub_token          */ llama_vocab_fim_suf(impl->vocab),
        /* fim_mid_token          */ llama_vocab_fim_mid(impl->vocab),
@@ -2992,9 +2872,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
            // Everything else, including multimodal completions.
            inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
        }
-
-        // tasks.reserve(inputs.size()); // TODO: this is inaccurate due to child tasks
-
+        tasks.reserve(inputs.size());
        for (size_t i = 0; i < inputs.size(); i++) {
            server_task task = server_task(type);

@@ -3013,11 +2891,13 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
            task.params.oaicompat_cmpl_id = completion_id;
            task.params.oaicompat_model   = meta->model_name;

-            // prepare child tasks
            if (task.params.n_cmpl > 1) {
-                int n_children = task.params.n_cmpl - 1;
-                for (int j = 0; j < n_children; j++) {
-                    task.add_child(task.id, rd.get_new_id());
+                task.n_children = task.params.n_cmpl - 1;
+                for (size_t j = 0; j < task.n_children; j++) {
+                    server_task child = task.create_child(
+                        task.id,
+                        rd.get_new_id());
+                    tasks.push_back(std::move(child));
                }
            }

@@ -3066,22 +2946,19 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
        // in streaming mode, the first error must be treated as non-stream response
        // this is to match the OAI API behavior
        // ref: https://github.com/ggml-org/llama.cpp/pull/16486#discussion_r2419657309
-        auto first_result = rd.next(req.should_stop);
+        server_task_result_ptr first_result = rd.next(req.should_stop);
        if (first_result == nullptr) {
-            GGML_ASSERT(req.should_stop());
            return res; // connection is closed
-        }
-
-        if (first_result->is_error()) {
+        } else if (first_result->is_error()) {
            res->error(first_result->to_json());
            return res;
+        } else {
+            GGML_ASSERT(
+                dynamic_cast<server_task_result_cmpl_partial*>(first_result.get()) != nullptr
+                || dynamic_cast<server_task_result_cmpl_final*>(first_result.get()) != nullptr
+            );
        }

-        GGML_ASSERT(
-            dynamic_cast<server_task_result_cmpl_partial*>(first_result.get()) != nullptr ||
-            dynamic_cast<server_task_result_cmpl_final*>  (first_result.get()) != nullptr
-        );
-
        // next responses are streamed
        // to be sent immediately
        json first_result_json = first_result->to_json();
@@ -3137,7 +3014,6 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
                auto result = rd.next(req.should_stop);
                if (result == nullptr) {
                    SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
-                    GGML_ASSERT(req.should_stop());
                    return false; // should_stop condition met
                }

@@ -3221,11 +3097,6 @@ void server_routes::init_routes() {

        // get the result
        auto result = res->rd.next(req.should_stop);
-        if (!result) {
-            // connection was closed
-            GGML_ASSERT(req.should_stop());
-            return res;
-        }

        if (result->is_error()) {
            res->error(result->to_json());
@@ -3326,11 +3197,6 @@ void server_routes::init_routes() {

        // get the result
        auto result = res->rd.next(req.should_stop);
-        if (!result) {
-            // connection was closed
-            GGML_ASSERT(req.should_stop());
-            return res;
-        }

        if (result->is_error()) {
            res->error(result->to_json());
@@ -3837,12 +3703,7 @@ void server_routes::init_routes() {
        }

        // get the result
-        auto result = rd.next(req.should_stop);
-        if (!result) {
-            // connection was closed
-            GGML_ASSERT(req.should_stop());
-            return res;
-        }
+        server_task_result_ptr result = rd.next(req.should_stop);

        if (result->is_error()) {
            res->error(result->to_json());
@@ -3871,12 +3732,7 @@ void server_routes::init_routes() {
        }

        // get the result
-        auto result = rd.next(req.should_stop);
-        if (!result) {
-            // connection was closed
-            GGML_ASSERT(req.should_stop());
-            return res;
-        }
+        server_task_result_ptr result = rd.next(req.should_stop);

        if (result->is_error()) {
            res->error(result->to_json());
@@ -3909,12 +3765,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_save(const ser
        rd.post_task(std::move(task));
    }

-    auto result = rd.next(req.should_stop);
-    if (!result) {
-        // connection was closed
-        GGML_ASSERT(req.should_stop());
-        return res;
-    }
+    server_task_result_ptr result = rd.next(req.should_stop);

    if (result->is_error()) {
        res->error(result->to_json());
@@ -3945,12 +3796,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_restore(const
        rd.post_task(std::move(task));
    }

-    auto result = rd.next(req.should_stop);
-    if (!result) {
-        // connection was closed
-        GGML_ASSERT(req.should_stop());
-        return res;
-    }
+    server_task_result_ptr result = rd.next(req.should_stop);

    if (result->is_error()) {
        res->error(result->to_json());
@@ -3972,12 +3818,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_erase(const se
        rd.post_task(std::move(task));
    }

-    auto result = rd.next(req.should_stop);
-    if (!result) {
-        // connection was closed
-        GGML_ASSERT(req.should_stop());
-        return res;
-    }
+    server_task_result_ptr result = rd.next(req.should_stop);

    if (result->is_error()) {
        res->error(result->to_json());
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -21,13 +21,11 @@

 #ifdef _WIN32
 #include <winsock2.h>
-#include <windows.h>
 #else
 #include <sys/socket.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
 #include <unistd.h>
-extern char **environ;
 #endif

 #if defined(__APPLE__) && defined(__MACH__)
@@ -36,8 +34,6 @@ extern char **environ;
 #include <limits.h>
 #endif

-#define DEFAULT_STOP_TIMEOUT 10 // seconds
-
 #define CMD_ROUTER_TO_CHILD_EXIT  "cmd_router_to_child:exit"
 #define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready"

@@ -101,49 +97,6 @@ static void unset_reserved_args(common_preset & preset, bool unset_model_args) {
    }
 }

-#ifdef _WIN32
-static std::string wide_to_utf8(const wchar_t * ws) {
-    if (!ws || !*ws) {
-        return {};
-    }
-
-    const int len = static_cast<int>(std::wcslen(ws));
-    const int bytes = WideCharToMultiByte(CP_UTF8, 0, ws, len, nullptr, 0, nullptr, nullptr);
-    if (bytes == 0) {
-        return {};
-    }
-
-    std::string utf8(bytes, '\0');
-    WideCharToMultiByte(CP_UTF8, 0, ws, len, utf8.data(), bytes, nullptr, nullptr);
-
-    return utf8;
-}
-#endif
-
-static std::vector<std::string> get_environment() {
-    std::vector<std::string> env;
-
-#ifdef _WIN32
-    LPWCH env_block = GetEnvironmentStringsW();
-    if (!env_block) {
-        return env;
-    }
-    for (LPWCH e = env_block; *e; e += wcslen(e) + 1) {
-        env.emplace_back(wide_to_utf8(e));
-    }
-    FreeEnvironmentStringsW(env_block);
-#else
-    if (environ == nullptr) {
-        return env;
-    }
-    for (char ** e = environ; *e != nullptr; e++) {
-        env.emplace_back(*e);
-    }
-#endif
-
-    return env;
-}
-
 void server_model_meta::update_args(common_preset_context & ctx_preset, std::string bin_path) {
    // update params
    unset_reserved_args(preset, false);
@@ -162,11 +115,14 @@ void server_model_meta::update_args(common_preset_context & ctx_preset, std::str
 server_models::server_models(
        const common_params & params,
        int argc,
-        char ** argv)
+        char ** argv,
+        char ** envp)
            : ctx_preset(LLAMA_EXAMPLE_SERVER),
              base_params(params),
-              base_env(get_environment()),
              base_preset(ctx_preset.load_from_args(argc, argv)) {
+    for (char ** env = envp; *env != nullptr; env++) {
+        base_env.push_back(std::string(*env));
+    }
    // clean up base preset
    unset_reserved_args(base_preset, true);
    // set binary path
@@ -247,14 +203,13 @@ void server_models::load_models() {
    // convert presets to server_model_meta and add to mapping
    for (const auto & preset : final_presets) {
        server_model_meta meta{
-            /* preset       */ preset.second,
-            /* name         */ preset.first,
-            /* port         */ 0,
-            /* status       */ SERVER_MODEL_STATUS_UNLOADED,
-            /* last_used    */ 0,
-            /* args         */ std::vector<std::string>(),
-            /* exit_code    */ 0,
-            /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
+            /* preset      */ preset.second,
+            /* name        */ preset.first,
+            /* port        */ 0,
+            /* status      */ SERVER_MODEL_STATUS_UNLOADED,
+            /* last_used   */ 0,
+            /* args        */ std::vector<std::string>(),
+            /* exit_code   */ 0
        };
        add_model(std::move(meta));
    }
@@ -272,20 +227,6 @@ void server_models::load_models() {
        }
    }

-    // handle custom stop-timeout option
-    for (auto & [name, inst] : mapping) {
-        std::string val;
-        if (inst.meta.preset.get_option(COMMON_ARG_PRESET_STOP_TIMEOUT, val)) {
-            try {
-                inst.meta.stop_timeout = std::stoi(val);
-            } catch (...) {
-                SRV_WRN("invalid stop-timeout value '%s' for model '%s', using default %d seconds\n",
-                    val.c_str(), name.c_str(), DEFAULT_STOP_TIMEOUT);
-                inst.meta.stop_timeout = DEFAULT_STOP_TIMEOUT;
-            }
-        }
-    }
-
    // load any autoload models
    std::vector<std::string> models_to_load;
    for (const auto & [name, inst] : mapping) {
@@ -421,7 +362,7 @@ void server_models::unload_lru() {
    int64_t lru_last_used = ggml_time_ms();
    size_t count_active = 0;
    {
-        std::unique_lock<std::mutex> lk(mutex);
+        std::lock_guard<std::mutex> lk(mutex);
        for (const auto & m : mapping) {
            if (m.second.meta.is_active()) {
                count_active++;
@@ -435,13 +376,6 @@ void server_models::unload_lru() {
    if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) {
        SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str());
        unload(lru_model_name);
-        // wait for unload to complete
-        {
-            std::unique_lock<std::mutex> lk(mutex);
-            cv.wait(lk, [this, &lru_model_name]() {
-                return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
-            });
-        }
    }
 }

@@ -502,83 +436,38 @@ void server_models::load(const std::string & name) {

    // start a thread to manage the child process
    // captured variables are guaranteed to be destroyed only after the thread is joined
-    inst.th = std::thread([this, name, child_proc = inst.subproc, port = inst.meta.port, stop_timeout = inst.meta.stop_timeout]() {
-        FILE * stdin_file = subprocess_stdin(child_proc.get());
-        FILE * stdout_file = subprocess_stdout(child_proc.get()); // combined stdout/stderr
-
-        std::thread log_thread([&]() {
-            // read stdout/stderr and forward to main server log
-            // also handle status report from child process
-            bool state_received = false; // true if child state received
-            if (stdout_file) {
-                char buffer[4096];
-                while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) {
-                    LOG("[%5d] %s", port, buffer);
-                    if (!state_received && std::strstr(buffer, CMD_CHILD_TO_ROUTER_READY) != nullptr) {
-                        // child process is ready
-                        this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0);
-                        state_received = true;
-                    }
+    inst.th = std::thread([this, name, child_proc = inst.subproc, port = inst.meta.port]() {
+        // read stdout/stderr and forward to main server log
+        bool state_received = false; // true if child state received
+        FILE * p_stdout_stderr = subprocess_stdout(child_proc.get());
+        if (p_stdout_stderr) {
+            char buffer[4096];
+            while (fgets(buffer, sizeof(buffer), p_stdout_stderr) != nullptr) {
+                LOG("[%5d] %s", port, buffer);
+                if (!state_received && std::strstr(buffer, CMD_CHILD_TO_ROUTER_READY) != nullptr) {
+                    // child process is ready
+                    this->update_status(name, SERVER_MODEL_STATUS_LOADED);
+                    state_received = true;
                }
-            } else {
-                SRV_ERR("failed to get stdout/stderr of child process for name=%s\n", name.c_str());
            }
-        });
-
-        std::thread stopping_thread([&]() {
-            // thread to monitor stopping signal
-            auto is_stopping = [this, &name]() {
-                return this->stopping_models.find(name) != this->stopping_models.end();
-            };
-            {
-                std::unique_lock<std::mutex> lk(this->mutex);
-                this->cv_stop.wait(lk, is_stopping);
-            }
-            SRV_INF("stopping model instance name=%s\n", name.c_str());
-            // send interrupt to child process
-            fprintf(stdin_file, "%s\n", CMD_ROUTER_TO_CHILD_EXIT);
-            fflush(stdin_file);
-            // wait to stop gracefully or timeout
-            int64_t start_time = ggml_time_ms();
-            while (true) {
-                std::unique_lock<std::mutex> lk(this->mutex);
-                if (!is_stopping()) {
-                    return; // already stopped
-                }
-                int64_t elapsed = ggml_time_ms() - start_time;
-                if (elapsed >= stop_timeout * 1000) {
-                    // timeout, force kill
-                    SRV_WRN("force-killing model instance name=%s after %d seconds timeout\n", name.c_str(), stop_timeout);
-                    subprocess_terminate(child_proc.get());
-                    return;
-                }
-                this->cv_stop.wait_for(lk, std::chrono::seconds(1));
-            }
-        });
-
+        } else {
+            SRV_ERR("failed to get stdout/stderr of child process for name=%s\n", name.c_str());
+        }
        // we reach here when the child process exits
-        // note: we cannot join() prior to this point because it will close stdin_file
-        if (log_thread.joinable()) {
-            log_thread.join();
-        }
-
-        // stop the timeout monitoring thread
-        {
-            std::lock_guard<std::mutex> lk(this->mutex);
-            stopping_models.erase(name);
-            cv_stop.notify_all();
-        }
-        if (stopping_thread.joinable()) {
-            stopping_thread.join();
-        }
-
-        // get the exit code
        int exit_code = 0;
        subprocess_join(child_proc.get(), &exit_code);
        subprocess_destroy(child_proc.get());
-
-        // update status and exit code
-        this->update_status(name, SERVER_MODEL_STATUS_UNLOADED, exit_code);
+        // update exit code and status
+        {
+            std::lock_guard<std::mutex> lk(mutex);
+            auto it = mapping.find(name);
+            if (it != mapping.end()) {
+                auto & meta = it->second.meta;
+                meta.exit_code = exit_code;
+                meta.status    = SERVER_MODEL_STATUS_UNLOADED;
+            }
+            cv.notify_all();
+        }
        SRV_INF("instance name=%s exited with status %d\n", name.c_str(), exit_code);
    });

@@ -599,14 +488,22 @@ void server_models::load(const std::string & name) {
    cv.notify_all();
 }

+static void interrupt_subprocess(FILE * stdin_file) {
+    // subprocess.h does not provide a way to send SIGINT, so request a graceful exit instead:
+    // write the CMD_ROUTER_TO_CHILD_EXIT command line to stdin_file and flush it (no-op if null)
+    if (stdin_file) {
+        fprintf(stdin_file, "%s\n", CMD_ROUTER_TO_CHILD_EXIT);
+        fflush(stdin_file);
+    }
+}
+
 void server_models::unload(const std::string & name) {
    std::lock_guard<std::mutex> lk(mutex);
    auto it = mapping.find(name);
    if (it != mapping.end()) {
        if (it->second.meta.is_active()) {
            SRV_INF("unloading model instance name=%s\n", name.c_str());
-            stopping_models.insert(name);
-            cv_stop.notify_all();
+            interrupt_subprocess(it->second.stdin_file);
            // status change will be handled by the managing thread
        } else {
            SRV_WRN("model instance name=%s is not loaded\n", name.c_str());
@@ -621,8 +518,7 @@ void server_models::unload_all() {
        for (auto & [name, inst] : mapping) {
            if (inst.meta.is_active()) {
                SRV_INF("unloading model instance name=%s\n", name.c_str());
-                stopping_models.insert(name);
-                cv_stop.notify_all();
+                interrupt_subprocess(inst.stdin_file);
                // status change will be handled by the managing thread
            }
            // moving the thread to join list to avoid deadlock
@@ -636,15 +532,16 @@ void server_models::unload_all() {
    }
 }

-void server_models::update_status(const std::string & name, server_model_status status, int exit_code) {
-    std::unique_lock<std::mutex> lk(mutex);
-    auto it = mapping.find(name);
-    if (it != mapping.end()) {
-        auto & meta = it->second.meta;
-        meta.status    = status;
-        meta.exit_code = exit_code;
+void server_models::update_status(const std::string & name, server_model_status status) {
+    // for now, we only allow updating to LOADED status
+    if (status != SERVER_MODEL_STATUS_LOADED) {
+        throw std::runtime_error("invalid status value");
+    }
+    auto meta = get_meta(name);
+    if (meta.has_value()) {
+        meta->status = status;
+        update_meta(name, meta.value());
    }
-    cv.notify_all();
 }

 void server_models::wait_until_loaded(const std::string & name) {
@@ -671,7 +568,6 @@ bool server_models::ensure_model_loaded(const std::string & name) {
        load(name);
    }

-    // for loading state
    SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str());
    wait_until_loaded(name);

@@ -704,10 +600,7 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
            req.path,
            req.headers,
            req.body,
-            req.should_stop,
-            base_params.timeout_read,
-            base_params.timeout_write
-            );
+            req.should_stop);
    return proxy;
 }

@@ -902,7 +795,7 @@ void server_models_routes::init_routes() {
            res_err(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST));
            return res;
        }
-        if (!model->is_active()) {
+        if (model->status != SERVER_MODEL_STATUS_LOADED) {
            res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
            return res;
        }
@@ -995,18 +888,13 @@ server_http_proxy::server_http_proxy(
        const std::string & path,
        const std::map<std::string, std::string> & headers,
        const std::string & body,
-        const std::function<bool()> should_stop,
-        int32_t timeout_read,
-        int32_t timeout_write
-        ) {
+        const std::function<bool()> should_stop) {
    // shared between reader and writer threads
    auto cli  = std::make_shared<httplib::Client>(host, port);
    auto pipe = std::make_shared<pipe_t<msg_t>>();

    // setup Client
    cli->set_connection_timeout(0, 200000); // 200 milliseconds
-    cli->set_write_timeout(timeout_read, 0); // reversed for cli (client) vs srv (server)
-    cli->set_read_timeout(timeout_write, 0);
    this->status = 500; // to be overwritten upon response
    this->cleanup = [pipe]() {
        pipe->close_read();
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -9,7 +9,6 @@
 #include <condition_variable>
 #include <functional>
 #include <memory>
-#include <set>

 /**
 * state diagram:
@@ -57,7 +56,6 @@ struct server_model_meta {
    int64_t last_used = 0; // for LRU unloading
    std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
    int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
-    int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown

    bool is_active() const {
        return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;
@@ -85,10 +83,6 @@ private:
    std::condition_variable cv;
    std::map<std::string, instance_t> mapping;

-    // for stopping models
-    std::condition_variable cv_stop;
-    std::set<std::string> stopping_models;
-
    common_preset_context ctx_preset;

    common_params base_params;
@@ -105,7 +99,7 @@ private:
    void add_model(server_model_meta && meta);

 public:
-    server_models(const common_params & params, int argc, char ** argv);
+    server_models(const common_params & params, int argc, char ** argv, char ** envp);

    void load_models();

@@ -125,7 +119,7 @@ public:
    void unload_all();

    // update the status of a model instance (thread-safe)
-    void update_status(const std::string & name, server_model_status status, int exit_code);
+    void update_status(const std::string & name, server_model_status status);

    // wait until the model instance is fully loaded (thread-safe)
    // return when the model is loaded or failed to load
@@ -147,8 +141,8 @@ struct server_models_routes {
    common_params params;
    json webui_settings = json::object();
    server_models models;
-    server_models_routes(const common_params & params, int argc, char ** argv)
-            : params(params), models(params, argc, argv) {
+    server_models_routes(const common_params & params, int argc, char ** argv, char ** envp)
+            : params(params), models(params, argc, argv, envp) {
        if (!this->params.webui_config_json.empty()) {
            try {
                webui_settings = json::parse(this->params.webui_config_json);
@@ -183,10 +177,7 @@ public:
                      const std::string & path,
                      const std::map<std::string, std::string> & headers,
                      const std::string & body,
-                      const std::function<bool()> should_stop,
-                      int32_t timeout_read,
-                      int32_t timeout_write
-                      );
+                      const std::function<bool()> should_stop);
    ~server_http_proxy() {
        if (cleanup) {
            cleanup();
--- a/tools/server/server-queue.cpp
+++ b/tools/server/server-queue.cpp
@@ -74,26 +74,11 @@ int server_queue::get_new_id() {
    return new_id;
 }

-void server_queue::pop_deferred_task(int id_slot) {
+void server_queue::pop_deferred_task() {
    std::unique_lock<std::mutex> lock(mutex_tasks);
    if (!queue_tasks_deferred.empty()) {
-        // try to find a task that uses the specified slot
-        bool found = false;
-        for (auto it = queue_tasks_deferred.begin(); it != queue_tasks_deferred.end(); ++it) {
-            if (it->id_slot == id_slot) {
-                QUE_DBG("pop deferred task (use slot %d), id_task = %d\n", id_slot, it->id);
-                queue_tasks.emplace_front(std::move(*it));
-                queue_tasks_deferred.erase(it);
-                found = true;
-                break;
-            }
-        }
-        // if not tasks found using the slot, just pop the first deferred task (default behavior)
-        if (!found) {
-            QUE_DBG("pop deferred task, id_task = %d\n", queue_tasks_deferred.front().id);
-            queue_tasks.emplace_front(std::move(queue_tasks_deferred.front()));
-            queue_tasks_deferred.pop_front();
-        }
+        queue_tasks.emplace_front(std::move(queue_tasks_deferred.front()));
+        queue_tasks_deferred.pop_front();
    }
    time_last_task = ggml_time_ms();
    condition_tasks.notify_one();
@@ -232,12 +217,12 @@ void server_response::add_waiting_task_id(int id_task) {
    waiting_task_ids.insert(id_task);
 }

-void server_response::add_waiting_task_ids(const std::unordered_set<int> & id_tasks) {
+void server_response::add_waiting_tasks(const std::vector<server_task> & tasks) {
    std::unique_lock<std::mutex> lock(mutex_results);

-    for (const auto & id_task : id_tasks) {
-        RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size());
-        waiting_task_ids.insert(id_task);
+    for (const auto & task : tasks) {
+        RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size());
+        waiting_task_ids.insert(task.id);
    }
 }

@@ -342,7 +327,6 @@ void server_response::terminate() {

 void server_response_reader::post_task(server_task && task, bool front) {
    GGML_ASSERT(id_tasks.empty() && "post_task() can only be called once per reader");
-    GGML_ASSERT(!task.is_parent() && "not supported, use post_tasks() instead");
    task.index = 0;
    id_tasks.insert(task.id);
    states.push_back(task.create_state());
@@ -354,18 +338,11 @@ void server_response_reader::post_tasks(std::vector<server_task> && tasks, bool
    GGML_ASSERT(id_tasks.empty() && "post_tasks() can only be called once per reader");
    id_tasks = server_task::get_list_id(tasks);
    states.reserve(tasks.size());
-    size_t index = 0;
-    for (auto & task : tasks) {
-        task.index = index++;
-        states.push_back(task.create_state());
-        // for child tasks
-        for (auto & child_task : task.child_tasks) {
-            child_task.index = index++;
-            states.push_back(child_task.create_state());
-        }
+    for (size_t i = 0; i < tasks.size(); i++) {
+        tasks[i].index = i;
+        states.push_back(tasks[i].create_state());
    }
-    GGML_ASSERT(states.size() == id_tasks.size());
-    queue_results.add_waiting_task_ids(id_tasks);
+    queue_results.add_waiting_tasks(tasks);
    queue_tasks.post(std::move(tasks), front);
 }

--- a/tools/server/server-queue.h
+++ b/tools/server/server-queue.h
@@ -44,8 +44,7 @@ public:
    int get_new_id();

    // Call when the state of one slot is changed, it will move one task from deferred to main queue
-    // prioritize tasks that use the specified slot (otherwise, pop the first deferred task)
-    void pop_deferred_task(int id_slot);
+    void pop_deferred_task();

    // if sleeping, request exiting sleep state and wait until it is done
    // returns immediately if not sleeping
@@ -125,7 +124,7 @@ public:
    // add the id_task to the list of tasks waiting for response
    void add_waiting_task_id(int id_task);

-    void add_waiting_task_ids(const std::unordered_set<int> & id_tasks);
+    void add_waiting_tasks(const std::vector<server_task> & tasks);

    // when the request is finished, we can remove task associated with it
    void remove_waiting_task_id(int id_task);
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -78,7 +78,6 @@ json task_params::to_json(bool only_metrics) const {
            {"speculative.p_min",         speculative.p_min},
            {"timings_per_token",         timings_per_token},
            {"post_sampling_probs",       post_sampling_probs},
-            {"backend_sampling",          sampling.backend_sampling},
            {"lora",                      lora},
        };
    }
@@ -137,7 +136,6 @@ json task_params::to_json(bool only_metrics) const {
        {"speculative.p_min",         speculative.p_min},
        {"timings_per_token",         timings_per_token},
        {"post_sampling_probs",       post_sampling_probs},
-        {"backend_sampling",          sampling.backend_sampling},
        {"lora",                      lora},
    };
 }
@@ -160,7 +158,6 @@ task_params server_task::params_from_json_cmpl(
    defaults.n_keep        = params_base.n_keep;
    defaults.n_predict     = params_base.n_predict;
    defaults.n_cache_reuse = params_base.n_cache_reuse;
-    defaults.cache_prompt  = params_base.cache_prompt;
    defaults.antiprompt    = params_base.antiprompt;

    // enabling this will output extra debug information in the HTTP responses from the server
@@ -170,7 +167,7 @@ task_params server_task::params_from_json_cmpl(
    params.stream           = json_value(data,       "stream",             false);
    auto stream_opt         = json_value(data,       "stream_options",     json::object());
    params.include_usage    = json_value(stream_opt, "include_usage",      false);
-    params.cache_prompt     = json_value(data,       "cache_prompt",       defaults.cache_prompt);
+    params.cache_prompt     = json_value(data,       "cache_prompt",       true);
    params.return_tokens    = json_value(data,       "return_tokens",      false);
    params.return_progress  = json_value(data,       "return_progress",    false);
    params.n_predict        = json_value(data,       "n_predict",          json_value(data, "max_tokens", defaults.n_predict));
@@ -204,12 +201,9 @@ task_params server_task::params_from_json_cmpl(
    params.sampling.mirostat           = json_value(data, "mirostat",            defaults.sampling.mirostat);
    params.sampling.mirostat_tau       = json_value(data, "mirostat_tau",        defaults.sampling.mirostat_tau);
    params.sampling.mirostat_eta       = json_value(data, "mirostat_eta",        defaults.sampling.mirostat_eta);
-    params.sampling.adaptive_target    = json_value(data, "adaptive_target",     defaults.sampling.adaptive_target);
-    params.sampling.adaptive_decay     = json_value(data, "adaptive_decay",      defaults.sampling.adaptive_decay);
    params.sampling.seed               = json_value(data, "seed",                defaults.sampling.seed);
    params.sampling.n_probs            = json_value(data, "n_probs",             defaults.sampling.n_probs);
    params.sampling.min_keep           = json_value(data, "min_keep",            defaults.sampling.min_keep);
-    params.sampling.backend_sampling   = json_value(data, "backend_sampling",    defaults.sampling.backend_sampling);
    params.post_sampling_probs         = json_value(data, "post_sampling_probs", defaults.post_sampling_probs);

    params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
@@ -817,15 +811,6 @@ json server_task_result_cmpl_final::to_json_anthropic() {
        msg.content = content;
    }

-    // thinking block comes first (Anthropic extended thinking format)
-    if (!msg.reasoning_content.empty()) {
-        content_blocks.push_back({
-            {"type", "thinking"},
-            {"thinking", msg.reasoning_content},
-            {"signature", ""}  // empty signature for local models (no cryptographic verification)
-        });
-    }
-
    if (!msg.content.empty()) {
        content_blocks.push_back({
            {"type", "text"},
@@ -874,57 +859,20 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
        stop_reason = oaicompat_msg.tool_calls.empty() ? "end_turn" : "tool_use";
    }

-    bool has_thinking = !oaicompat_msg.reasoning_content.empty();
-    bool has_text     = !oaicompat_msg.content.empty();
+    bool has_text = !oaicompat_msg.content.empty();
    size_t num_tool_calls = oaicompat_msg.tool_calls.size();

-    // content block indices: thinking (0) -> text (0 or 1) -> tool_use (n+)
-    size_t thinking_block_index = 0;
-    size_t text_block_index     = has_thinking ? 1 : 0;
-
-    bool thinking_block_started = false;
-    bool text_block_started     = false;
+    bool text_block_started = false;
    std::unordered_set<size_t> tool_calls_started;

    for (const auto & diff : oaicompat_msg_diffs) {
-        // handle thinking/reasoning content
-        if (!diff.reasoning_content_delta.empty()) {
-            if (!thinking_block_started) {
-                events.push_back({
-                    {"event", "content_block_start"},
-                    {"data", {
-                        {"type", "content_block_start"},
-                        {"index", thinking_block_index},
-                        {"content_block", {
-                            {"type", "thinking"},
-                            {"thinking", ""}
-                        }}
-                    }}
-                });
-                thinking_block_started = true;
-            }
-
-            events.push_back({
-                {"event", "content_block_delta"},
-                {"data", {
-                    {"type", "content_block_delta"},
-                    {"index", thinking_block_index},
-                    {"delta", {
-                        {"type", "thinking_delta"},
-                        {"thinking", diff.reasoning_content_delta}
-                    }}
-                }}
-            });
-        }
-
-        // handle regular text content
        if (!diff.content_delta.empty()) {
            if (!text_block_started) {
                events.push_back({
                    {"event", "content_block_start"},
                    {"data", {
                        {"type", "content_block_start"},
-                        {"index", text_block_index},
+                        {"index", 0},
                        {"content_block", {
                            {"type", "text"},
                            {"text", ""}
@@ -938,7 +886,7 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
                {"event", "content_block_delta"},
                {"data", {
                    {"type", "content_block_delta"},
-                    {"index", text_block_index},
+                    {"index", 0},
                    {"delta", {
                        {"type", "text_delta"},
                        {"text", diff.content_delta}
@@ -947,9 +895,8 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
            });
        }

-        // handle tool calls
        if (diff.tool_call_index != std::string::npos) {
-            size_t content_block_index = (has_thinking ? 1 : 0) + (has_text ? 1 : 0) + diff.tool_call_index;
+            size_t content_block_index = (has_text ? 1 : 0) + diff.tool_call_index;

            if (tool_calls_started.find(diff.tool_call_index) == tool_calls_started.end()) {
                const auto & full_tool_call = oaicompat_msg.tool_calls[diff.tool_call_index];
@@ -985,42 +932,18 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
        }
    }

-    // close content blocks in order
-    if (has_thinking) {
-        // Anthropic API requires a signature_delta before closing thinking blocks
-        // We use an empty signature since we can't generate a cryptographic signature for local models
-        events.push_back({
-            {"event", "content_block_delta"},
-            {"data", {
-                {"type", "content_block_delta"},
-                {"index", thinking_block_index},
-                {"delta", {
-                    {"type", "signature_delta"},
-                    {"signature", ""}
-                }}
-            }}
-        });
-        events.push_back({
-            {"event", "content_block_stop"},
-            {"data", {
-                {"type", "content_block_stop"},
-                {"index", thinking_block_index}
-            }}
-        });
-    }
-
    if (has_text) {
        events.push_back({
            {"event", "content_block_stop"},
            {"data", {
                {"type", "content_block_stop"},
-                {"index", text_block_index}
+                {"index", 0}
            }}
        });
    }

    for (size_t i = 0; i < num_tool_calls; i++) {
-        size_t content_block_index = (has_thinking ? 1 : 0) + (has_text ? 1 : 0) + i;
+        size_t content_block_index = (has_text ? 1 : 0) + i;
        events.push_back({
            {"event", "content_block_stop"},
            {"data", {
@@ -1228,10 +1151,11 @@ json server_task_result_rerank::to_json() {
 json server_task_result_cmpl_partial::to_json_anthropic() {
    json events = json::array();
    bool first = (n_decoded == 1);
-    // use member variables to track block state across streaming calls
-    // (anthropic_thinking_block_started, anthropic_text_block_started)
+    bool text_block_started = false;

    if (first) {
+        text_block_started = false;
+
        events.push_back({
            {"event", "message_start"},
            {"data", {
@@ -1253,69 +1177,28 @@ json server_task_result_cmpl_partial::to_json_anthropic() {
        });
    }

-    // content block indices: thinking (0) -> text (0 or 1) -> tool_use (n+)
-    size_t thinking_block_index = 0;
-    // use anthropic_has_reasoning (set in update()) to know if ANY reasoning was generated
-    size_t text_block_index     = anthropic_has_reasoning ? 1 : 0;
-
-    // use local copies of streaming state (copied from task_result_state in update())
-    // these reflect the state BEFORE this chunk was processed
-    bool thinking_started = anthropic_thinking_block_started;
-    bool text_started     = anthropic_text_block_started;
-
    for (const auto & diff : oaicompat_msg_diffs) {
-        // handle thinking/reasoning content
-        if (!diff.reasoning_content_delta.empty()) {
-            if (!thinking_started) {
-                events.push_back({
-                    {"event", "content_block_start"},
-                    {"data", {
-                        {"type", "content_block_start"},
-                        {"index", thinking_block_index},
-                        {"content_block", {
-                            {"type", "thinking"},
-                            {"thinking", ""}
-                        }}
-                    }}
-                });
-                thinking_started = true;
-            }
-
-            events.push_back({
-                {"event", "content_block_delta"},
-                {"data", {
-                    {"type", "content_block_delta"},
-                    {"index", thinking_block_index},
-                    {"delta", {
-                        {"type", "thinking_delta"},
-                        {"thinking", diff.reasoning_content_delta}
-                    }}
-                }}
-            });
-        }
-
-        // handle regular text content
        if (!diff.content_delta.empty()) {
-            if (!text_started) {
+            if (!text_block_started) {
                events.push_back({
                    {"event", "content_block_start"},
                    {"data", {
                        {"type", "content_block_start"},
-                        {"index", text_block_index},
+                        {"index", 0},
                        {"content_block", {
                            {"type", "text"},
                            {"text", ""}
                        }}
                    }}
                });
-                text_started = true;
+                text_block_started = true;
            }

            events.push_back({
                {"event", "content_block_delta"},
                {"data", {
                    {"type", "content_block_delta"},
-                    {"index", text_block_index},
+                    {"index", 0},
                    {"delta", {
                        {"type", "text_delta"},
                        {"text", diff.content_delta}
@@ -1324,10 +1207,8 @@ json server_task_result_cmpl_partial::to_json_anthropic() {
            });
        }

-        // handle tool calls
        if (diff.tool_call_index != std::string::npos) {
-            // use anthropic_has_reasoning for thinking block count (persists across calls)
-            size_t content_block_index = (anthropic_has_reasoning ? 1 : 0) + (text_started ? 1 : 0) + diff.tool_call_index;
+            size_t content_block_index = (text_block_started ? 1 : 0) + diff.tool_call_index;

            if (!diff.tool_call_delta.name.empty()) {
                events.push_back({
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -96,10 +96,6 @@ struct task_result_state {
    std::string generated_text; // append new chunks of generated text here
    std::vector<std::string> generated_tool_call_ids;

-    // for Anthropic API streaming: track content block state across chunks
-    bool anthropic_thinking_block_started = false;
-    bool anthropic_text_block_started = false;
-
    task_result_state(const common_chat_syntax & oaicompat_chat_syntax)
        : oaicompat_chat_syntax(oaicompat_chat_syntax) {}

@@ -121,10 +117,8 @@ struct server_task {
    int id_slot   = -1;

    // used by parallel sampling (multiple completions from same prompt)
-    int id_parent  = -1;
-    // temporary store of child tasks for scheduling
-    // note: accessing to elements is invalid after the task is moved to server_slot
-    std::vector<server_task> child_tasks;
+    size_t n_children =  0; // number of tasks reusing this prompt
+    int    id_parent  = -1;

    // used by SERVER_TASK_TYPE_INFERENCE
    task_params   params;
@@ -158,36 +152,6 @@ struct server_task {
        return tokens.size();
    }

-    bool need_embd() const {
-        switch (type) {
-            case SERVER_TASK_TYPE_EMBEDDING:
-            case SERVER_TASK_TYPE_RERANK:
-                return true;
-            default:
-                return false;
-        }
-    }
-
-    bool need_logits() const {
-        switch (type) {
-            case SERVER_TASK_TYPE_COMPLETION:
-            case SERVER_TASK_TYPE_INFILL:
-                return true;
-            default:
-                return false;
-        }
-    }
-
-    bool need_sampling() const {
-        switch (type) {
-            case SERVER_TASK_TYPE_COMPLETION:
-            case SERVER_TASK_TYPE_INFILL:
-                return true;
-            default:
-                return false;
-        }
-    }
-
    static task_params params_from_json_cmpl(
        const llama_vocab * vocab,
        const common_params & params_base,
@@ -199,30 +163,18 @@ struct server_task {
        std::unordered_set<int> ids(tasks.size());
        for (size_t i = 0; i < tasks.size(); i++) {
            ids.insert(tasks[i].id);
-            for (auto & child : tasks[i].child_tasks) {
-                ids.insert(child.id);
-            }
        }
        return ids;
    }

-    void add_child(int id_parent, int id_child) {
+    server_task create_child(int id_parent, int id_child) const {
        server_task copy;
-
        copy.id        = id_child;
        copy.id_parent = id_parent;
        copy.params    = params;
        copy.type      = type;
        copy.tokens    = tokens.clone();
-        copy.id_slot   = -1; // child tasks cannot specify slot
-
-        // use different sampling seed for each child
-        // note: https://github.com/ggml-org/llama.cpp/pull/18700#discussion_r2675115723
-        if (copy.params.sampling.seed != LLAMA_DEFAULT_SEED) {
-            copy.params.sampling.seed += (uint32_t)child_tasks.size() + 1;
-        }
-
-        child_tasks.push_back(std::move(copy));
+        return copy;
    }

    // the task will be moved into queue, then onto slots
@@ -230,14 +182,6 @@ struct server_task {
    task_result_state create_state() const {
        return task_result_state(params.oaicompat_chat_syntax);
    }
-
-    bool is_parent() const {
-        return child_tasks.size() > 0;
-    }
-
-    bool is_child() const {
-        return id_parent != -1;
-    }
 };

 struct result_timings {
@@ -393,12 +337,6 @@ struct server_task_result_cmpl_partial : server_task_result {
    std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
    bool is_updated = false;

-    // for Anthropic API: track if any reasoning content has been generated
-    bool anthropic_has_reasoning = false;
-    // Streaming state copied from task_result_state for this chunk
-    bool anthropic_thinking_block_started = false;
-    bool anthropic_text_block_started = false;
-
    virtual bool is_stop() override {
        return false; // in stream mode, partial responses are not considered stop
    }
@@ -408,22 +346,6 @@ struct server_task_result_cmpl_partial : server_task_result {
    virtual void update(task_result_state & state) override {
        is_updated = true;
        state.update_chat_msg(content, true, oaicompat_msg_diffs);
-        // track if the accumulated message has any reasoning content
-        anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();
-
-        // Copy current state for use in to_json_anthropic() (reflects state BEFORE this chunk)
-        anthropic_thinking_block_started = state.anthropic_thinking_block_started;
-        anthropic_text_block_started = state.anthropic_text_block_started;
-
-        // Pre-compute state updates based on diffs (for next chunk)
-        for (const auto & diff : oaicompat_msg_diffs) {
-            if (!diff.reasoning_content_delta.empty() && !state.anthropic_thinking_block_started) {
-                state.anthropic_thinking_block_started = true;
-            }
-            if (!diff.content_delta.empty() && !state.anthropic_text_block_started) {
-                state.anthropic_text_block_started = true;
-            }
-        }
    }

    json to_json_non_oaicompat();
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -66,7 +66,7 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t
    };
 }

-int main(int argc, char ** argv) {
+int main(int argc, char ** argv, char ** envp) {
    // own arguments required by this example
    common_params params;

@@ -126,7 +126,7 @@ int main(int argc, char ** argv) {
    if (is_router_server) {
        // setup server instances manager
        try {
-            models_routes.emplace(params, argc, argv);
+            models_routes.emplace(params, argc, argv, envp);
        } catch (const std::exception & e) {
            LOG_ERR("%s: failed to initialize router models: %s\n", __func__, e.what());
            return 1;
--- a/tools/server/tests/unit/test_chat_completion.py
+++ b/tools/server/tests/unit/test_chat_completion.py
@@ -434,8 +434,8 @@ def test_context_size_exceeded_stream():
@pytest.mark.parametrize(
    "n_batch,batch_count,reuse_cache",
    [
-        (64, 4, False),
-        (64, 2, True),
+        (64, 3, False),
+        (64, 1, True),
    ]
 )
 def test_return_progress(n_batch, batch_count, reuse_cache):
@@ -462,18 +462,10 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
    res = make_cmpl_request()
    last_progress = None
    total_batch_count = 0
-
    for data in res:
        cur_progress = data.get("prompt_progress", None)
        if cur_progress is None:
            continue
-        if total_batch_count == 0:
-            # first progress report must have n_cache == n_processed
-            assert cur_progress["total"] > 0
-            assert cur_progress["cache"] == cur_progress["processed"]
-            if reuse_cache:
-                # when reusing cache, we expect some cached tokens
-                assert cur_progress["cache"] > 0
        if last_progress is not None:
            assert cur_progress["total"] == last_progress["total"]
            assert cur_progress["cache"] == last_progress["cache"]
@@ -481,7 +473,6 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
        total_batch_count += 1
        last_progress = cur_progress

-    # last progress should indicate completion (all tokens processed)
    assert last_progress is not None
    assert last_progress["total"] > 0
    assert last_progress["processed"] == last_progress["total"]
@@ -491,22 +482,17 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
 def test_chat_completions_multiple_choices():
    global server
    server.start()
-    # make sure cache can be reused across multiple choices and multiple requests
-    # ref: https://github.com/ggml-org/llama.cpp/pull/18663
-    for _ in range(2):
-        res = server.make_request("POST", "/chat/completions", data={
-            "max_tokens": 8,
-            "n": 2,
-            "messages": [
-                {"role": "system", "content": "Book"},
-                {"role": "user", "content": "What is the best book"},
-            ],
-            # test forcing the same slot to be used
-            # the scheduler should not be locked up in this case
-            "id_slot": 0,
-        })
-        assert res.status_code == 200
-        assert len(res.body["choices"]) == 2
-        for choice in res.body["choices"]:
-            assert "assistant" == choice["message"]["role"]
-            assert choice["finish_reason"] == "length"
+    res = server.make_request("POST", "/chat/completions", data={
+        "max_tokens": 8,
+        "n": 2,
+        "messages": [
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ],
+    })
+    assert res.status_code == 200
+    assert len(res.body["choices"]) == 2
+    for choice in res.body["choices"]:
+        assert "assistant" == choice["message"]["role"]
+        assert match_regex("Suddenly", choice["message"]["content"])
+        assert choice["finish_reason"] == "length"
--- a/tools/server/tests/unit/test_compat_anthropic.py
+++ b/tools/server/tests/unit/test_compat_anthropic.py
@@ -805,92 +805,3 @@ def test_anthropic_vs_openai_different_response_format():
    assert "input_tokens" in anthropic_res.body["usage"]
    assert "completion_tokens" in openai_res.body["usage"]
    assert "output_tokens" in anthropic_res.body["usage"]
-
-
-# Extended thinking tests with reasoning models
-
-@pytest.mark.slow
-@pytest.mark.parametrize("stream", [False, True])
-def test_anthropic_thinking_with_reasoning_model(stream):
-    """Test that thinking content blocks are properly returned for reasoning models"""
-    global server
-    server = ServerProcess()
-    server.model_hf_repo = "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF"
-    server.model_hf_file = "DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf"
-    server.reasoning_format = "deepseek"
-    server.jinja = True
-    server.n_ctx = 8192
-    server.n_predict = 1024
-    server.server_port = 8084
-    server.start(timeout_seconds=600)  # large model needs time to download
-
-    if stream:
-        res = server.make_stream_request("POST", "/v1/messages", data={
-            "model": "test",
-            "max_tokens": 1024,
-            "thinking": {
-                "type": "enabled",
-                "budget_tokens": 500
-            },
-            "messages": [
-                {"role": "user", "content": "What is 2+2?"}
-            ],
-            "stream": True
-        })
-
-        events = list(res)
-
-        # should have thinking content block events
-        thinking_starts = [e for e in events if
-            e.get("type") == "content_block_start" and
-            e.get("content_block", {}).get("type") == "thinking"]
-        assert len(thinking_starts) > 0, "Should have thinking content_block_start event"
-        assert thinking_starts[0]["index"] == 0, "Thinking block should be at index 0"
-
-        # should have thinking_delta events
-        thinking_deltas = [e for e in events if
-            e.get("type") == "content_block_delta" and
-            e.get("delta", {}).get("type") == "thinking_delta"]
-        assert len(thinking_deltas) > 0, "Should have thinking_delta events"
-
-        # should have signature_delta event before thinking block closes (Anthropic API requirement)
-        signature_deltas = [e for e in events if
-            e.get("type") == "content_block_delta" and
-            e.get("delta", {}).get("type") == "signature_delta"]
-        assert len(signature_deltas) > 0, "Should have signature_delta event for thinking block"
-
-        # should have text block after thinking
-        text_starts = [e for e in events if
-            e.get("type") == "content_block_start" and
-            e.get("content_block", {}).get("type") == "text"]
-        assert len(text_starts) > 0, "Should have text content_block_start event"
-        assert text_starts[0]["index"] == 1, "Text block should be at index 1 (after thinking)"
-    else:
-        res = server.make_request("POST", "/v1/messages", data={
-            "model": "test",
-            "max_tokens": 1024,
-            "thinking": {
-                "type": "enabled",
-                "budget_tokens": 500
-            },
-            "messages": [
-                {"role": "user", "content": "What is 2+2?"}
-            ]
-        })
-
-        assert res.status_code == 200
-        assert res.body["type"] == "message"
-
-        content = res.body["content"]
-        assert len(content) >= 2, "Should have at least thinking and text blocks"
-
-        # first block should be thinking
-        thinking_blocks = [b for b in content if b.get("type") == "thinking"]
-        assert len(thinking_blocks) > 0, "Should have thinking content block"
-        assert "thinking" in thinking_blocks[0], "Thinking block should have 'thinking' field"
-        assert len(thinking_blocks[0]["thinking"]) > 0, "Thinking content should not be empty"
-        assert "signature" in thinking_blocks[0], "Thinking block should have 'signature' field (Anthropic API requirement)"
-
-        # should also have text block
-        text_blocks = [b for b in content if b.get("type") == "text"]
-        assert len(text_blocks) > 0, "Should have text content block"
--- a/tools/server/tests/unit/test_completion.py
+++ b/tools/server/tests/unit/test_completion.py
@@ -393,12 +393,12 @@ def test_completion_unified(n_ctx, n_slots, n_predict_vals, expected_success):
    for res, n_predict, expect_ok in zip(results, n_predict_vals, expected_success):
        if expect_ok:
            assert res.status_code == 200
-
-        # note: https://github.com/ggml-org/llama.cpp/pull/18700#issuecomment-3728695581
-        if res.status_code == 200:
            assert "content" in res.body
            if "timings" in res.body:
                assert res.body["timings"]["predicted_n"] == n_predict
+        else:
+            assert res.status_code == 500
+            assert "content" not in res.body


@pytest.mark.parametrize(
--- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
@@ -10,11 +10,21 @@
 	import { INPUT_CLASSES } from '$lib/constants/input-classes';
 	import { SETTING_CONFIG_DEFAULT } from '$lib/constants/settings-config';
 	import { config } from '$lib/stores/settings.svelte';
-	import { modelOptions, selectedModelId } from '$lib/stores/models.svelte';
+	import { modelsStore, modelOptions, selectedModelId } from '$lib/stores/models.svelte';
 	import { isRouterMode } from '$lib/stores/server.svelte';
 	import { chatStore } from '$lib/stores/chat.svelte';
 	import { activeMessages } from '$lib/stores/conversations.svelte';
-	import { MimeTypeText } from '$lib/enums';
+	import {
+		FileTypeCategory,
+		MimeTypeApplication,
+		FileExtensionAudio,
+		FileExtensionImage,
+		FileExtensionPdf,
+		FileExtensionText,
+		MimeTypeAudio,
+		MimeTypeImage,
+		MimeTypeText
+	} from '$lib/enums';
 	import { isIMEComposing, parseClipboardContent } from '$lib/utils';
 	import {
 		AudioRecorder,
@@ -51,6 +61,7 @@
 	let audioRecorder: AudioRecorder | undefined;
 	let chatFormActionsRef: ChatFormActions | undefined = $state(undefined);
 	let currentConfig = $derived(config());
+	let fileAcceptString = $state<string | undefined>(undefined);
 	let fileInputRef: ChatFormFileInputInvisible | undefined = $state(undefined);
 	let isRecording = $state(false);
 	let message = $state('');
@@ -93,6 +104,40 @@
 		return null;
 	});

+	// State for model props reactivity
+	let modelPropsVersion = $state(0);
+
+	// Fetch model props when active model changes (works for both MODEL and ROUTER mode)
+	$effect(() => {
+		if (activeModelId) {
+			const cached = modelsStore.getModelProps(activeModelId);
+			if (!cached) {
+				modelsStore.fetchModelProps(activeModelId).then(() => {
+					modelPropsVersion++;
+				});
+			}
+		}
+	});
+
+	// Derive modalities from active model (works for both MODEL and ROUTER mode)
+	let hasAudioModality = $derived.by(() => {
+		if (activeModelId) {
+			void modelPropsVersion; // Trigger reactivity on props fetch
+			return modelsStore.modelSupportsAudio(activeModelId);
+		}
+
+		return false;
+	});
+
+	let hasVisionModality = $derived.by(() => {
+		if (activeModelId) {
+			void modelPropsVersion; // Trigger reactivity on props fetch
+			return modelsStore.modelSupportsVision(activeModelId);
+		}
+
+		return false;
+	});
+
 	function checkModelSelected(): boolean {
 		if (!hasModelSelected) {
 			// Open the model selector
@@ -103,12 +148,42 @@
 		return true;
 	}

+	function getAcceptStringForFileType(fileType: FileTypeCategory): string {
+		switch (fileType) {
+			case FileTypeCategory.IMAGE:
+				return [...Object.values(FileExtensionImage), ...Object.values(MimeTypeImage)].join(',');
+
+			case FileTypeCategory.AUDIO:
+				return [...Object.values(FileExtensionAudio), ...Object.values(MimeTypeAudio)].join(',');
+
+			case FileTypeCategory.PDF:
+				return [...Object.values(FileExtensionPdf), ...Object.values(MimeTypeApplication)].join(
+					','
+				);
+
+			case FileTypeCategory.TEXT:
+				return [...Object.values(FileExtensionText), MimeTypeText.PLAIN].join(',');
+
+			default:
+				return '';
+		}
+	}
+
 	function handleFileSelect(files: File[]) {
 		onFileUpload?.(files);
 	}

-	function handleFileUpload() {
-		fileInputRef?.click();
+	function handleFileUpload(fileType?: FileTypeCategory) {
+		if (fileType) {
+			fileAcceptString = getAcceptStringForFileType(fileType);
+		} else {
+			fileAcceptString = undefined;
+		}
+
+		// Use setTimeout to ensure the accept attribute is applied before opening dialog
+		setTimeout(() => {
+			fileInputRef?.click();
+		}, 10);
 	}

 	async function handleKeydown(event: KeyboardEvent) {
@@ -268,7 +343,13 @@
 	});
 </script>

-<ChatFormFileInputInvisible bind:this={fileInputRef} onFileSelect={handleFileSelect} />
+<ChatFormFileInputInvisible
+	bind:this={fileInputRef}
+	bind:accept={fileAcceptString}
+	{hasAudioModality}
+	{hasVisionModality}
+	onFileSelect={handleFileSelect}
+/>

 <form
 	onsubmit={handleSubmit}
--- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionFileAttachments.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionFileAttachments.svelte
@@ -4,13 +4,14 @@
 	import * as DropdownMenu from '$lib/components/ui/dropdown-menu';
 	import * as Tooltip from '$lib/components/ui/tooltip';
 	import { FILE_TYPE_ICONS } from '$lib/constants/icons';
+	import { FileTypeCategory } from '$lib/enums';

 	interface Props {
 		class?: string;
 		disabled?: boolean;
 		hasAudioModality?: boolean;
 		hasVisionModality?: boolean;
-		onFileUpload?: () => void;
+		onFileUpload?: (fileType?: FileTypeCategory) => void;
 	}

 	let {
@@ -26,6 +27,10 @@
 			? 'Text files and PDFs supported. Images, audio, and video require vision models.'
 			: 'Attach files';
 	});
+
+	function handleFileUpload(fileType?: FileTypeCategory) {
+		onFileUpload?.(fileType);
+	}
 </script>

 <div class="flex items-center gap-1 {className}">
@@ -56,7 +61,7 @@
 					<DropdownMenu.Item
 						class="images-button flex cursor-pointer items-center gap-2"
 						disabled={!hasVisionModality}
-						onclick={() => onFileUpload?.()}
+						onclick={() => handleFileUpload(FileTypeCategory.IMAGE)}
 					>
 						<FILE_TYPE_ICONS.image class="h-4 w-4" />

@@ -76,7 +81,7 @@
 					<DropdownMenu.Item
 						class="audio-button flex cursor-pointer items-center gap-2"
 						disabled={!hasAudioModality}
-						onclick={() => onFileUpload?.()}
+						onclick={() => handleFileUpload(FileTypeCategory.AUDIO)}
 					>
 						<FILE_TYPE_ICONS.audio class="h-4 w-4" />

@@ -93,7 +98,7 @@

 			<DropdownMenu.Item
 				class="flex cursor-pointer items-center gap-2"
-				onclick={() => onFileUpload?.()}
+				onclick={() => handleFileUpload(FileTypeCategory.TEXT)}
 			>
 				<FILE_TYPE_ICONS.text class="h-4 w-4" />

@@ -104,7 +109,7 @@
 				<Tooltip.Trigger class="w-full">
 					<DropdownMenu.Item
 						class="flex cursor-pointer items-center gap-2"
-						onclick={() => onFileUpload?.()}
+						onclick={() => handleFileUpload(FileTypeCategory.PDF)}
 					>
 						<FILE_TYPE_ICONS.pdf class="h-4 w-4" />

--- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte
@@ -24,7 +24,7 @@
 		isRecording?: boolean;
 		hasText?: boolean;
 		uploadedFiles?: ChatUploadedFile[];
-		onFileUpload?: () => void;
+		onFileUpload?: (fileType?: FileTypeCategory) => void;
 		onMicClick?: () => void;
 		onStop?: () => void;
 	}
--- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormFileInputInvisible.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormFileInputInvisible.svelte
@@ -1,14 +1,35 @@
 <script lang="ts">
+	import { generateModalityAwareAcceptString } from '$lib/utils';
+
 	interface Props {
+		accept?: string;
 		class?: string;
+		hasAudioModality?: boolean;
+		hasVisionModality?: boolean;
 		multiple?: boolean;
 		onFileSelect?: (files: File[]) => void;
 	}

-	let { class: className = '', multiple = true, onFileSelect }: Props = $props();
+	let {
+		accept = $bindable(),
+		class: className = '',
+		hasAudioModality = false,
+		hasVisionModality = false,
+		multiple = true,
+		onFileSelect
+	}: Props = $props();

 	let fileInputElement: HTMLInputElement | undefined;

+	// Use modality-aware accept string by default, but allow override
+	let finalAccept = $derived(
+		accept ??
+			generateModalityAwareAcceptString({
+				hasVision: hasVisionModality,
+				hasAudio: hasAudioModality
+			})
+	);
+
 	export function click() {
 		fileInputElement?.click();
 	}
@@ -25,6 +46,7 @@
 	bind:this={fileInputElement}
 	type="file"
 	{multiple}
+	accept={finalAccept}
 	onchange={handleFileSelect}
 	class="hidden {className}"
 />
--- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte
@@ -89,7 +89,6 @@
 	const fallbackToolCalls = $derived(typeof toolCallContent === 'string' ? toolCallContent : null);

 	const processingState = useProcessingState();
-
 	let currentConfig = $derived(config());
 	let isRouter = $derived(isRouterMode());
 	let displayedModel = $derived((): string | null => {
@@ -117,12 +116,6 @@
 		}
 	});

-	$effect(() => {
-		if (isLoading() && !message?.content?.trim()) {
-			processingState.startMonitoring();
-		}
-	});
-
 	function formatToolCallBadge(toolCall: ApiChatCompletionToolCall, index: number) {
 		const callNumber = index + 1;
 		const functionName = toolCall.function?.name?.trim();
@@ -193,7 +186,7 @@
 		<div class="mt-6 w-full max-w-[48rem]" in:fade>
 			<div class="processing-container">
 				<span class="processing-text">
-					{processingState.getPromptProgressText() ?? processingState.getProcessingMessage()}
+					{processingState.getProcessingMessage()}
 				</span>
 			</div>
 		</div>
@@ -270,23 +263,6 @@
 						predictedTokens={message.timings.predicted_n}
 						predictedMs={message.timings.predicted_ms}
 					/>
-				{:else if isLoading() && currentConfig.showMessageStats}
-					{@const liveStats = processingState.getLiveProcessingStats()}
-					{@const genStats = processingState.getLiveGenerationStats()}
-					{@const promptProgress = processingState.processingState?.promptProgress}
-					{@const isStillProcessingPrompt =
-						promptProgress && promptProgress.processed < promptProgress.total}
-
-					{#if liveStats || genStats}
-						<ChatMessageStatistics
-							isLive={true}
-							isProcessingPrompt={!!isStillProcessingPrompt}
-							promptTokens={liveStats?.tokensProcessed}
-							promptMs={liveStats?.timeMs}
-							predictedTokens={genStats?.tokensGenerated}
-							predictedMs={genStats?.timeMs}
-						/>
-					{/if}
 				{/if}
 			</div>
 		{/if}
--- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte
@@ -5,64 +5,21 @@
 	import { ChatMessageStatsView } from '$lib/enums';

 	interface Props {
-		predictedTokens?: number;
-		predictedMs?: number;
+		predictedTokens: number;
+		predictedMs: number;
 		promptTokens?: number;
 		promptMs?: number;
-		// Live mode: when true, shows stats during streaming
-		isLive?: boolean;
-		// Whether prompt processing is still in progress
-		isProcessingPrompt?: boolean;
-		// Initial view to show (defaults to READING in live mode)
-		initialView?: ChatMessageStatsView;
 	}

-	let {
-		predictedTokens,
-		predictedMs,
-		promptTokens,
-		promptMs,
-		isLive = false,
-		isProcessingPrompt = false,
-		initialView = ChatMessageStatsView.GENERATION
-	}: Props = $props();
+	let { predictedTokens, predictedMs, promptTokens, promptMs }: Props = $props();

-	let activeView: ChatMessageStatsView = $state(initialView);
-	let hasAutoSwitchedToGeneration = $state(false);
+	let activeView: ChatMessageStatsView = $state(ChatMessageStatsView.GENERATION);

-	// In live mode: auto-switch to GENERATION tab when prompt processing completes
-	$effect(() => {
-		if (isLive) {
-			// Auto-switch to generation tab only when prompt processing is done (once)
-			if (
-				!hasAutoSwitchedToGeneration &&
-				!isProcessingPrompt &&
-				predictedTokens &&
-				predictedTokens > 0
-			) {
-				activeView = ChatMessageStatsView.GENERATION;
-				hasAutoSwitchedToGeneration = true;
-			} else if (!hasAutoSwitchedToGeneration) {
-				// Stay on READING while prompt is still being processed
-				activeView = ChatMessageStatsView.READING;
-			}
-		}
-	});
-
-	let hasGenerationStats = $derived(
-		predictedTokens !== undefined &&
-			predictedTokens > 0 &&
-			predictedMs !== undefined &&
-			predictedMs > 0
-	);
-
-	let tokensPerSecond = $derived(hasGenerationStats ? (predictedTokens! / predictedMs!) * 1000 : 0);
-	let timeInSeconds = $derived(
-		predictedMs !== undefined ? (predictedMs / 1000).toFixed(2) : '0.00'
-	);
+	let tokensPerSecond = $derived(predictedMs > 0 ? (predictedTokens / predictedMs) * 1000 : 0);
+	let timeInSeconds = $derived((predictedMs / 1000).toFixed(2));

 	let promptTokensPerSecond = $derived(
 		promptTokens !== undefined && promptMs !== undefined && promptMs > 0
 			? (promptTokens / promptMs) * 1000
 			: undefined
 	);
@@ -77,14 +34,11 @@
 			promptTokensPerSecond !== undefined &&
 			promptTimeInSeconds !== undefined
 	);
-
-	// In live mode, generation tab is disabled until we have generation stats
-	let isGenerationDisabled = $derived(isLive && !hasGenerationStats);
 </script>

 <div class="inline-flex items-center text-xs text-muted-foreground">
 	<div class="inline-flex items-center rounded-sm bg-muted-foreground/15 p-0.5">
-		{#if hasPromptStats || isLive}
+		{#if hasPromptStats}
 			<Tooltip.Root>
 				<Tooltip.Trigger>
 					<button
@@ -111,32 +65,25 @@
 					class="inline-flex h-5 w-5 items-center justify-center rounded-sm transition-colors {activeView ===
 					ChatMessageStatsView.GENERATION
 						? 'bg-background text-foreground shadow-sm'
-						: isGenerationDisabled
-							? 'cursor-not-allowed opacity-40'
-							: 'hover:text-foreground'}"
-					onclick={() => !isGenerationDisabled && (activeView = ChatMessageStatsView.GENERATION)}
-					disabled={isGenerationDisabled}
+						: 'hover:text-foreground'}"
+					onclick={() => (activeView = ChatMessageStatsView.GENERATION)}
 				>
 					<Sparkles class="h-3 w-3" />
 					<span class="sr-only">Generation</span>
 				</button>
 			</Tooltip.Trigger>
 			<Tooltip.Content>
-				<p>
-					{isGenerationDisabled
-						? 'Generation (waiting for tokens...)'
-						: 'Generation (token output)'}
-				</p>
+				<p>Generation (token output)</p>
 			</Tooltip.Content>
 		</Tooltip.Root>
 	</div>

 	<div class="flex items-center gap-1 px-2">
-		{#if activeView === ChatMessageStatsView.GENERATION && hasGenerationStats}
+		{#if activeView === ChatMessageStatsView.GENERATION}
 			<BadgeChatStatistic
 				class="bg-transparent"
 				icon={WholeWord}
-				value="{predictedTokens?.toLocaleString()} tokens"
+				value="{predictedTokens} tokens"
 				tooltipLabel="Generated tokens"
 			/>
 			<BadgeChatStatistic
--- a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte
@@ -185,11 +185,6 @@
 					key: 'samplers',
 					label: 'Samplers',
 					type: 'input'
-				},
-				{
-					key: 'backend_sampling',
-					label: 'Backend sampling',
-					type: 'checkbox'
 				}
 			]
 		},
--- a/tools/server/webui/src/lib/constants/settings-config.ts
+++ b/tools/server/webui/src/lib/constants/settings-config.ts
@@ -21,7 +21,6 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
 	autoMicOnEmpty: false,
 	// make sure these default values are in sync with `common.h`
 	samplers: 'top_k;typ_p;top_p;min_p;temperature',
-	backend_sampling: false,
 	temperature: 0.8,
 	dynatemp_range: 0.0,
 	dynatemp_exponent: 1.0,
@@ -58,8 +57,6 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
 		'When copying a message with text attachments, combine them into a single plain text string instead of a special format that can be pasted back as attachments.',
 	samplers:
 		'The order at which samplers are applied, in simplified way. Default is "top_k;typ_p;top_p;min_p;temperature": top_k->typ_p->top_p->min_p->temperature',
-	backend_sampling:
-		'Enable backend-based samplers. When enabled, supported samplers run on the accelerator backend for faster sampling.',
 	temperature:
 		'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
 	dynatemp_range:
--- a/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts
+++ b/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts
@@ -1,27 +1,10 @@
 import { activeProcessingState } from '$lib/stores/chat.svelte';
 import { config } from '$lib/stores/settings.svelte';

-export interface LiveProcessingStats {
-	tokensProcessed: number;
-	totalTokens: number;
-	timeMs: number;
-	tokensPerSecond: number;
-	etaSecs?: number;
-}
-
-export interface LiveGenerationStats {
-	tokensGenerated: number;
-	timeMs: number;
-	tokensPerSecond: number;
-}
-
 export interface UseProcessingStateReturn {
 	readonly processingState: ApiProcessingState | null;
 	getProcessingDetails(): string[];
 	getProcessingMessage(): string;
-	getPromptProgressText(): string | null;
-	getLiveProcessingStats(): LiveProcessingStats | null;
-	getLiveGenerationStats(): LiveGenerationStats | null;
 	shouldShowDetails(): boolean;
 	startMonitoring(): void;
 	stopMonitoring(): void;
@@ -46,7 +29,6 @@ export interface UseProcessingStateReturn {
 export function useProcessingState(): UseProcessingStateReturn {
 	let isMonitoring = $state(false);
 	let lastKnownState = $state<ApiProcessingState | null>(null);
-	let lastKnownProcessingStats = $state<LiveProcessingStats | null>(null);

 	// Derive processing state reactively from chatStore's direct state
 	const processingState = $derived.by(() => {
@@ -64,34 +46,6 @@ export function useProcessingState(): UseProcessingStateReturn {
 		}
 	});

-	// Track last known processing stats for when promptProgress disappears
-	$effect(() => {
-		if (processingState?.promptProgress) {
-			const { processed, total, time_ms, cache } = processingState.promptProgress;
-			const actualProcessed = processed - cache;
-			const actualTotal = total - cache;
-
-			if (actualProcessed > 0 && time_ms > 0) {
-				const tokensPerSecond = actualProcessed / (time_ms / 1000);
-				lastKnownProcessingStats = {
-					tokensProcessed: actualProcessed,
-					totalTokens: actualTotal,
-					timeMs: time_ms,
-					tokensPerSecond
-				};
-			}
-		}
-	});
-
-	function getETASecs(done: number, total: number, elapsedMs: number): number | undefined {
-		const elapsedSecs = elapsedMs / 1000;
-		const progressETASecs =
-			done === 0 || elapsedSecs < 0.5
-				? undefined // can be the case for the 0% progress report
-				: elapsedSecs * (total / done - 1);
-		return progressETASecs;
-	}
-
 	function startMonitoring(): void {
 		if (isMonitoring) return;
 		isMonitoring = true;
@@ -105,25 +59,28 @@ export function useProcessingState(): UseProcessingStateReturn {
 		const currentConfig = config();
 		if (!currentConfig.keepStatsVisible) {
 			lastKnownState = null;
-			lastKnownProcessingStats = null;
 		}
 	}

 	function getProcessingMessage(): string {
-		if (!processingState) {
+		const state = processingState;
+		if (!state) {
 			return 'Processing...';
 		}

-		switch (processingState.status) {
+		switch (state.status) {
 			case 'initializing':
 				return 'Initializing...';
 			case 'preparing':
-				if (processingState.progressPercent !== undefined) {
-					return `Processing (${processingState.progressPercent}%)`;
+				if (state.progressPercent !== undefined) {
+					return `Processing (${state.progressPercent}%)`;
 				}
 				return 'Preparing response...';
 			case 'generating':
-				return '';
+				if (state.tokensDecoded > 0) {
+					return `Generating... (${state.tokensDecoded} tokens)`;
+				}
+				return 'Generating...';
 			default:
 				return 'Processing...';
 		}
@@ -174,76 +131,8 @@ export function useProcessingState(): UseProcessingStateReturn {
 	}

 	function shouldShowDetails(): boolean {
-		return processingState !== null && processingState.status !== 'idle';
-	}
-
-	/**
-	 * Returns a short progress message with percent
-	 */
-	function getPromptProgressText(): string | null {
-		if (!processingState?.promptProgress) return null;
-
-		const { processed, total, cache } = processingState.promptProgress;
-
-		const actualProcessed = processed - cache;
-		const actualTotal = total - cache;
-		const percent = Math.round((actualProcessed / actualTotal) * 100);
-		const eta = getETASecs(actualProcessed, actualTotal, processingState.promptProgress.time_ms);
-
-		if (eta !== undefined) {
-			const etaSecs = Math.ceil(eta);
-			return `Processing ${percent}% (ETA: ${etaSecs}s)`;
-		}
-
-		return `Processing ${percent}%`;
-	}
-
-	/**
-	 * Returns live processing statistics for display (prompt processing phase)
-	 * Returns last known stats when promptProgress becomes unavailable
-	 */
-	function getLiveProcessingStats(): LiveProcessingStats | null {
-		if (processingState?.promptProgress) {
-			const { processed, total, time_ms, cache } = processingState.promptProgress;
-
-			const actualProcessed = processed - cache;
-			const actualTotal = total - cache;
-
-			if (actualProcessed > 0 && time_ms > 0) {
-				const tokensPerSecond = actualProcessed / (time_ms / 1000);
-
-				return {
-					tokensProcessed: actualProcessed,
-					totalTokens: actualTotal,
-					timeMs: time_ms,
-					tokensPerSecond
-				};
-			}
-		}
-
-		// Return last known stats if promptProgress is no longer available
-		return lastKnownProcessingStats;
-	}
-
-	/**
-	 * Returns live generation statistics for display (token generation phase)
-	 */
-	function getLiveGenerationStats(): LiveGenerationStats | null {
-		if (!processingState) return null;
-
-		const { tokensDecoded, tokensPerSecond } = processingState;
-
-		if (tokensDecoded <= 0) return null;
-
-		// Calculate time from tokens and speed
-		const timeMs =
-			tokensPerSecond && tokensPerSecond > 0 ? (tokensDecoded / tokensPerSecond) * 1000 : 0;
-
-		return {
-			tokensGenerated: tokensDecoded,
-			timeMs,
-			tokensPerSecond: tokensPerSecond || 0
-		};
+		const state = processingState;
+		return state !== null && state.status !== 'idle';
 	}

 	return {
@@ -252,9 +141,6 @@ export function useProcessingState(): UseProcessingStateReturn {
 		},
 		getProcessingDetails,
 		getProcessingMessage,
-		getPromptProgressText,
-		getLiveProcessingStats,
-		getLiveGenerationStats,
 		shouldShowDetails,
 		startMonitoring,
 		stopMonitoring
--- a/tools/server/webui/src/lib/services/chat.ts
+++ b/tools/server/webui/src/lib/services/chat.ts
@@ -86,7 +86,6 @@ export class ChatService {
 			dry_penalty_last_n,
 			// Other parameters
 			samplers,
-			backend_sampling,
 			custom,
 			timings_per_token,
 			// Config options
@@ -118,8 +117,7 @@ export class ChatService {
 				role: msg.role,
 				content: msg.content
 			})),
-			stream,
-			return_progress: stream ? true : undefined
+			stream
 		};

 		// Include model in request if provided (required in ROUTER mode)
@@ -160,8 +158,6 @@ export class ChatService {
 					: samplers;
 		}

-		if (backend_sampling !== undefined) requestBody.backend_sampling = backend_sampling;
-
 		if (timings_per_token !== undefined) requestBody.timings_per_token = timings_per_token;

 		if (custom) {
@@ -275,7 +271,7 @@ export class ChatService {
 		onReasoningChunk?: (chunk: string) => void,
 		onToolCallChunk?: (chunk: string) => void,
 		onModel?: (model: string) => void,
-		onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void,
+		onTimings?: (timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void,
 		conversationId?: string,
 		abortSignal?: AbortSignal
 	): Promise<void> {
@@ -370,13 +366,11 @@ export class ChatService {
 								onModel?.(chunkModel);
 							}

-							if (promptProgress) {
-								ChatService.notifyTimings(undefined, promptProgress, onTimings);
-							}
-
-							if (timings) {
+							if (timings || promptProgress) {
 								ChatService.notifyTimings(timings, promptProgress, onTimings);
-								lastTimings = timings;
+								if (timings) {
+									lastTimings = timings;
+								}
 							}

 							if (content) {
@@ -774,11 +768,10 @@ export class ChatService {
 		timings: ChatMessageTimings | undefined,
 		promptProgress: ChatMessagePromptProgress | undefined,
 		onTimingsCallback:
-			| ((timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void)
+			| ((timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void)
 			| undefined
 	): void {
-		if (!onTimingsCallback || (!timings && !promptProgress)) return;
-
+		if (!timings || !onTimingsCallback) return;
 		onTimingsCallback(timings, promptProgress);
 	}
 }
--- a/tools/server/webui/src/lib/stores/chat.svelte.ts
+++ b/tools/server/webui/src/lib/stores/chat.svelte.ts
@@ -303,17 +303,11 @@ class ChatStore {
 		const currentConfig = config();
 		const outputTokensMax = currentConfig.max_tokens || -1;

-		// Note: for timings data, the n_prompt does NOT include cache tokens
 		const contextUsed = promptTokens + cacheTokens + predictedTokens;
 		const outputTokensUsed = predictedTokens;

-		// Note: for prompt progress, the "processed" DOES include cache tokens
-		// we need to exclude them to get the real prompt tokens processed count
-		const progressCache = promptProgress?.cache || 0;
-		const progressActualDone = (promptProgress?.processed ?? 0) - progressCache;
-		const progressActualTotal = (promptProgress?.total ?? 0) - progressCache;
 		const progressPercent = promptProgress
-			? Math.round((progressActualDone / progressActualTotal) * 100)
+			? Math.round((promptProgress.processed / promptProgress.total) * 100)
 			: undefined;

 		return {
@@ -330,7 +324,6 @@ class ChatStore {
 			topP: currentConfig.top_p ?? 0.95,
 			speculative: false,
 			progressPercent,
-			promptProgress,
 			promptTokens,
 			promptMs,
 			cacheTokens
@@ -541,7 +534,7 @@ class ChatStore {
 					conversationsStore.updateMessageAtIndex(idx, { toolCalls: streamedToolCallContent });
 				},
 				onModel: (modelName: string) => recordModel(modelName),
-				onTimings: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
+				onTimings: (timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
 					const tokensPerSecond =
 						timings?.predicted_ms && timings?.predicted_n
 							? (timings.predicted_n / timings.predicted_ms) * 1000
@@ -1039,7 +1032,7 @@ class ChatStore {
 						});
 					},

-					onTimings: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
+					onTimings: (timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
 						const tokensPerSecond =
 							timings?.predicted_ms && timings?.predicted_n
 								? (timings.predicted_n / timings.predicted_ms) * 1000
@@ -1461,8 +1454,6 @@ class ChatStore {
 		if (hasValue(currentConfig.dry_penalty_last_n))
 			apiOptions.dry_penalty_last_n = Number(currentConfig.dry_penalty_last_n);
 		if (currentConfig.samplers) apiOptions.samplers = currentConfig.samplers;
-		if (currentConfig.backend_sampling)
-			apiOptions.backend_sampling = currentConfig.backend_sampling;
 		if (currentConfig.custom) apiOptions.custom = currentConfig.custom;

 		return apiOptions;
--- a/tools/server/webui/src/lib/stores/settings.svelte.ts
+++ b/tools/server/webui/src/lib/stores/settings.svelte.ts
@@ -294,14 +294,15 @@ class SettingsStore {
 	 * This sets up the default values from /props endpoint
 	 */
 	syncWithServerDefaults(): void {
-		const propsDefaults = this.getServerDefaults();
-
-		if (Object.keys(propsDefaults).length === 0) {
-			console.warn('No server defaults available for initialization');
+		const serverParams = serverStore.defaultParams;
+		if (!serverParams) {
+			console.warn('No server parameters available for initialization');

 			return;
 		}

+		const propsDefaults = this.getServerDefaults();
+
 		for (const [key, propsValue] of Object.entries(propsDefaults)) {
 			const currentValue = getConfigValue(this.config, key);

--- a/tools/server/webui/src/lib/types/api.d.ts
+++ b/tools/server/webui/src/lib/types/api.d.ts
@@ -149,7 +149,6 @@ export interface ApiLlamaCppServerProps {
 			reasoning_in_content: boolean;
 			thinking_forced_open: boolean;
 			samplers: string[];
-			backend_sampling: boolean;
 			'speculative.n_max': number;
 			'speculative.n_min': number;
 			'speculative.p_min': number;
@@ -187,7 +186,6 @@ export interface ApiChatCompletionRequest {
 	}>;
 	stream?: boolean;
 	model?: string;
-	return_progress?: boolean;
 	// Reasoning parameters
 	reasoning_format?: string;
 	// Generation parameters
@@ -213,7 +211,6 @@ export interface ApiChatCompletionRequest {
 	dry_penalty_last_n?: number;
 	// Sampler configuration
 	samplers?: string[];
-	backend_sampling?: boolean;
 	// Custom parameters (JSON string)
 	custom?: Record<string, unknown>;
 	timings_per_token?: boolean;
@@ -314,7 +311,6 @@ export interface ApiSlotData {
 		reasoning_in_content: boolean;
 		thinking_forced_open: boolean;
 		samplers: string[];
-		backend_sampling: boolean;
 		'speculative.n_max': number;
 		'speculative.n_min': number;
 		'speculative.p_min': number;
@@ -345,7 +341,6 @@ export interface ApiProcessingState {
 	tokensPerSecond?: number;
 	// Progress information from prompt_progress
 	progressPercent?: number;
-	promptProgress?: ChatMessagePromptProgress;
 	promptTokens?: number;
 	promptMs?: number;
 	cacheTokens?: number;
--- a/tools/server/webui/src/lib/types/settings.d.ts
+++ b/tools/server/webui/src/lib/types/settings.d.ts
@@ -43,7 +43,6 @@ export interface SettingsChatServiceOptions {
 	dry_penalty_last_n?: number;
 	// Sampler configuration
 	samplers?: string | string[];
-	backend_sampling?: boolean;
 	// Custom parameters
 	custom?: string;
 	timings_per_token?: boolean;
@@ -52,7 +51,7 @@ export interface SettingsChatServiceOptions {
 	onReasoningChunk?: (chunk: string) => void;
 	onToolCallChunk?: (chunk: string) => void;
 	onModel?: (model: string) => void;
-	onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void;
+	onTimings?: (timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void;
 	onComplete?: (
 		response: string,
 		reasoningContent?: string,
--- a/tools/server/webui/src/lib/utils/clipboard.ts
+++ b/tools/server/webui/src/lib/utils/clipboard.ts
@@ -65,7 +65,10 @@ export async function copyCodeToClipboard(
 	successMessage = 'Code copied to clipboard',
 	errorMessage = 'Failed to copy code'
 ): Promise<boolean> {
-	return copyToClipboard(rawCode, successMessage, errorMessage);
+	const doc = new DOMParser().parseFromString(rawCode, 'text/html');
+	const decodedCode = doc.body.textContent ?? rawCode;
+
+	return copyToClipboard(decodedCode, successMessage, errorMessage);
 }

 /**
--- a/tools/server/webui/src/lib/utils/file-type.ts
+++ b/tools/server/webui/src/lib/utils/file-type.ts
@@ -195,28 +195,9 @@ export function getFileTypeByExtension(filename: string): string | null {
 }

 export function isFileTypeSupported(filename: string, mimeType?: string): boolean {
-	// Images are detected and handled separately for vision models
-	if (mimeType) {
-		const category = getFileTypeCategory(mimeType);
-		if (
-			category === FileTypeCategory.IMAGE ||
-			category === FileTypeCategory.AUDIO ||
-			category === FileTypeCategory.PDF
-		) {
-			return true;
-		}
-	}
-
-	// Check extension for known types (especially images without MIME)
-	const extCategory = getFileTypeCategoryByExtension(filename);
-	if (
-		extCategory === FileTypeCategory.IMAGE ||
-		extCategory === FileTypeCategory.AUDIO ||
-		extCategory === FileTypeCategory.PDF
-	) {
+	if (mimeType && getFileTypeCategory(mimeType)) {
 		return true;
 	}

-	// Fallback: treat everything else as text (inclusive by default)
-	return true;
+	return getFileTypeByExtension(filename) !== null;
 }
--- a/tools/server/webui/src/lib/utils/index.ts
+++ b/tools/server/webui/src/lib/utils/index.ts
@@ -76,6 +76,7 @@ export {
 	isFileTypeSupportedByModel,
 	filterFilesByModalities,
 	generateModalityErrorMessage,
+	generateModalityAwareAcceptString,
 	type ModalityCapabilities
 } from './modality-file-validation';

--- a/tools/server/webui/src/lib/utils/modality-file-validation.ts
+++ b/tools/server/webui/src/lib/utils/modality-file-validation.ts
@@ -4,7 +4,17 @@
 */

 import { getFileTypeCategory } from '$lib/utils';
-import { FileTypeCategory } from '$lib/enums';
+import {
+	FileExtensionAudio,
+	FileExtensionImage,
+	FileExtensionPdf,
+	FileExtensionText,
+	MimeTypeAudio,
+	MimeTypeImage,
+	MimeTypeApplication,
+	MimeTypeText,
+	FileTypeCategory
+} from '$lib/enums';

 /** Modality capabilities for file validation */
 export interface ModalityCapabilities {
@@ -160,3 +170,29 @@ export function generateModalityErrorMessage(
 * @param capabilities - The modality capabilities to check against
 * @returns Accept string for HTML file input element
 */
+export function generateModalityAwareAcceptString(capabilities: ModalityCapabilities): string {
+	const { hasVision, hasAudio } = capabilities;
+
+	const acceptedExtensions: string[] = [];
+	const acceptedMimeTypes: string[] = [];
+
+	// Always include text files and PDFs
+	acceptedExtensions.push(...Object.values(FileExtensionText));
+	acceptedMimeTypes.push(...Object.values(MimeTypeText));
+	acceptedExtensions.push(...Object.values(FileExtensionPdf));
+	acceptedMimeTypes.push(...Object.values(MimeTypeApplication));
+
+	// Include images only if vision is supported
+	if (hasVision) {
+		acceptedExtensions.push(...Object.values(FileExtensionImage));
+		acceptedMimeTypes.push(...Object.values(MimeTypeImage));
+	}
+
+	// Include audio only if audio is supported
+	if (hasAudio) {
+		acceptedExtensions.push(...Object.values(FileExtensionAudio));
+		acceptedMimeTypes.push(...Object.values(MimeTypeAudio));
+	}
+
+	return [...acceptedExtensions, ...acceptedMimeTypes].join(',');
+}
--- a/tools/server/webui/src/lib/utils/process-uploaded-files.ts
+++ b/tools/server/webui/src/lib/utils/process-uploaded-files.ts
@@ -1,4 +1,5 @@
 import { isSvgMimeType, svgBase64UrlToPngDataURL } from './svg-to-png';
+import { isTextFileByName } from './text-files';
 import { isWebpMimeType, webpBase64UrlToPngDataURL } from './webp-to-png';
 import { FileTypeCategory } from '$lib/enums';
 import { modelsStore } from '$lib/stores/models.svelte';
@@ -83,6 +84,17 @@ export async function processFilesToChatUploaded(
 				}

 				results.push({ ...base, preview });
+			} else if (
+				getFileTypeCategory(file.type) === FileTypeCategory.TEXT ||
+				isTextFileByName(file.name)
+			) {
+				try {
+					const textContent = await readFileAsUTF8(file);
+					results.push({ ...base, textContent });
+				} catch (err) {
+					console.warn('Failed to read text file, adding without content:', err);
+					results.push(base);
+				}
 			} else if (getFileTypeCategory(file.type) === FileTypeCategory.PDF) {
 				// Extract text content from PDF for preview
 				try {
@@ -117,14 +129,8 @@ export async function processFilesToChatUploaded(
 				const preview = await readFileAsDataURL(file);
 				results.push({ ...base, preview });
 			} else {
-				// Fallback: treat unknown files as text
-				try {
-					const textContent = await readFileAsUTF8(file);
-					results.push({ ...base, textContent });
-				} catch (err) {
-					console.warn('Failed to read file as text, adding without content:', err);
-					results.push(base);
-				}
+				// Other files: add as-is
+				results.push(base);
 			}
 		} catch (error) {
 			console.error('Error processing file', file.name, error);
--- a/tools/server/webui/src/routes/+layout.svelte
+++ b/tools/server/webui/src/routes/+layout.svelte
@@ -119,7 +119,7 @@
 	$effect(() => {
 		const serverProps = serverStore.props;

-		if (serverProps) {
+		if (serverProps?.default_generation_settings?.params) {
 			settingsStore.syncWithServerDefaults();
 		}
 	});
--- a/tools/server/webui/tests/stories/ChatForm.stories.svelte
+++ b/tools/server/webui/tests/stories/ChatForm.stories.svelte
@@ -65,7 +65,10 @@
 		await expect(textarea).toHaveValue(text);

 		const fileInput = document.querySelector('input[type="file"]');
-		await expect(fileInput).not.toHaveAttribute('accept');
+		const acceptAttr = fileInput?.getAttribute('accept');
+		await expect(fileInput).toHaveAttribute('accept');
+		await expect(acceptAttr).not.toContain('image/');
+		await expect(acceptAttr).not.toContain('audio/');

 		// Open file attachments dropdown
 		const fileUploadButton = canvas.getByText('Attach files');
--- a/tools/tts/README.md
+++ b/tools/tts/README.md
@@ -4,7 +4,7 @@ This example demonstrates the Text To Speech feature. It uses a
 [outeai](https://www.outeai.com/).

 ## Quickstart
-If you have built llama.cpp with SSL support you can simply run the
+If you have built llama.cpp with `-DLLAMA_CURL=ON` you can simply run the
 following command and the required models will be downloaded automatically:
 ```console
 $ build/bin/llama-tts --tts-oute-default -p "Hello world" && aplay output.wav