sync from b7516

2026-01-16 11:16:14 +08:00
parent f4ae4cc7da
commit 6ee41dd9e3
380 changed files with 18435 additions and 38806 deletions
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -27,8 +27,6 @@ add_library(mtmd
            models/qwen3vl.cpp
            models/siglip.cpp
            models/whisper-enc.cpp
-            models/mobilenetv5.cpp
-            models/youtuvl.cpp
            )

 set_target_properties(mtmd PROPERTIES
--- a/tools/mtmd/clip-graph.h
+++ b/tools/mtmd/clip-graph.h
@@ -32,6 +32,10 @@ struct clip_graph {
    const float kq_scale;
    const clip_flash_attn_type flash_attn_type;

+    // for debugging
+    const bool debug_graph;
+    std::vector<ggml_tensor *> & debug_print_tensors;
+
    ggml_context_ptr ctx0_ptr;
    ggml_context * ctx0;
    ggml_cgraph * gf;
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -45,14 +45,13 @@
 #define KEY_SPATIAL_MERGE_SIZE  "clip.vision.spatial_merge_size"
 #define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"

-#define KEY_MM_PATCH_MERGE_TYPE    "clip.vision.mm_patch_merge_type"
-#define KEY_IMAGE_GRID_PINPOINTS   "clip.vision.image_grid_pinpoints"
-#define KEY_IMAGE_CROP_RESOLUTION  "clip.vision.image_crop_resolution"
-#define KEY_WIN_ATTN_PATTERN       "clip.vision.n_wa_pattern"
-#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
-#define KEY_ATTN_WINDOW_SIZE       "clip.vision.window_size"
-#define KEY_MINICPMV_VERSION       "clip.minicpmv_version"
-#define KEY_MINICPMV_QUERY_NUM     "clip.minicpmv_query_num"
+#define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type"
+#define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints"
+#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
+#define KEY_WIN_ATTN_PATTERN      "clip.vision.n_wa_pattern"
+#define KEY_ATTN_WINDOW_SIZE      "clip.vision.window_size"
+#define KEY_MINICPMV_VERSION      "clip.minicpmv_version"
+#define KEY_MINICPMV_QUERY_NUM    "clip.minicpmv_query_num"

 // audio-specific
 #define KEY_AUDIO_PROJ_TYPE     "clip.audio.projector_type" // for models with mixed modalities
@@ -154,47 +153,6 @@
 #define TN_CONV_PW1        "%s.blk.%d.conv_pw1.%s"
 #define TN_CONV_PW2        "%s.blk.%d.conv_pw2.%s"

-// mobilenetv5 (gemma3n) definitions
-#define TN_MNV5_STEM_CONV        "v.conv_stem.conv.weight"
-#define TN_MNV5_STEM_BIAS        "v.conv_stem.conv.bias"
-#define TN_MNV5_STEM_BN          "v.conv_stem.bn.weight"
-
-// Stage 0 Block (Edge Residual)
-#define TN_MNV5_BLK_S0_EXP_W     "v.blk.%d.%d.conv_exp.weight"
-#define TN_MNV5_BLK_S0_BN1_W     "v.blk.%d.%d.bn1.weight"
-#define TN_MNV5_BLK_S0_PWL_W     "v.blk.%d.%d.conv_pwl.weight"
-#define TN_MNV5_BLK_S0_BN2_W     "v.blk.%d.%d.bn2.weight"
-
-// Stage 1+ Block (Universal Inverted Residual)
-#define TN_MNV5_BLK_DW_START_W   "v.blk.%d.%d.dw_start.conv.weight"
-#define TN_MNV5_BLK_DW_START_BN  "v.blk.%d.%d.dw_start.bn.weight"
-#define TN_MNV5_BLK_DW_MID_W     "v.blk.%d.%d.dw_mid.conv.weight"
-#define TN_MNV5_BLK_DW_MID_BN    "v.blk.%d.%d.dw_mid.bn.weight"
-#define TN_MNV5_BLK_PW_EXP_W     "v.blk.%d.%d.pw_exp.conv.weight"
-#define TN_MNV5_BLK_PW_EXP_BN    "v.blk.%d.%d.pw_exp.bn.weight"
-#define TN_MNV5_BLK_PW_PROJ_W    "v.blk.%d.%d.pw_proj.conv.weight"
-#define TN_MNV5_BLK_PW_PROJ_BN   "v.blk.%d.%d.pw_proj.bn.weight"
-#define TN_MNV5_BLK_LAYER_SCALE  "v.blk.%d.%d.layer_scale.gamma"
-
-// Attention Components
-#define TN_MNV5_ATTN_Q_W         "v.blk.%d.%d.attn.query.proj.weight"
-#define TN_MNV5_ATTN_K_W         "v.blk.%d.%d.attn.key.proj.weight"
-#define TN_MNV5_ATTN_V_W         "v.blk.%d.%d.attn.value.proj.weight"
-#define TN_MNV5_ATTN_O_W         "v.blk.%d.%d.attn.output.proj.weight"
-#define TN_MNV5_ATTN_K_DW        "v.blk.%d.%d.attn.key.down_conv.weight"
-#define TN_MNV5_ATTN_K_NORM      "v.blk.%d.%d.attn.key.norm.weight"
-#define TN_MNV5_ATTN_V_DW        "v.blk.%d.%d.attn.value.down_conv.weight"
-#define TN_MNV5_ATTN_V_NORM      "v.blk.%d.%d.attn.value.norm.weight"
-#define TN_MNV5_ATTN_NORM        "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks
-
-// MSFA
-#define TN_MNV5_MSFA_FFN_EXP_W   "v.msfa.ffn.pw_exp.conv.weight"
-#define TN_MNV5_MSFA_FFN_EXP_BN  "v.msfa.ffn.pw_exp.bn.weight"
-#define TN_MNV5_MSFA_FFN_PROJ_W  "v.msfa.ffn.pw_proj.conv.weight"
-#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
-#define TN_MNV5_MSFA_NORM        "v.msfa.norm.weight"
-
-
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))

@@ -212,8 +170,6 @@ enum projector_type {
    PROJECTOR_TYPE_QWEN2VL,
    PROJECTOR_TYPE_QWEN3VL,
    PROJECTOR_TYPE_GEMMA3,
-    PROJECTOR_TYPE_GEMMA3NV,
-    PROJECTOR_TYPE_GEMMA3NA,
    PROJECTOR_TYPE_IDEFICS3,
    PROJECTOR_TYPE_PIXTRAL,
    PROJECTOR_TYPE_QWEN25VL,
@@ -224,7 +180,6 @@ enum projector_type {
    PROJECTOR_TYPE_GLMA,
    PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
    PROJECTOR_TYPE_VOXTRAL,
-    PROJECTOR_TYPE_MUSIC_FLAMINGO,
    PROJECTOR_TYPE_LFM2,
    PROJECTOR_TYPE_KIMIVL,
    PROJECTOR_TYPE_LIGHTONOCR,
@@ -232,7 +187,6 @@ enum projector_type {
    PROJECTOR_TYPE_JANUS_PRO,
    PROJECTOR_TYPE_LFM2A,
    PROJECTOR_TYPE_GLM4V,
-    PROJECTOR_TYPE_YOUTUVL,
    PROJECTOR_TYPE_UNKNOWN,
 };

@@ -246,8 +200,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_QWEN25VL,  "qwen2.5vl_merger"},
    { PROJECTOR_TYPE_QWEN3VL,   "qwen3vl_merger"},
    { PROJECTOR_TYPE_GEMMA3,    "gemma3"},
-    { PROJECTOR_TYPE_GEMMA3NV,  "gemma3nv"},
-    { PROJECTOR_TYPE_GEMMA3NA,  "gemma3na"},
    { PROJECTOR_TYPE_IDEFICS3,  "idefics3"},
    { PROJECTOR_TYPE_PIXTRAL,   "pixtral"},
    { PROJECTOR_TYPE_ULTRAVOX,  "ultravox"},
@@ -257,7 +209,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_GLMA,      "glma"},
    { PROJECTOR_TYPE_QWEN25O,   "qwen2.5o"},
    { PROJECTOR_TYPE_VOXTRAL,   "voxtral"},
-    { PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
    { PROJECTOR_TYPE_LFM2,      "lfm2"},
    { PROJECTOR_TYPE_KIMIVL,    "kimivl"},
    { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
@@ -265,7 +216,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
    { PROJECTOR_TYPE_LFM2A,     "lfm2a"},
    { PROJECTOR_TYPE_GLM4V,     "glm4v"},
-    { PROJECTOR_TYPE_YOUTUVL,   "youtuvl"},
 };

 static projector_type clip_projector_type_from_string(const std::string & str) {
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -61,7 +61,6 @@ struct clip_hparams {
    std::unordered_set<int32_t> vision_feature_layer;
    int32_t attn_window_size = 0;
    int32_t n_wa_pattern = 0;
-    std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)

    // audio
    int32_t n_mel_bins = 0; // whisper preprocessor
@@ -173,45 +172,6 @@ struct clip_layer {
    }
 };

-// Expanded MobileNetV5 block structure for Gemma3n vision encoder
-struct mobilenetv5_block {
-    // Stage 0 (Edge Residual)
-    ggml_tensor * s0_conv_exp_w = nullptr;
-    ggml_tensor * s0_bn1_w      = nullptr;
-    ggml_tensor * s0_conv_pwl_w = nullptr;
-    ggml_tensor * s0_bn2_w      = nullptr;
-
-    // Stage 1+ (Universal Inverted Residual)
-    ggml_tensor * dw_start_w    = nullptr;
-    ggml_tensor * dw_start_bn_w = nullptr;
-
-    ggml_tensor * pw_exp_w      = nullptr;
-    ggml_tensor * pw_exp_bn_w   = nullptr;
-
-    ggml_tensor * dw_mid_w      = nullptr;
-    ggml_tensor * dw_mid_bn_w   = nullptr;
-
-    ggml_tensor * pw_proj_w     = nullptr;
-    ggml_tensor * pw_proj_bn_w  = nullptr;
-
-    ggml_tensor * layer_scale_w = nullptr;
-
-    // Attention (MQA) components
-    ggml_tensor * attn_q_w = nullptr;
-    ggml_tensor * attn_k_w = nullptr;
-    ggml_tensor * attn_v_w = nullptr;
-    ggml_tensor * attn_o_w = nullptr;
-
-    // Optional downsampling/norm in attention
-    ggml_tensor * attn_k_dw_w   = nullptr;
-    ggml_tensor * attn_k_norm_w = nullptr;
-    ggml_tensor * attn_v_dw_w   = nullptr;
-    ggml_tensor * attn_v_norm_w = nullptr;
-
-    // Block norm (often present in attention blocks)
-    ggml_tensor * attn_norm_w   = nullptr;
-};
-
 struct clip_model {
    clip_modality modality = CLIP_MODALITY_VISION;
    projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -328,23 +288,6 @@ struct clip_model {
    ggml_tensor * mm_input_proj_w = nullptr;
    ggml_tensor * mm_soft_emb_norm_w = nullptr;

-    // mobilenetv5 for gemma3n
-    std::vector<mobilenetv5_block> mobilenet_blocks;
-    std::vector<int> mobilenet_stage_ends;
-    ggml_tensor * mobilenet_stem_conv_w = nullptr;
-    ggml_tensor * mobilenet_stem_conv_b = nullptr;
-    ggml_tensor * mobilenet_stem_norm_w = nullptr;
-    ggml_tensor * mm_post_proj_norm_w = nullptr;
-
-    // Multi-Scale Fusion Adapter (MSFA) components
-    ggml_tensor * msfa_concat_conv_w = nullptr;
-    ggml_tensor * msfa_concat_norm_w = nullptr;
-    ggml_tensor * msfa_ffn_expand_w = nullptr;
-    ggml_tensor * msfa_ffn_project_w = nullptr;
-    ggml_tensor * msfa_ffn_expand_bn = nullptr;
-    ggml_tensor * msfa_ffn_project_bn = nullptr;
-
-
    // pixtral, glm4v
    ggml_tensor * token_embd_img_break = nullptr;
    ggml_tensor * mm_patch_merger_w = nullptr;
@@ -376,8 +319,7 @@ struct clip_model {

    bool audio_has_avgpool() const {
        return proj_type == PROJECTOR_TYPE_QWEN2A
-            || proj_type == PROJECTOR_TYPE_VOXTRAL
-            || proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO;
+            || proj_type == PROJECTOR_TYPE_VOXTRAL;
    }

    bool audio_has_stack_frames() const {
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -152,14 +152,18 @@ struct clip_ctx {
    ggml_backend_t backend_cpu = nullptr;
    ggml_backend_buffer_ptr buf;

-
    int max_nodes = 8192;
    ggml_backend_sched_ptr sched;
    clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
    bool is_allocated = false;

+    // for debugging
+    bool debug_graph = false;
+    std::vector<ggml_tensor *> debug_print_tensors;
+
    clip_ctx(clip_context_params & ctx_params) {
        flash_attn_type = ctx_params.flash_attn_type;
+        debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
        backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
        if (!backend_cpu) {
            throw std::runtime_error("failed to initialize CPU backend");
@@ -200,10 +204,6 @@ struct clip_ctx {
        sched.reset(
            ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
        );
-
-        if (ctx_params.cb_eval != nullptr) {
-            ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data);
-        }
    }

    ~clip_ctx() {
@@ -239,7 +239,9 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
        n_mmproj_embd(clip_n_mmproj_embd(ctx)),
        eps(hparams.eps),
        kq_scale(1.0f / sqrtf((float)d_head)),
-        flash_attn_type(ctx->flash_attn_type) {
+        flash_attn_type(ctx->flash_attn_type),
+        debug_graph(ctx->debug_graph),
+        debug_print_tensors(ctx->debug_print_tensors) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ctx->buf_compute_meta.size(),
        /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
@@ -250,11 +252,14 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
    gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
 }

-void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
-    if (il >= 0) {
-        ggml_format_name(cur, "%s-%d", name, il);
-    } else {
-        ggml_set_name(cur, name);
+void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const {
+    if (debug_graph) {
+        ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0));
+        std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name;
+        ggml_set_name(cur, cur_name.c_str());
+        ggml_set_output(cur);
+        ggml_build_forward_expand(gf, cur);
+        debug_print_tensors.push_back(cur);
    }
 }

@@ -783,10 +788,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            {
                builder = std::make_unique<clip_graph_siglip>(ctx, img);
            } break;
-        case PROJECTOR_TYPE_GEMMA3NV:
-            {
-                builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
-            } break;
        case PROJECTOR_TYPE_PIXTRAL:
        case PROJECTOR_TYPE_LIGHTONOCR:
            {
@@ -817,7 +818,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_QWEN2A:
        case PROJECTOR_TYPE_GLMA:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            {
                builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
            } break;
@@ -845,10 +845,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            {
                builder = std::make_unique<clip_graph_glm4v>(ctx, img);
            } break;
-        case PROJECTOR_TYPE_YOUTUVL:
-            {
-                builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
-            } break;
        default:
            GGML_ABORT("missing cgraph builder");
    }
@@ -1145,14 +1141,6 @@ struct clip_model_loader {
                        // test model (tinygemma3) has a different value, we optionally read it
                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                    } break;
-
-                case PROJECTOR_TYPE_GEMMA3NV:
-                    {
-                        // Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
-                        // Similar configuration to Gemma3
-                        hparams.n_merge = 1;  // MobileNetV5 handles resizing internally
-                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
-                    } break;
                case PROJECTOR_TYPE_QWEN2VL:
                case PROJECTOR_TYPE_QWEN25VL:
                case PROJECTOR_TYPE_QWEN3VL:
@@ -1170,20 +1158,6 @@ struct clip_model_loader {
                            LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
                        }
                    } break;
-                case PROJECTOR_TYPE_YOUTUVL:
-                    {
-                        hparams.n_merge = 2;
-                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
-                        get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
-                        std::vector<int> wa_layer_indexes_vec;
-                        get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
-                        for (auto & layer : wa_layer_indexes_vec) {
-                            hparams.wa_layer_indexes.insert(layer);
-                        }
-                        // support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens
-                        hparams.set_limit_image_tokens(1, 62500);
-                        hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
-                    } break;
                case PROJECTOR_TYPE_GLM4V:
                    {
                        hparams.rope_theta = 10000.0f;
@@ -1202,7 +1176,6 @@ struct clip_model_loader {
                case PROJECTOR_TYPE_QWEN2A:
                case PROJECTOR_TYPE_GLMA:
                case PROJECTOR_TYPE_VOXTRAL:
-                case PROJECTOR_TYPE_MUSIC_FLAMINGO:
                    {
                        bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
                                             model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
@@ -1252,14 +1225,7 @@ struct clip_model_loader {
                LOG_INF("%s: has_llava_proj:     %d\n", __func__, hparams.has_llava_projector);
                LOG_INF("%s: minicpmv_version:   %d\n", __func__, hparams.minicpmv_version);
                LOG_INF("%s: n_merge:            %d\n", __func__, hparams.n_merge);
-                LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
-                if (!hparams.wa_layer_indexes.empty()) {
-                    LOG_INF("%s: wa_layer_indexes:  ", __func__);
-                    for (auto & layer : hparams.wa_layer_indexes) {
-                        LOG_INF("%d ", layer);
-                    }
-                    LOG_INF("\n");
-                }
+                LOG_INF("%s: n_wa_pattern:       %d\n", __func__, hparams.n_wa_pattern);
                if (hparams.image_min_pixels > 0) {
                    LOG_INF("%s: image_min_pixels:   %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
                }
@@ -1341,10 +1307,6 @@ struct clip_model_loader {

        model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);

-        if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
-            hparams.n_layer = 0; // gemma3n does not use normal layer structure
-        }
-
        // layers
        model.layers.resize(hparams.n_layer);
        for (int il = 0; il < hparams.n_layer; ++il) {
@@ -1419,7 +1381,6 @@ struct clip_model_loader {
            }
        }

-
        switch (model.proj_type) {
            case PROJECTOR_TYPE_MLP:
            case PROJECTOR_TYPE_MLP_NORM:
@@ -1514,8 +1475,8 @@ struct clip_model_loader {
                    model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
                    model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
                    model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
-                    model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI));
-                    model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI));
+                    model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
+                    model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
                } break;
            case PROJECTOR_TYPE_QWEN2VL:
            case PROJECTOR_TYPE_QWEN25VL:
@@ -1532,14 +1493,6 @@ struct clip_model_loader {
                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
                } break;
-            case PROJECTOR_TYPE_YOUTUVL:
-                {
-                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);        // merger.ln_q (RMS norm)
-                    model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));  // merger.mlp.0
-                    model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
-                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));  // merger.mlp.2
-                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
-                } break;
            case PROJECTOR_TYPE_GLM4V:
                {
                    model.projection     = get_tensor(TN_MM_PROJECTOR);
@@ -1559,112 +1512,11 @@ struct clip_model_loader {
                    model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
                    model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
                } break;
-            case PROJECTOR_TYPE_GEMMA3NV:
-                {
-                    model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
-                    model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false);
-                    model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false);
-
-                    model.msfa_ffn_expand_w  = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false);
-                    model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded
-                    model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false);
-                    model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false);
-
-                    model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false);
-
-                    // Dynamically load blocks stage by stage
-                    for (int stage = 0; stage < 4; ++stage) {
-                        int blocks_found_in_stage = 0;
-
-                        for (int blk_idx = 0; ; ++blk_idx) {
-                            bool found_block = false;
-                            mobilenetv5_block block;
-
-                            // 1. Check for Edge Residual (S0)
-                            block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false);
-                            if (block.s0_conv_exp_w) {
-                                found_block = true;
-                                block.s0_bn1_w      = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false);
-                                block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false);
-                                block.s0_bn2_w      = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false);
-                            }
-                            // 2. Check for UIR (Universal Inverted Residual)
-                            else {
-                                // Check for dw_start OR pw_exp (some UIR blocks skip dw_start)
-                                block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false);
-                                block.pw_exp_w   = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false);
-
-                                if (block.dw_start_w || block.pw_exp_w) {
-                                    found_block = true;
-                                    if (block.dw_start_w) {
-                                        block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false);
-                                    }
-                                    if (block.pw_exp_w) {
-                                        block.pw_exp_bn_w   = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false);
-                                    }
-                                    block.dw_mid_w      = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false);
-                                    if (block.dw_mid_w) {
-                                        block.dw_mid_bn_w   = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false);
-                                    }
-                                    block.pw_proj_w     = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false);
-                                    if (block.pw_proj_w) {
-                                        block.pw_proj_bn_w  = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false);
-                                    }
-                                    block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
-                                }
-                            }
-
-                            // 3. Check for Attention (MQA)
-                            // Even if UIR/Edge check failed, this might be a pure attention block
-                            ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false);
-                            if (attn_q_check) {
-                                found_block = true;
-                                block.attn_q_w = attn_q_check;
-                                block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false);
-                                block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false);
-                                block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false);
-                                block.attn_k_dw_w   = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false);
-                                block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false);
-                                block.attn_v_dw_w   = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false);
-                                block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false);
-                                block.attn_norm_w   = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false);
-                                // Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check
-                                if (!block.layer_scale_w) {
-                                    block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
-                                }
-                            }
-
-                            if (found_block) {
-                                model.mobilenet_blocks.push_back(block);
-                                blocks_found_in_stage++;
-                            } else {
-                                // End of blocks for this stage
-                                break;
-                            }
-                        }
-
-                        // Track where this stage ends in the flat vector
-                        if (blocks_found_in_stage > 0) {
-                            model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1);
-                            LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1);
-                        }
-                    }
-                    model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
-                    model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
-                } break;
            case PROJECTOR_TYPE_IDEFICS3:
                {
                    model.projection = get_tensor(TN_MM_PROJECTOR);
                } break;
            case PROJECTOR_TYPE_LFM2:
-                {
-                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
-                    model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false);
-                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
-                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
-                    model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
-                    model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
-                } break;
            case PROJECTOR_TYPE_KIMIVL:
                {
                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
@@ -1724,17 +1576,6 @@ struct clip_model_loader {
                    model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
                    model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
                } break;
-            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
-                {
-                    model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
-                    model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
-                    model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
-                    model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
-                    model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
-                    model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
-                    model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
-                    model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
-                } break;
            case PROJECTOR_TYPE_INTERNVL:
                {
                    model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -1756,8 +1597,8 @@ struct clip_model_loader {
                    model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
                    model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
                    model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
-                    model.mm_boi = get_tensor(string_format(TN_TOK_BOI));
-                    model.mm_eoi = get_tensor(string_format(TN_TOK_EOI));
+                    model.mm_boi = get_tensor(string_format(TN_TOK_BOI, "weight"));
+                    model.mm_eoi = get_tensor(string_format(TN_TOK_EOI, "weight"));
                } break;
            case PROJECTOR_TYPE_LLAMA4:
                {
@@ -2107,7 +1948,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params

    try {
        clip_model_loader loader(fname);
-        bool skip_audio = false;

        if (loader.has_vision) {
            ctx_vision = new clip_ctx(ctx_params);
@@ -2117,14 +1957,10 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
                loader.warmup(*ctx_vision);
            }

-            // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
-            // we can remove this check when we implement audio support for Gemma 3N
-            skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
-
            // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
        }

-        if (loader.has_audio && !skip_audio) {
+        if (loader.has_audio) {
            ctx_audio = new clip_ctx(ctx_params);
            loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
            loader.load_tensors(*ctx_audio);
@@ -2848,57 +2684,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                // res_imgs->data[0] = *res;
                res_imgs->entries.push_back(std::move(img_f32));
            } break;
-        case PROJECTOR_TYPE_YOUTUVL:
-            {
-                const int patch_size = params.patch_size;  // typically 16
-                const int merge_size = params.n_merge;      // typically 2
-                const int align_size = patch_size * merge_size;  // 32
-
-                const int max_num_patches = params.image_max_pixels > 0 ?
-                    params.image_max_pixels / (patch_size * patch_size) : 256;
-
-                // Linear search for optimal scale to fit within max_num_patches
-                float scale = 1.0f;
-                int target_height = original_size.height;
-                int target_width = original_size.width;
-
-                auto get_scaled_image_size = [align_size](float scale, int size) -> int {
-                    float scaled_size = size * scale;
-                    // Round up to nearest multiple of align_size
-                    int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
-                    // Ensure at least one patch
-                    return std::max(align_size, aligned);
-                };
-
-                // Linear search with 0.02 step size
-                while (scale > 0.0f) {
-                    target_height = get_scaled_image_size(scale, original_size.height);
-                    target_width = get_scaled_image_size(scale, original_size.width);
-
-                    int num_patches_h = target_height / patch_size;
-                    int num_patches_w = target_width / patch_size;
-                    int num_patches = num_patches_h * num_patches_w;
-
-                    if (num_patches > max_num_patches) {
-                        scale -= 0.02f;
-                    } else {
-                        break;
-                    }
-                }
-
-                clip_image_size new_size = {target_width, target_height};
-
-                // Resize the image
-                clip_image_u8 resized;
-                img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
-
-                // Normalize to float32
-                clip_image_f32_ptr img_f32(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
-
-                // Add to results
-                res_imgs->entries.push_back(std::move(img_f32));
-            } break;

        case PROJECTOR_TYPE_IDEFICS3:
            {
@@ -2962,16 +2747,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                res_imgs->entries.push_back(std::move(img_f32));
            } break;

-        case PROJECTOR_TYPE_GEMMA3NV:
-            {
-                clip_image_u8 resized_image;
-                int sz = params.image_size;
-                img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
-                clip_image_f32_ptr img_f32(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
-                res_imgs->entries.push_back(std::move(img_f32));
-            } break;
-
        case PROJECTOR_TYPE_JANUS_PRO:
            {
                // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
@@ -3141,7 +2916,6 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_YOUTUVL:
            return (img->nx / params.patch_size) / 2;
        default:
            break;
@@ -3157,7 +2931,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_YOUTUVL:
            return (img->ny / params.patch_size) / 2;
        default:
            break;
@@ -3218,7 +2991,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_YOUTUVL:
            {
                // dynamic size (2 conv, so double patch size)
                int x_patch = img->nx / (params.patch_size * 2);
@@ -3234,12 +3006,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                int scale_factor = ctx->model.hparams.n_merge;
                n_patches /= (scale_factor * scale_factor);
            } break;
-        case PROJECTOR_TYPE_GEMMA3NV:
-            {
-                // MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution
-                // regardless of input size (see architecture description)
-                n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size;
-            } break;
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_KIMIVL:
            {
@@ -3265,7 +3031,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_QWEN2A:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            {
                n_patches = img->nx;

@@ -3334,6 +3099,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    }

    // build the inference graph
+    ctx->debug_print_tensors.clear();
    ggml_backend_sched_reset(ctx->sched.get());
    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
@@ -3351,6 +3117,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    const int pos_w = image_size_width  / patch_size;
    const int pos_h = image_size_height / patch_size;

+    const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl

    auto get_inp_tensor = [&gf](const char * name) {
        ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
@@ -3499,11 +3266,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                set_input_i32("positions", positions);
            } break;
        case PROJECTOR_TYPE_QWEN25VL:
-        case PROJECTOR_TYPE_YOUTUVL:
            {
                // pw * ph = number of tokens output by ViT after apply patch merger
                // ipw * ipw = number of vision token been processed inside ViT
-                const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
                const int merge_ratio = 2;
                const int pw  = image_size_width  / patch_size / merge_ratio;
                const int ph  = image_size_height / patch_size / merge_ratio;
@@ -3514,7 +3279,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                std::vector<int> inv_idx(ph * pw);

                if (use_window_attn) {
-                    const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
+                    const int attn_window_size = 112;
                    const int grid_window = attn_window_size / patch_size / merge_ratio;
                    int dst = 0;
                    // [num_vision_tokens, num_vision_tokens] attention mask tensor
@@ -3631,7 +3396,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                set_input_i32("patches", patches);
            } break;
        case PROJECTOR_TYPE_GEMMA3:
-        case PROJECTOR_TYPE_GEMMA3NV:
        case PROJECTOR_TYPE_IDEFICS3:
        case PROJECTOR_TYPE_INTERNVL:
        case PROJECTOR_TYPE_QWEN2A:
@@ -3639,7 +3403,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_VOXTRAL:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
        case PROJECTOR_TYPE_JANUS_PRO:
        case PROJECTOR_TYPE_COGVLM:
            {
@@ -3703,6 +3466,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        return false;
    }

+    // print debug nodes
+    if (ctx->debug_graph) {
+        LOG_INF("\n\n---\n\n");
+        LOG_INF("\n\nDebug graph:\n\n");
+        for (ggml_tensor * t : ctx->debug_print_tensors) {
+            std::vector<uint8_t> data(ggml_nbytes(t));
+            ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
+            print_tensor_shape(t);
+            print_tensor_data(t, data.data(), 3);
+        }
+    }
+
    // the last node is the embedding tensor
    ggml_tensor * embeddings = ggml_graph_node(gf, -1);

@@ -3741,19 +3516,16 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_JANUS_PRO:
-        case PROJECTOR_TYPE_YOUTUVL:
            return ctx->model.mm_1_b->ne[0];
        case PROJECTOR_TYPE_QWEN3VL:
            // main path + deepstack paths
            return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
        case PROJECTOR_TYPE_GEMMA3:
-        case PROJECTOR_TYPE_GEMMA3NV:
            return ctx->model.mm_input_proj_w->ne[0];
        case PROJECTOR_TYPE_IDEFICS3:
            return ctx->model.projection->ne[1];
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_VOXTRAL:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            return ctx->model.mm_2_w->ne[1];
        case PROJECTOR_TYPE_INTERNVL:
            return ctx->model.mm_3_w->ne[1];
@@ -3778,7 +3550,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
 }

 int clip_is_minicpmv(const struct clip_ctx * ctx) {
-    // TODO: remove this function
    if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
        return ctx->model.hparams.minicpmv_version;
    }
@@ -3786,14 +3557,24 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
 }

 bool clip_is_glm(const struct clip_ctx * ctx) {
-    // TODO: remove this function
    return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
 }

+bool clip_is_mrope(const struct clip_ctx * ctx) {
+    return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
+        || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
+        || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
+        || ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
+}
+
 bool clip_is_llava(const struct clip_ctx * ctx) {
    return ctx->model.hparams.has_llava_projector;
 }

+bool clip_is_gemma3(const struct clip_ctx * ctx) {
+    return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
+}
+
 bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
    return ctx->model.modality == CLIP_MODALITY_VISION;
 }
@@ -3803,16 +3584,10 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
 }

 bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
-    switch (ctx->proj_type()) {
-        case PROJECTOR_TYPE_ULTRAVOX:
-        case PROJECTOR_TYPE_QWEN2A:
-        case PROJECTOR_TYPE_GLMA:
-        case PROJECTOR_TYPE_VOXTRAL:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
-            return true;
-        default:
-            return false;
-    }
+    return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
+        || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
+        || ctx->proj_type() == PROJECTOR_TYPE_GLMA
+        || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
 }

 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
@@ -3854,6 +3629,7 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
 //
 // API for debugging
 //
+
 void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
    clip_image_f32 img;
    img.nx = w;
@@ -3862,6 +3638,9 @@ void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
    for (int i = 0; i < h * w * 3; i++) {
        img.buf[i] = static_cast<float>(fill_value);
    }
+    bool cur_debug_graph = ctx->debug_graph;
+    ctx->debug_graph = true;
    clip_image_encode(ctx, 1, &img, nullptr);
+    ctx->debug_graph = cur_debug_graph;
    GGML_ASSERT(img.buf.empty() && "expected, always stop here");
 }
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -1,7 +1,6 @@
 #pragma once

 #include "ggml.h"
-#include "mtmd.h"

 #include <stddef.h>
 #include <stdint.h>
@@ -38,8 +37,6 @@ struct clip_context_params {
    int image_min_tokens;
    int image_max_tokens;
    bool warmup;
-    ggml_backend_sched_eval_callback cb_eval;
-    void * cb_eval_user_data;
 };

 struct clip_init_result {
@@ -107,9 +104,9 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct

 int clip_is_minicpmv(const struct clip_ctx * ctx);
 bool clip_is_glm(const struct clip_ctx * ctx);
+bool clip_is_mrope(const struct clip_ctx * ctx);
 bool clip_is_llava(const struct clip_ctx * ctx);
-// note for contributor: this clip_is_(model) pattern is deprecated
-//                       do NOT add new functions like this
+bool clip_is_gemma3(const struct clip_ctx * ctx);

 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

--- a/tools/mtmd/models/mobilenetv5.cpp
+++ b/tools/mtmd/models/mobilenetv5.cpp
@@ -1,451 +0,0 @@
-#include "models.h"
-
-// Helpers for MobileNetV5 Blocks
-// RMS Norm 2D - normalizes over channels for each spatial position
-ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) {
-    // inp: [W, H, C, B]
-
-    ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3);
-    cur = ggml_cont(ctx0, cur);
-    cur = ggml_rms_norm(ctx0, cur, eps);
-
-    if (weight) {
-        cur = ggml_mul(ctx0, cur, weight);
-    }
-
-    cur = ggml_permute(ctx0, cur, 2, 1, 0, 3);
-    cur = ggml_cont(ctx0, cur);
-
-    return cur;
-}
-
-// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF
-ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) {
-    const int64_t ih = inp->ne[1];  // height
-    const int64_t iw = inp->ne[0];  // width
-
-    // Calculate output size (ceil division)
-    const int64_t oh = (ih + stride_h - 1) / stride_h;
-    const int64_t ow = (iw + stride_w - 1) / stride_w;
-
-    // Calculate padding needed
-    const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih);
-    const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw);
-
-    // Split padding asymmetrically
-    const int pad_h_top = pad_h / 2;
-    const int pad_h_bottom = pad_h - pad_h_top;
-    const int pad_w_left = pad_w / 2;
-    const int pad_w_right = pad_w - pad_w_left;
-
-    // Apply padding if needed
-    // ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3)
-    // For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch
-    if (pad_h > 0 || pad_w > 0) {
-        inp = ggml_pad_ext(ctx0, inp,
-            pad_w_left, pad_w_right,     // width padding (dim 0)
-            pad_h_top, pad_h_bottom,      // height padding (dim 1)
-            0, 0,                         // no channel padding (dim 2)
-            0, 0);                        // no batch padding (dim 3)
-    }
-
-    return inp;
-}
-
-
-// Edge Residual Block (Stage 0)
-ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
-    ggml_tensor * cur = inp;
-
-    // 1. Expansion Conv (3x3)
-    if (stride == 2) {
-        // Case: Downsampling (Block 0)
-        // Replicates Conv2dSame(kernel=3, stride=2)
-        cur = pad_same_2d(cur, 3, 3, stride, stride);
-        cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1);
-    } else {
-        // Case: Normal 3x3 Block (Block 1, 2)
-        // Replicates Conv2d(kernel=3, stride=1, padding=1)
-        cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1);
-    }
-
-    // BN + Activation
-    if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w);
-    cur = ggml_gelu(ctx0, cur);
-
-    // 2. Pointwise Linear Conv (1x1)
-    // 1x1 Convs usually have padding=0 and stride=1
-    cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1);
-    if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w);
-
-    // 3. Residual Connection
-    // Only apply residual if spatial dimensions and channels match (stride 1)
-    if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) {
-        cur = ggml_add(ctx0, cur, inp);
-    }
-
-    return cur;
-}
-
-// Universal Inverted Residual Block (Stage 1+)
-ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
-    ggml_tensor * cur = inp;
-
-    // 1. Depthwise Start (Optional)
-    // NOTE: dw_start always has stride=1 (no downsampling here)
-    if (block.dw_start_w) {
-        int k = block.dw_start_w->ne[0]; // 3 or 5
-        int p = k / 2;
-        cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1);
-        if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w);
-    }
-
-    // 2. Pointwise Expansion (1x1)
-    if (block.pw_exp_w) {
-        // Standard 1x1 conv, pad=0, stride=1
-        cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1);
-        if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w);
-        cur = ggml_gelu(ctx0, cur);
-    }
-
-    // 3. Depthwise Mid (Optional)
-    // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage)
-    if (block.dw_mid_w) {
-        int k = block.dw_mid_w->ne[0]; // 3 or 5
-
-        if (stride > 1) {
-            // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding
-            cur = pad_same_2d(cur, k, k, stride, stride);
-            cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0
-        } else {
-            // Case: Stride 1 -> Use Standard Symmetric Padding
-            int p = k / 2;
-            cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1);
-        }
-
-        if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w);
-        cur = ggml_gelu(ctx0, cur);
-    }
-
-    // 4. Pointwise Projection (1x1)
-    if (block.pw_proj_w) {
-        cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1);
-        if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w);
-    }
-
-    // Apply Layer Scaling if present
-    if (block.layer_scale_w) {
-        cur = ggml_mul(ctx0, cur, block.layer_scale_w);
-    }
-
-    // 5. Residual Connection
-    bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]);
-    bool same_channel = (inp->ne[2] == cur->ne[2]);
-    if (same_spatial && same_channel) {
-        cur = ggml_add(ctx0, cur, inp);
-    }
-
-    return cur;
-}
-
-// Attention Block (MQA)
-ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) {
-    ggml_tensor * cur = inp;
-
-    // Norm
-    if (block.attn_norm_w) {
-        cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f);
-    }
-
-    // 1. Q Calculation
-    ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1);
-
-    // 2. K Calculation (Downsampled)
-    // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
-    ggml_tensor * k_inp = cur;
-    if (block.attn_k_dw_w) {
-        int k_size = block.attn_k_dw_w->ne[0];  // Usually 3
-        k_inp = pad_same_2d(cur, k_size, k_size, 2, 2);  // Apply SAME padding
-        k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1);  // padding=0
-        if (block.attn_k_norm_w) {
-            k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f);
-        }
-    }
-    ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1);
-
-    // 3. V Calculation (Downsampled)
-    // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
-    ggml_tensor * v_inp = cur;
-    if (block.attn_v_dw_w) {
-        int v_size = block.attn_v_dw_w->ne[0];  // Usually 3
-        v_inp = pad_same_2d(cur, v_size, v_size, 2, 2);  // Apply SAME padding
-        v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1);  // padding=0
-        if (block.attn_v_norm_w) {
-            v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f);
-        }
-    }
-    ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1);
-
-    const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3];
-    const int D = k->ne[2]; // Head dimension
-    const int n_head = q->ne[2] / D;
-    const int N = W * H;
-
-    // Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B]
-    q = ggml_reshape_3d(ctx0, q, N, D*n_head, B);
-    q = ggml_reshape_4d(ctx0, q, N, D, n_head, B);
-    q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B]
-    q = ggml_cont(ctx0, q);
-
-    const int Wk = k->ne[0]; const int Hk = k->ne[1];
-    const int M = Wk * Hk;
-
-    // Process K: [Wk, Hk, D, B] -> [D, M, 1, B]
-    k = ggml_reshape_3d(ctx0, k, M, D, B);
-    k = ggml_reshape_4d(ctx0, k, M, D, 1, B);
-    k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B]
-    k = ggml_cont(ctx0, k);
-
-    // Process V: [Wk, Hk, D, B] -> [M, D, 1, B]
-    v = ggml_reshape_3d(ctx0, v, M, D, B);
-    v = ggml_reshape_4d(ctx0, v, M, D, 1, B);
-    v = ggml_cont(ctx0, v); // [M, D, 1, B]
-
-    // Multi-Query Attention
-    float scale = 1.0f / sqrtf((float)D);
-
-    // Step 1: Compute Q @ K.T
-    ggml_tensor * scores = ggml_mul_mat(ctx0, k, q);
-
-    scores = ggml_scale(ctx0, scores, scale);
-
-    scores = ggml_soft_max(ctx0, scores);
-
-    ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores);
-
-    kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3);
-    kqv = ggml_cont(ctx0, kqv);
-
-
-    kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B);
-    kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B);
-    kqv = ggml_cont(ctx0, kqv);
-
-    // Output projection
-    cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1);
-
-    // Residual & Layer Scale
-    if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) {
-        if (block.layer_scale_w) {
-            cur = ggml_mul(ctx0, cur, block.layer_scale_w);
-        }
-        cur = ggml_add(ctx0, cur, inp);
-    }
-
-    return cur;
-}
-
-ggml_cgraph * clip_graph_mobilenetv5::build() {
-    ggml_tensor * inp = build_inp_raw();
-
-    // 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2))
-    ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2);  // Apply SAME padding
-
-    cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1);  // padding=0
-    if (model.mobilenet_stem_conv_b) {
-        cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b);
-    }
-    if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w);
-    cur = ggml_gelu(ctx0, cur);
-
-
-    // 2. Blocks
-    std::vector<ggml_tensor*> intermediate_features;
-    const int total_blocks = model.mobilenet_blocks.size();
-
-    auto is_stage_start = [&](int i) {
-        if (i == 0) return true;
-        for (int end_idx : model.mobilenet_stage_ends) {
-            if (i == end_idx + 1) return true;
-        }
-        return false;
-    };
-
-    auto is_fusion_point = [&](int i) {
-        if (model.mobilenet_stage_ends.size() >= 4) {
-                if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2
-                if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3
-        } else {
-            if (i == total_blocks - 1) return true;
-        }
-        return false;
-    };
-
-    for (int i = 0; i < total_blocks; i++) {
-        const auto & block = model.mobilenet_blocks[i];
-        int stride = is_stage_start(i) ? 2 : 1;
-
-        if (block.s0_conv_exp_w)      cur = build_edge_residual(cur, block, stride);
-        else if (block.attn_q_w)      cur = build_mobilenet_attn(cur, block);
-        else                          cur = build_inverted_residual(cur, block, stride);
-
-        if (is_fusion_point(i)) {
-
-            intermediate_features.push_back(cur);
-        }
-    }
-
-    // 3. Multi-Scale Fusion Adapter (MSFA)
-    if (!intermediate_features.empty()) {
-
-        // A. Reference Resolution: PyTorch implementation uses inputs[0]
-        // We assume intermediate_features[0] is the "High Resolution" target.
-        // In MobileNet designs, this is typically the feature map with the smallest stride (e.g. 32x32).
-        ggml_tensor* target_feat = intermediate_features[0];
-        int high_res_w = target_feat->ne[0];
-        int high_res_h = target_feat->ne[1];
-
-        std::vector<ggml_tensor*> resized_feats;
-
-        // B. Resize inputs to match inputs[0] (High Resolution)
-        for (auto feat : intermediate_features) {
-            int feat_w = feat->ne[0];
-            int feat_h = feat->ne[1];
-
-            // PyTorch: if feat_size < high_resolution: interpolate
-            if (feat_w < high_res_w || feat_h < high_res_h) {
-                // Calculate scale factor.
-                // Note: PyTorch 'nearest' works on arbitrary float scales.
-                // ggml_upscale generally takes integer factors or target sizes depending on helper.
-                // Assuming standard power-of-2 scaling (e.g. 16 -> 32 means scale=2).
-                int scale_w = high_res_w / feat_w;
-                // int scale_h = high_res_h / feat_h;
-
-                // Safety check for non-integer scaling if strictly replicating
-                GGML_ASSERT(high_res_w % feat_w == 0);
-
-                // Upsample (Nearest Neighbor)
-                // 2 is the scale factor
-                feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);
-            }
-            resized_feats.push_back(feat);
-        }
-
-        // C. Concatenate at High Resolution (Channel Dim = 2 in ggml)
-        cur = resized_feats[0];
-        for (size_t k = 1; k < resized_feats.size(); ++k) {
-            cur = ggml_concat(ctx0, cur, resized_feats[k], 2);
-        }
-
-        // D. FFN (UniversalInvertedResidual)
-        // Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm
-
-        // 1. Expansion
-        if (model.msfa_ffn_expand_w) {
-            // 1x1 Conv
-            cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1);
-
-            if (model.msfa_ffn_expand_bn) {
-                cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn);
-            }
-
-            cur = ggml_gelu(ctx0, cur);
-
-        }
-
-        // 2. Projection (No DW because kernel_size=0)
-        if (model.msfa_ffn_project_w) {
-            // 1x1 Conv
-            cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1);
-
-            // UniversalInvertedResidual typically has a norm after projection
-            if (model.msfa_ffn_project_bn) {
-                cur = rms_norm_2d(cur, model.msfa_ffn_project_bn);
-            }
-
-        }
-
-        // E. Final Downsample to Target Resolution (Output Resolution)
-        // PyTorch: matches self.output_resolution (e.g. 16x16)
-        const int target_out_res = 16;
-        int current_w = cur->ne[0];
-
-        if (current_w > target_out_res) {
-            int s = current_w / target_out_res;
-
-            GGML_ASSERT(current_w % target_out_res == 0);
-
-            // Avg Pool: Kernel=s, Stride=s
-            cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0);
-
-        }
-
-        // F. Final Norm
-        if (model.msfa_concat_norm_w) {
-            cur = rms_norm_2d(cur, model.msfa_concat_norm_w);
-
-        }
-    }
-
-    // 4. Gemma 3n Multimodal Projection (Embedder)
-    // Input: 'cur' is [Width, Height, Channels, Batch]
-    int W = cur->ne[0];
-    int H = cur->ne[1];
-    int C = cur->ne[2];
-    int B = cur->ne[3];
-
-    GGML_ASSERT(C == hparams.n_embd);
-
-    // 1. Permute and Flatten to [Channels, Tokens, Batch]
-    // PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch)
-    cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B]
-    cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B]
-    cur = ggml_cont(ctx0, cur);
-    cur = ggml_reshape_3d(ctx0, cur, C, W*H, B);
-    cur = ggml_cont(ctx0, cur);
-
-
-    // 2. FEATURE SCALING
-    // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5
-    const float scale_factor = sqrtf((float)C);
-    cur = ggml_scale(ctx0, cur, scale_factor);
-
-
-    // 3. SOFT EMBEDDING NORM
-    // PyTorch: self._norm(x) * self.weight
-    // We must normalize regardless, then multiply if weight exists.
-    {
-        const float eps = 1e-6f; // Gemma3n uses 1e-6
-        cur = ggml_rms_norm(ctx0, cur, eps);
-
-        if (model.mm_soft_emb_norm_w) {
-            // Weight shape is (2048,) -> Element-wise broadcast multiply
-            cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
-        }
-
-    }
-
-    // 4. PROJECTION
-    // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False)
-    // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size]
-    if (model.mm_input_proj_w) {
-        cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
-    }
-
-    // 5. POST PROJECTION NORM
-    // PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False)
-    // with_scale=False means weight is registered as buffer with value 1.0
-    // So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1
-    {
-        const float eps = 1e-6f;
-        cur = ggml_rms_norm(ctx0, cur, eps);
-
-        if (model.mm_post_proj_norm_w) {
-            // If weight is loaded, multiply (should be ~1.0 anyway)
-            cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w);
-        }
-    }
-
-    ggml_build_forward_expand(gf, cur);
-    return gf;
-}
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -2,11 +2,6 @@

 #include "../clip-graph.h"

-/*
- * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
- * We encourage human contributors to ensure the quality and reliability of the codebase.
- */
-
 struct clip_graph_siglip : clip_graph {
    clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
@@ -27,11 +22,6 @@ struct clip_graph_qwen3vl : clip_graph {
    ggml_cgraph * build() override;
 };

-struct clip_graph_youtuvl : clip_graph {
-    clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
-    ggml_cgraph * build() override;
-};
-
 struct clip_graph_minicpmv : clip_graph {
    clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
@@ -76,36 +66,3 @@ struct clip_graph_glm4v : clip_graph {
    clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
 };
-
-struct clip_graph_mobilenetv5 : clip_graph {
-    clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
-    ggml_cgraph * build() override;
-
-    ggml_tensor * rms_norm_2d(
-        ggml_tensor * inp,
-        ggml_tensor * weight,
-        float eps = 1e-6f);
-
-    ggml_tensor* pad_same_2d(
-        ggml_tensor* inp,
-        int kernel_h,
-        int kernel_w,
-        int stride_h,
-        int stride_w,
-        int dilation_h = 1,
-        int dilation_w = 1);
-
-    ggml_tensor * build_edge_residual(
-        ggml_tensor * inp,
-        const mobilenetv5_block & block,
-        int stride);
-
-    ggml_tensor * build_inverted_residual(
-        ggml_tensor * inp,
-        const mobilenetv5_block & block,
-        int stride);
-
-    ggml_tensor * build_mobilenet_attn(
-        ggml_tensor * inp,
-        const mobilenetv5_block & block);
-};
--- a/tools/mtmd/models/siglip.cpp
+++ b/tools/mtmd/models/siglip.cpp
@@ -50,15 +50,10 @@ ggml_cgraph * clip_graph_siglip::build() {
        const int scale_factor = model.hparams.n_merge;
        cur = build_patch_merge_permute(cur, scale_factor);

-        // projection, in LFM2-VL input norm is optional
-        if (model.mm_input_norm_w) {
-            cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
-            cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
-        }
-
-        if (model.mm_input_norm_b) {
-            cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
-        }
+        // projection
+        cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
+        cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+        cur = ggml_add(ctx0, cur, model.mm_input_norm_b);

        cur = build_ffn(cur,
            model.mm_1_w, model.mm_1_b,
--- a/tools/mtmd/models/whisper-enc.cpp
+++ b/tools/mtmd/models/whisper-enc.cpp
@@ -86,15 +86,6 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
            FFN_GELU_ERF,
            -1);

-    } else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
-        // projector
-        cur = build_ffn(cur,
-            model.mm_1_w, model.mm_1_b,
-            nullptr, nullptr,
-            model.mm_2_w, model.mm_2_b,
-            FFN_GELU_ERF,
-            -1);
-
    } else if (proj_type == PROJECTOR_TYPE_GLMA) {
            cur = ggml_norm(ctx0, cur, hparams.eps);
            cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
--- a/tools/mtmd/models/youtuvl.cpp
+++ b/tools/mtmd/models/youtuvl.cpp
@@ -1,179 +0,0 @@
-#include "models.h"
-
-ggml_cgraph * clip_graph_youtuvl::build() {
-    GGML_ASSERT(model.class_embedding == nullptr);
-    const int batch_size       = 1;
-    const bool use_window_attn = !hparams.wa_layer_indexes.empty();
-    const int n_pos            = n_patches;
-    const int num_position_ids = n_pos * 4;
-    const int m = 2;
-    const int Wp = n_patches_x;
-    const int Hp = n_patches_y;
-    const int Hm = Hp / m;
-    const int Wm = Wp / m;
-    norm_type norm_t = NORM_TYPE_NORMAL;
-
-    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
-
-    ggml_tensor * inp = build_inp_raw();
-
-    // change conv3d to linear
-    // reshape and permute to get patches, permute from (patch_size, m, Wm, patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm)
-    {
-        inp = ggml_reshape_4d(
-            ctx0, inp,
-            Wm * m * patch_size, m * patch_size, Hm, 3);
-        inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
-        inp = ggml_cont_4d(
-            ctx0, inp,
-            m * patch_size * 3, Wm, m * patch_size, Hm);
-
-        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
-        inp = ggml_cont_4d(
-            ctx0, inp,
-            m * patch_size * 3, patch_size, m, Hm * Wm);
-
-        inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
-        inp = ggml_cont_4d(
-            ctx0, inp,
-            patch_size, 3, patch_size, Hm * Wm * m * m);
-
-        inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
-        inp = ggml_cont_3d(
-            ctx0, inp,
-            3*patch_size* patch_size,  Hm * Wm * m * m, 1);
-    }
-    inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
-
-    if (model.patch_bias) {
-        inp = ggml_add(ctx0, inp, model.patch_bias);
-    }
-
-    inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
-
-    ggml_tensor * inpL           = inp;
-    ggml_tensor * window_mask    = nullptr;
-    ggml_tensor * window_idx     = nullptr;
-    ggml_tensor * inv_window_idx = nullptr;
-
-    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
-    ggml_set_name(positions, "positions");
-    ggml_set_input(positions);
-
-    // pre-layernorm
-    if (model.pre_ln_w) {
-        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
-    }
-    if (use_window_attn) {
-        inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
-        ggml_set_name(inv_window_idx, "inv_window_idx");
-        ggml_set_input(inv_window_idx);
-        // mask for window attention
-        window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
-        ggml_set_name(window_mask, "window_mask");
-        ggml_set_input(window_mask);
-
-        // if flash attn is used, we need to pad the mask and cast to f16
-        if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
-            window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
-        }
-
-        // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
-        GGML_ASSERT(batch_size == 1);
-        inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
-        inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
-        inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
-    }
-
-    // loop over layers
-    for (int il = 0; il < n_layer; il++) {
-        const auto & layer = model.layers[il];
-        const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true;
-
-        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
-
-        // layernorm1
-        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
-        // self-attention
-        {
-            ggml_tensor * Qcur = ggml_add(ctx0,
-                ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
-            ggml_tensor * Kcur = ggml_add(ctx0,
-                ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
-            ggml_tensor * Vcur = ggml_add(ctx0,
-                ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
-
-            Qcur = ggml_rope_multi(
-                ctx0, Qcur, positions, nullptr,
-                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
-            Kcur = ggml_rope_multi(
-                ctx0, Kcur, positions, nullptr,
-                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
-
-            ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
-
-            cur = build_attn(layer.o_w, layer.o_b,
-                Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
-        }
-        // re-add the layer input, e.g., residual
-        cur = ggml_add(ctx0, cur, inpL);
-
-        inpL = cur; // inpL = residual, cur = hidden_states
-
-        // layernorm2
-        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
-
-        // ffn
-        cur = build_ffn(cur,
-            layer.ff_up_w, layer.ff_up_b,
-            nullptr, nullptr,
-            layer.ff_down_w, layer.ff_down_b,
-            hparams.ffn_op, il);
-
-        // residual 2
-        cur = ggml_add(ctx0, inpL, cur);
-
-        inpL = cur;
-    }
-
-    ggml_tensor * embeddings = inpL;
-    if (use_window_attn) {
-        const int spatial_merge_unit = 4;
-        window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit);
-        ggml_set_name(window_idx, "window_idx");
-        ggml_set_input(window_idx);
-        GGML_ASSERT(batch_size == 1);
-        embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit);
-        embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
-        embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size);
-        cb(embeddings, "window_order_restored", -1);
-    }
-
-    // post-layernorm (part of Siglip2VisionTransformer, applied after encoder)
-    if (model.post_ln_w) {
-        embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
-    }
-
-    // Now apply merger (VLPatchMerger):
-    // 1. Apply RMS norm (ln_q in VLPatchMerger)
-    embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
-    cb(embeddings, "merger_normed", -1);
-
-    // 2. First reshape for spatial merge (merge 2x2 patches)
-    embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
-    cb(embeddings, "merger_reshaped", -1);
-
-    embeddings = build_ffn(embeddings,
-                    model.mm_0_w, model.mm_0_b,
-                    nullptr, nullptr,
-                    model.mm_1_w, model.mm_1_b,
-                    FFN_GELU,
-                    -1);
-    ggml_build_forward_expand(gf, embeddings);
-
-    return gf;
-}
--- a/tools/mtmd/mtmd-audio.cpp
+++ b/tools/mtmd/mtmd-audio.cpp
@@ -9,250 +9,207 @@
 #include <fstream>
 #include <algorithm>

-// some of the code here is copied from whisper.cpp
+// most of the code here is copied from whisper.cpp

 constexpr bool DEBUG = false;

-void mtmd_audio_cache::fill_sin_cos_table(int n) {
-    sin_vals.resize(n);
-    cos_vals.resize(n);
-    for (int i = 0; i < n; i++) {
-        double theta = (2 * M_PI * i) / n;
-        sin_vals[i]  = sinf(theta);
-        cos_vals[i]  = cosf(theta);
-    }
-}
+struct mtmd_audio_mel_filters {
+    int32_t n_mel;
+    int32_t n_fft;

-void mtmd_audio_cache::fill_hann_window(int length, bool periodic) {
-    hann_window.resize(length);
-    int offset = -1;
-    if (periodic) {
-        offset = 0;
-    }
-    for (int i = 0; i < length; i++) {
-        hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
-    }
-}
+    std::vector<float> data;
+};

-void mtmd_audio_cache::fill_mel_filterbank_matrix(int   n_mel,
-                                                  int   n_fft,
-                                                  int   sample_rate,
-                                                  float fmin,
-                                                  float fmax,
-                                                  bool  slaney_area_norm,
-                                                  float scale) {
-    GGML_ASSERT(n_mel > 0 && n_fft > 1);
-    if (fmax <= 0.0f) {
-        fmax = 0.5f * sample_rate;
-    }
+// note: this global cache is shared among all preprocessors
+//       if we want to use multiple preprocessors at the same time,
+//       we will need to enclose it in the preprocessor class in the future
+static struct mtmd_audio_global_cache {
+    // precomputed sin/cos table for FFT
+    std::vector<float> sin_vals;
+    std::vector<float> cos_vals;

-    // Slaney scale (matches librosa default)
-    const double min_log_hz  = 1000.0;
-    const double lin_slope   = 3 / 200.;
-    const double min_log_mel = min_log_hz * lin_slope;
-    const double log_step    = log(6.4) / 27.0;
-    auto         hz_to_mel   = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
-        return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
-    };
-    auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
-        return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
-    };
+    // hann window
+    std::vector<float> hann_window;

-    // infer N_fft from n_fft_bins
-    const double bin_hz_step = double(sample_rate) / double(n_fft);
+    // mel filter bank
+    mtmd_audio_mel_filters filters;

-    // mel grid: n_mel + 2 edges
-    const double        m_lo = hz_to_mel(fmin);
-    const double        m_hi = hz_to_mel(fmax);
-    std::vector<double> mel_pts(n_mel + 2);
-    for (int i = 0; i < n_mel + 2; ++i) {
-        mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1));
-    }
-
-    // convert to Hz
-    std::vector<double> hz_pts(n_mel + 2);
-    for (int i = 0; i < n_mel + 2; ++i) {
-        hz_pts[i] = mel_to_hz(mel_pts[i]);
-    }
-
-    const int n_fft_bins = n_fft / 2 + 1;
-
-    // filterbank
-    std::vector<float> out(n_mel * n_fft_bins, 0);
-    for (int m = 0; m < n_mel; ++m) {
-        const double f_left   = hz_pts[m];
-        const double f_center = hz_pts[m + 1];
-        const double f_right  = hz_pts[m + 2];
-
-        const double denom_l = std::max(1e-30, f_center - f_left);
-        const double denom_r = std::max(1e-30, f_right - f_center);
-        const double enorm   = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
-
-        for (int k = 0; k < n_fft_bins; ++k) {
-            const double f = k * bin_hz_step;
-            double       w = 0.0;
-            if (f >= f_left && f <= f_center) {
-                w = (f - f_left) / denom_l;
-            } else if (f > f_center && f <= f_right) {
-                w = (f_right - f) / denom_r;
-            }
-            out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
+    void fill_sin_cos_table(int n) {
+        sin_vals.resize(n);
+        cos_vals.resize(n);
+        for (int i = 0; i < n; i++) {
+            double theta = (2 * M_PI * i) / n;
+            sin_vals[i] = sinf(theta);
+            cos_vals[i] = cosf(theta);
        }
    }

-    filters.n_mel = n_mel;
-    filters.n_fft = n_fft;
-    filters.data  = std::move(out);
+    void fill_hann_window(int length, bool periodic) {
+        hann_window.resize(length);
+        int offset = -1;
+        if (periodic) {
+            offset = 0;
+        }
+        for (int i = 0; i < length; i++) {
+            hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
+        }
+    }

-    if (DEBUG) {  // debug
-        for (size_t i = 0; i < filters.data.size(); ++i) {
-            if (filters.data[i] != 0.0f) {
-                printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f);
+    // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
+    // n_fft_bins is derived as (n_fft / 2 + 1). Example: if n_fft=512 -> n_fft_bins=257.
+    void fill_mel_filterbank_matrix(
+        int n_mel,
+        int n_fft,
+        int sample_rate,            // e.g. 16000
+        float fmin = 0.0f,          // e.g. 0.0
+        float fmax = -1.0f,         // e.g. sr/2; pass -1 for auto
+        bool slaney_area_norm = true,
+        float scale = 1.0f          // optional extra scaling applied to every filter weight (e.g. 1.0f/1000.0f)
+    ) {
+        GGML_ASSERT(n_mel > 0 && n_fft > 1);
+        if (fmax <= 0.0f) {
+            fmax = 0.5f * sample_rate;
+        }
+
+        // Slaney scale (matches librosa default)
+        const double min_log_hz = 1000.0;
+        const double lin_slope = 3 / 200.;
+        const double min_log_mel = min_log_hz * lin_slope;
+        const double log_step = log(6.4) / 27.0;
+        auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
+            return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
+        };
+        auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
+            return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
+        };
+
+        // frequency width of one FFT bin, in Hz
+        const double bin_hz_step = double(sample_rate) / double(n_fft);
+
+        // mel grid: n_mel + 2 edges
+        const double m_lo = hz_to_mel(fmin);
+        const double m_hi = hz_to_mel(fmax);
+        std::vector<double> mel_pts(n_mel + 2);
+        for (int i = 0; i < n_mel + 2; ++i) {
+            mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1));
+        }
+
+        // convert to Hz
+        std::vector<double> hz_pts(n_mel + 2);
+        for (int i = 0; i < n_mel + 2; ++i) {
+            hz_pts[i] = mel_to_hz(mel_pts[i]);
+        }
+
+        const int n_fft_bins = n_fft / 2 + 1;
+
+        // filterbank
+        std::vector<float> out(n_mel * n_fft_bins, 0);
+        for (int m = 0; m < n_mel; ++m) {
+            const double f_left   = hz_pts[m];
+            const double f_center = hz_pts[m + 1];
+            const double f_right  = hz_pts[m + 2];
+
+            const double denom_l = std::max(1e-30, f_center - f_left);
+            const double denom_r = std::max(1e-30, f_right  - f_center);
+            const double enorm   = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
+
+            for (int k = 0; k < n_fft_bins; ++k) {
+                const double f = k * bin_hz_step;
+                double w = 0.0;
+                if (f >= f_left && f <= f_center) {
+                    w = (f - f_left) / denom_l;
+                } else if (f > f_center && f <= f_right) {
+                    w = (f_right - f) / denom_r;
+                }
+                out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
+            }
+        }
+
+        filters.n_mel = n_mel;
+        filters.n_fft = n_fft;
+        filters.data  = std::move(out);
+
+        if (DEBUG) { // debug
+            for (size_t i = 0; i < filters.data.size(); ++i) {
+                if (filters.data[i] != 0.0f) {
+                    printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f);
+                }
            }
        }
    }
-}
+} g_cache;

-// Unified DFT implementation for both forward and inverse transforms
-// Template parameters:
-//   Inverse: false = DFT with exp(-2πi·k·n/N), no scaling
-//            true  = IDFT with exp(+2πi·k·n/N), scales by 1/N
-//   RealInput: true = input is real-valued (stride 1), avoids imaginary computations
-//              false = input is complex-valued (interleaved real/imag, stride 2)
-template <bool Inverse, bool RealInput>
-static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, float * out) {
-    const int n_sin_cos_vals = cache.sin_vals.size();
-    const int sin_cos_step   = n_sin_cos_vals / N;
-
-    constexpr float sign  = Inverse ? 1.0f : -1.0f;
-    const float     scale = Inverse ? (1.0f / N) : 1.0f;
+// naive Discrete Fourier Transform
+// input is real-valued
+// output is complex-valued
+static void dft(const float * in, int N, float * out) {
+    const int n_sin_cos_vals = g_cache.sin_vals.size();
+    const int sin_cos_step = n_sin_cos_vals / N;

    for (int k = 0; k < N; k++) {
        float re = 0;
        float im = 0;

        for (int n = 0; n < N; n++) {
-            int   idx     = (k * n * sin_cos_step) % n_sin_cos_vals;
-            float cos_val = cache.cos_vals[idx];
-            float sin_val = cache.sin_vals[idx];
-
-            if constexpr (RealInput) {
-                // Real input: in_im = 0, simplifies to:
-                // re += in_re * cos_val
-                // im += sign * in_re * sin_val
-                float in_re = in[n];
-                re += in_re * cos_val;
-                im += sign * in_re * sin_val;
-            } else {
-                float in_re = in[n * 2 + 0];
-                float in_im = in[n * 2 + 1];
-                // (a + bi) * (cos + sign*i*sin) = (a*cos - sign*b*sin) + (sign*a*sin + b*cos)i
-                re += in_re * cos_val - sign * in_im * sin_val;
-                im += sign * in_re * sin_val + in_im * cos_val;
-            }
+            int idx = (k * n * sin_cos_step) % (n_sin_cos_vals); // t = 2*M_PI*k*n/N
+            re += in[n] * g_cache.cos_vals[idx]; // cos(t)
+            im -= in[n] * g_cache.sin_vals[idx]; // sin(t)
        }

-        out[k * 2 + 0] = re * scale;
-        out[k * 2 + 1] = im * scale;
+        out[k*2 + 0] = re;
+        out[k*2 + 1] = im;
    }
 }

-// Cooley-Tukey FFT/IFFT unified implementation
-// Template parameters:
-//   Inverse: false = FFT with exp(-2πi·k/N), no scaling
-//            true  = IFFT with exp(+2πi·k/N), scales by 0.5 at each level
-//   RealInput: true = input is real-valued (stride 1)
-//              false = input is complex-valued (interleaved real/imag, stride 2)
-template <bool Inverse, bool RealInput>
-static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) {
-    const int n_sin_cos_vals = cache.sin_vals.size();
-
+// Cooley-Tukey FFT
+// poor man's implementation - use something better
+// input is real-valued
+// output is complex-valued
+static void fft(float * in, int N, float * out) {
+    const int n_sin_cos_vals = g_cache.sin_vals.size();
    if (N == 1) {
        out[0] = in[0];
-        if constexpr (RealInput) {
-            out[1] = 0.0f;
-        } else {
-            out[1] = in[1];
-        }
+        out[1] = 0;
        return;
    }

    const int half_N = N / 2;
-    if (N - half_N * 2 == 1) {
-        // Odd N: fall back to DFT
-        dft_impl<Inverse, RealInput>(cache, in, N, out);
+    if (N - half_N*2 == 1) {
+        dft(in, N, out);
        return;
    }

-    // Split into even and odd
-    if constexpr (RealInput) {
-        // Real input: stride is 1, copy only real values
-        float * even = in + N;
-        for (int i = 0; i < half_N; ++i) {
-            even[i] = in[2 * i];
-        }
-        float * even_fft = out + 2 * N;
-        fft_impl<Inverse, true>(cache, even, half_N, even_fft);
-
-        float * odd = even;
-        for (int i = 0; i < half_N; ++i) {
-            odd[i] = in[2 * i + 1];
-        }
-        float * odd_fft = even_fft + N;
-        fft_impl<Inverse, true>(cache, odd, half_N, odd_fft);
-    } else {
-        // Complex input: stride is 2, copy complex pairs
-        float * even = in + N * 2;
-        for (int i = 0; i < half_N; ++i) {
-            even[i * 2 + 0] = in[2 * i * 2 + 0];
-            even[i * 2 + 1] = in[2 * i * 2 + 1];
-        }
-        float * even_fft = out + 2 * N;
-        fft_impl<Inverse, false>(cache, even, half_N, even_fft);
-
-        float * odd = even;
-        for (int i = 0; i < half_N; ++i) {
-            odd[i * 2 + 0] = in[(2 * i + 1) * 2 + 0];
-            odd[i * 2 + 1] = in[(2 * i + 1) * 2 + 1];
-        }
-        float * odd_fft = even_fft + N;
-        fft_impl<Inverse, false>(cache, odd, half_N, odd_fft);
+    float* even = in + N;
+    for (int i = 0; i < half_N; ++i) {
+        even[i]= in[2*i];
    }
+    float* even_fft = out + 2 * N;
+    fft(even, half_N, even_fft);

-    float * even_fft = out + 2 * N;
-    float * odd_fft  = even_fft + N;
+    float* odd = even;
+    for (int i = 0; i < half_N; ++i) {
+        odd[i] = in[2*i + 1];
+    }
+    float* odd_fft = even_fft + N;
+    fft(odd, half_N, odd_fft);

    const int sin_cos_step = n_sin_cos_vals / N;
-
-    constexpr float sign  = Inverse ? 1.0f : -1.0f;
-    constexpr float scale = Inverse ? 0.5f : 1.0f;
-
    for (int k = 0; k < half_N; k++) {
-        int   idx = k * sin_cos_step;  // t = 2*M_PI*k/N
-        float re  = cache.cos_vals[idx];
-        float im  = sign * cache.sin_vals[idx];
+        int idx = k * sin_cos_step; // t = 2*M_PI*k/N
+        float re =  g_cache.cos_vals[idx]; // cos(t)
+        float im = -g_cache.sin_vals[idx]; // sin(t)

-        float re_odd = odd_fft[2 * k + 0];
-        float im_odd = odd_fft[2 * k + 1];
+        float re_odd = odd_fft[2*k + 0];
+        float im_odd = odd_fft[2*k + 1];

-        out[2 * k + 0] = scale * (even_fft[2 * k + 0] + re * re_odd - im * im_odd);
-        out[2 * k + 1] = scale * (even_fft[2 * k + 1] + re * im_odd + im * re_odd);
+        out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd;
+        out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd;

-        out[2 * (k + half_N) + 0] = scale * (even_fft[2 * k + 0] - re * re_odd + im * im_odd);
-        out[2 * (k + half_N) + 1] = scale * (even_fft[2 * k + 1] - re * im_odd - im * re_odd);
+        out[2*(k + half_N) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd;
+        out[2*(k + half_N) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd;
    }
 }

-// Forward FFT for real input (used by mel spectrogram)
-static void fft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
-    fft_impl<false, true>(cache, in, N, out);
-}
-
-// Inverse FFT for complex input
-static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
-    fft_impl<true, false>(cache, in, N, out);
-}
-
 struct filter_params {
    int32_t n_mel;
    int32_t n_fft_bins;
@@ -265,27 +222,20 @@ struct filter_params {
    bool    norm_per_feature = false;
 };

-static void log_mel_spectrogram_worker_thread(int                        ith,
-                                              const float *              hann,
-                                              const std::vector<float> & samples,
-                                              int                        n_samples,
-                                              int                        frame_size,
-                                              int                        frame_step,
-                                              int                        n_threads,
-                                              const filter_params &      params,
-                                              const mtmd_audio_cache &   cache,
-                                              mtmd_audio_mel &           out) {
+static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
+                                              int n_samples, int frame_size, int frame_step, int n_threads,
+                                              const filter_params & params, mtmd_audio_mel & out) {
    std::vector<float> fft_in(frame_size * 2, 0.0);
    std::vector<float> fft_out(frame_size * 2 * 2 * 2);

    int n_fft_bins = params.n_fft_bins;
    int i = ith;

-    const auto & filters = cache.filters;
+    const auto & filters = g_cache.filters;

    // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist
    GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2));
-    GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size());
+    GGML_ASSERT(g_cache.sin_vals.size() == g_cache.cos_vals.size());
    // calculate FFT only when fft_in are not all zero
    for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) {
        const int offset = i * frame_step;
@@ -301,7 +251,7 @@ static void log_mel_spectrogram_worker_thread(int                        ith,
        }

        // FFT
-        fft(cache, fft_in.data(), frame_size, fft_out.data());
+        fft(fft_in.data(), frame_size, fft_out.data());

        // Calculate modulus^2 of complex numbers
        // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting.
@@ -348,7 +298,6 @@ static bool log_mel_spectrogram(
        const int     n_samples_in,
        const int     n_threads,
        const filter_params & params,
-        const mtmd_audio_cache & cache,
        mtmd_audio_mel & out) {
    //const int64_t t_start_us = ggml_time_us();

@@ -356,9 +305,9 @@ static bool log_mel_spectrogram(
    int n_samples = n_samples_in;

    // Hann window
-    const float * hann       = cache.hann_window.data();
-    const int     frame_size = (params.n_fft_bins - 1) * 2;
-    const int     frame_step = params.hop_length;
+    const float * hann = g_cache.hann_window.data();
+    const int frame_size = (params.n_fft_bins - 1) * 2;
+    const int frame_step = params.hop_length;

    // Padding
    std::vector<float> samples_padded;
@@ -386,9 +335,9 @@ static bool log_mel_spectrogram(

    // preemphasis
    if (params.preemph) {
-        const int   pad_amount = frame_size / 2;
+        const int pad_amount = frame_size / 2;
        const float preemph = 0.97f;
-        float       prev = samples_padded[pad_amount];
+        float prev = samples_padded[pad_amount];
        for (int i = pad_amount + 1; i + pad_amount < n_samples; ++i) {
            float cur = samples_padded[i];
            samples_padded[i] = cur - preemph * prev;
@@ -423,14 +372,14 @@ static bool log_mel_spectrogram(
    {
        std::vector<std::thread> workers(n_threads - 1);
        for (int iw = 0; iw < n_threads - 1; ++iw) {
-            workers[iw] =
-                std::thread(log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded), n_samples,
-                            frame_size, frame_step, n_threads, std::cref(params), std::cref(cache), std::ref(out));
+            workers[iw] = std::thread(
+                    log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded),
+                    n_samples, frame_size, frame_step, n_threads,
+                    std::cref(params), std::ref(out));
        }

        // main thread
-        log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params,
-                                          cache, out);
+        log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params, out);
        for (int iw = 0; iw < n_threads - 1; ++iw) {
            workers[iw].join();
        }
@@ -455,7 +404,7 @@ static bool log_mel_spectrogram(

            for (int j = 0; j < effective_n_len; ++j) {
                auto &value = out.data[i * out.n_len + j];
-                value        = (value - mean) / mstd;
+                value = (value - mean) / mstd;
            }

            // pad the rest with zeros
@@ -501,14 +450,18 @@ static bool log_mel_spectrogram(
 //

 void mtmd_audio_preprocessor_whisper::initialize() {
-    cache.fill_sin_cos_table(hparams.audio_n_fft);
-    cache.fill_hann_window(hparams.audio_window_len, true);
-    cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
+    g_cache.fill_sin_cos_table(hparams.audio_n_fft);
+    g_cache.fill_hann_window(hparams.audio_window_len, true);
+    g_cache.fill_mel_filterbank_matrix(
+        hparams.n_mel_bins,
+        hparams.audio_n_fft,
+        hparams.audio_sample_rate);
 }

-bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 samples,
-                                                 size_t                        n_samples,
-                                                 std::vector<mtmd_audio_mel> & output) {
+bool mtmd_audio_preprocessor_whisper::preprocess(
+        const float * samples,
+        size_t n_samples,
+        std::vector<mtmd_audio_mel> & output) {
    if (n_samples == 0) {
        // empty audio
        return false;
@@ -518,7 +471,7 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 s
    // if input is too short, pad with zeros
    // this is to avoid potential issues with stage1/2 padding in log_mel_spectrogram
    // TODO: maybe handle this better
-    size_t min_samples = (size_t) hparams.audio_sample_rate * (hparams.audio_chunk_len + 1);  // +1 second margin
+    size_t min_samples = (size_t)hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin
    if (n_samples < min_samples) {
        smpl.resize(min_samples, 0.0f);
        std::memcpy(smpl.data(), samples, n_samples * sizeof(float));
@@ -533,19 +486,22 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 s
    params.hop_length       = hparams.audio_hop_len;
    params.sample_rate      = hparams.audio_sample_rate;
    params.center_padding   = false;
-    params.preemph          = 0.0f;  // disabled
+    params.preemph          = 0.0f; // disabled
    params.use_natural_log  = false;
    params.norm_per_feature = false;

-    // make sure the cache is initialized
-    GGML_ASSERT(!cache.sin_vals.empty());
-    GGML_ASSERT(!cache.cos_vals.empty());
-    GGML_ASSERT(!cache.filters.data.empty());
+    // make sure the global cache is initialized
+    GGML_ASSERT(!g_cache.sin_vals.empty());
+    GGML_ASSERT(!g_cache.cos_vals.empty());
+    GGML_ASSERT(!g_cache.filters.data.empty());

    mtmd_audio_mel out_full;
-    bool           ok = log_mel_spectrogram(samples, n_samples,
-                                            4,  // n_threads
-                                            params, cache, out_full);
+    bool ok = log_mel_spectrogram(
+                samples,
+                n_samples,
+                4, // n_threads
+                params,
+                out_full);
    if (!ok) {
        return false;
    }
@@ -556,21 +512,21 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 s
        printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len);
    }
    const size_t frames_per_chunk = 3000;
-    GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk);
-    for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) {
-        int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off);
-        if ((size_t) n_len < frames_per_chunk) {
-            break;  // last uncomplete chunk will always be a padded chunk, safe to ignore
+    GGML_ASSERT((size_t)out_full.n_len > frames_per_chunk);
+    for (size_t off = 0; off < (size_t)out_full.n_len; off += frames_per_chunk) {
+        int n_len = std::min(frames_per_chunk, (size_t)out_full.n_len - off);
+        if ((size_t)n_len < frames_per_chunk) {
+            break; // last incomplete chunk will always be a padded chunk, safe to ignore
        }

        mtmd_audio_mel out_chunk;
        out_chunk.n_len     = n_len;
        out_chunk.n_mel     = out_full.n_mel;
-        out_chunk.n_len_org = out_full.n_mel;  // unused
+        out_chunk.n_len_org = out_full.n_mel; // unused
        out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len);

        for (int i = 0; i < out_full.n_mel; i++) {
-            auto src = out_full.data.begin() + i * out_full.n_len + off;
+            auto src = out_full.data.begin() + i*out_full.n_len + off;
            out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk);
        }

@@ -585,14 +541,18 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 s
 //

 void mtmd_audio_preprocessor_conformer::initialize() {
-    cache.fill_sin_cos_table(hparams.audio_n_fft);
-    cache.fill_hann_window(hparams.audio_window_len, true);
-    cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
+    g_cache.fill_sin_cos_table(hparams.audio_n_fft); // NOTE(review): g_cache replaces the former per-instance `cache`; presumably shared by all preprocessors — confirm that differing hparams cannot clash
+    g_cache.fill_hann_window(hparams.audio_window_len, true); // second argument presumably selects a periodic window — TODO confirm against the g_cache declaration
+    g_cache.fill_mel_filterbank_matrix(
+        hparams.n_mel_bins,
+        hparams.audio_n_fft,
+        hparams.audio_sample_rate); // preprocess() asserts that g_cache.sin_vals, cos_vals and filters.data are non-empty, so initialize() has to run first
 }

-bool mtmd_audio_preprocessor_conformer::preprocess(const float *                 samples,
-                                                   size_t                        n_samples,
-                                                   std::vector<mtmd_audio_mel> & output) {
+bool mtmd_audio_preprocessor_conformer::preprocess(
+        const float * samples,
+        size_t n_samples,
+        std::vector<mtmd_audio_mel> & output) {
    // empty audio
    if (n_samples == 0) {
        return false;
@@ -609,15 +569,18 @@ bool mtmd_audio_preprocessor_conformer::preprocess(const float *
    params.use_natural_log  = true;
    params.norm_per_feature = true;

-    // make sure the cache is initialized
-    GGML_ASSERT(!cache.sin_vals.empty());
-    GGML_ASSERT(!cache.cos_vals.empty());
-    GGML_ASSERT(!cache.filters.data.empty());
+    // make sure the global cache is initialized
+    GGML_ASSERT(!g_cache.sin_vals.empty());
+    GGML_ASSERT(!g_cache.cos_vals.empty());
+    GGML_ASSERT(!g_cache.filters.data.empty());

    mtmd_audio_mel out_full;
-    bool           ok = log_mel_spectrogram(samples, n_samples,
-                                            4,  // n_threads
-                                            params, cache, out_full);
+    bool ok = log_mel_spectrogram(
+                samples,
+                n_samples,
+                4, // n_threads
+                params,
+                out_full);
    if (!ok) {
        return false;
    }
@@ -625,106 +588,3 @@ bool mtmd_audio_preprocessor_conformer::preprocess(const float *
    output.push_back(std::move(out_full));
    return true;
 }
-
-//
-// mtmd_audio_streaming_istft implementation
-//
-
-mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length) :
-    n_fft(n_fft),
-    hop_length(hop_length),
-    n_fft_bins(n_fft / 2 + 1),
-    overlap_buffer(n_fft, 0.0f),
-    window_sum_buffer(n_fft, 0.0f),
-    padding_to_remove((n_fft - hop_length) / 2),
-    ifft_in(n_fft * 2 * 4, 0.0f),  // extra space for recursive IFFT
-    ifft_out(n_fft * 2 * 4, 0.0f) {
-    cache.fill_sin_cos_table(n_fft);
-    cache.fill_hann_window(n_fft, true);
-}
-
-void mtmd_audio_streaming_istft::reset() {
-    std::fill(overlap_buffer.begin(), overlap_buffer.end(), 0.0f);
-    std::fill(window_sum_buffer.begin(), window_sum_buffer.end(), 0.0f);
-    padding_to_remove = (n_fft - hop_length) / 2;
-}
-
-std::vector<float> mtmd_audio_streaming_istft::process_frame(const float * frame_spectrum) {
-    std::vector<float> output(hop_length);
-
-    // copy frequencies
-    for (int j = 0; j < n_fft_bins; j++) {
-        ifft_in[j * 2 + 0] = frame_spectrum[j * 2 + 0];
-        ifft_in[j * 2 + 1] = frame_spectrum[j * 2 + 1];
-    }
-
-    // mirror negative frequencies
-    for (int j = 1; j < n_fft_bins - 1; j++) {
-        int mirror_idx              = n_fft - j;
-        ifft_in[mirror_idx * 2 + 0] = ifft_in[j * 2 + 0];
-        ifft_in[mirror_idx * 2 + 1] = -ifft_in[j * 2 + 1];  // conjugate
-    }
-
-    ifft(cache, ifft_in.data(), n_fft, ifft_out.data());
-
-    // update window sum and overlap buffer
-    for (int j = 0; j < n_fft; j++) {
-        window_sum_buffer[j] += cache.hann_window[j] * cache.hann_window[j];
-        overlap_buffer[j] += ifft_out[j * 2] * cache.hann_window[j];
-    }
-
-    // extract hop_length samples with normalization
-    for (int i = 0; i < hop_length; i++) {
-        if (window_sum_buffer[i] > 1e-8f) {
-            output[i] = overlap_buffer[i] / window_sum_buffer[i];
-        } else {
-            output[i] = overlap_buffer[i];
-        }
-    }
-
-    // shift buffers left by hop_length
-    std::copy(overlap_buffer.begin() + hop_length, overlap_buffer.end(), overlap_buffer.begin());
-    std::fill(overlap_buffer.end() - hop_length, overlap_buffer.end(), 0.0f);
-
-    std::copy(window_sum_buffer.begin() + hop_length, window_sum_buffer.end(), window_sum_buffer.begin());
-    std::fill(window_sum_buffer.end() - hop_length, window_sum_buffer.end(), 0.0f);
-
-    // Remove padding if needed
-    int to_remove = std::min(padding_to_remove, (int) output.size());
-    padding_to_remove -= to_remove;
-    output.erase(output.begin(), output.begin() + to_remove);
-
-    return output;
-}
-
-std::vector<float> mtmd_audio_streaming_istft::flush() {
-    std::vector<float> output;
-
-    // Extract remaining samples from overlap buffer
-    // Continue until we've extracted all meaningful samples
-    int remaining = n_fft - hop_length;
-    while (remaining > 0) {
-        int chunk_size = std::min(remaining, hop_length);
-
-        for (int i = 0; i < chunk_size; i++) {
-            float sample;
-            if (window_sum_buffer[i] > 1e-8f) {
-                sample = overlap_buffer[i] / window_sum_buffer[i];
-            } else {
-                sample = overlap_buffer[i];
-            }
-            output.push_back(sample);
-        }
-
-        // Shift buffers
-        std::copy(overlap_buffer.begin() + chunk_size, overlap_buffer.end(), overlap_buffer.begin());
-        std::fill(overlap_buffer.end() - chunk_size, overlap_buffer.end(), 0.0f);
-
-        std::copy(window_sum_buffer.begin() + chunk_size, window_sum_buffer.end(), window_sum_buffer.begin());
-        std::fill(window_sum_buffer.end() - chunk_size, window_sum_buffer.end(), 0.0f);
-
-        remaining -= chunk_size;
-    }
-
-    return output;
-}
--- a/tools/mtmd/mtmd-audio.h
+++ b/tools/mtmd/mtmd-audio.h
@@ -17,38 +17,6 @@ struct mtmd_audio_mel {
    std::vector<float> data;
 };

-struct mtmd_audio_mel_filters {
-    int32_t n_mel;
-    int32_t n_fft;
-
-    std::vector<float> data;
-};
-
-// cache for audio processing, each processor instance owns its own cache
-struct mtmd_audio_cache {
-    std::vector<float> sin_vals;
-    std::vector<float> cos_vals;
-
-    std::vector<float> hann_window;
-
-    mtmd_audio_mel_filters filters;
-
-    void fill_sin_cos_table(int n);
-
-    void fill_hann_window(int length, bool periodic);
-
-    // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
-    // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
-    void fill_mel_filterbank_matrix(int   n_mel,
-                                    int   n_fft,
-                                    int   sample_rate,               // e.g. 16000
-                                    float fmin             = 0.0f,   // e.g. 0.0
-                                    float fmax             = -1.0f,  // e.g. sr/2; pass -1 for auto
-                                    bool  slaney_area_norm = true,
-                                    float scale = 1.0f  // optional extra scaling
-    );
-};
-
 struct mtmd_audio_preprocessor {
    const clip_hparams & hparams;

@@ -63,51 +31,10 @@ struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
    mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
    void initialize() override;
    bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
-
-  private:
-    mtmd_audio_cache cache;
 };

 struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
    mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
    void initialize() override;
    bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
-
-  private:
-    mtmd_audio_cache cache;
-};
-
-//
-// streaming ISTFT - converts spectrogram frames back to audio one frame at a time
-//
-struct mtmd_audio_streaming_istft {
-    mtmd_audio_streaming_istft(int n_fft, int hop_length);
-
-    // reset streaming state
-    void reset();
-
-    // process a single STFT frame (streaming)
-    // frame_spectrum: [n_fft_bins x 2] interleaved real/imag
-    // returns: up to hop_length samples
-    std::vector<float> process_frame(const float * frame_spectrum);
-
-    // flush remaining samples at end of stream
-    std::vector<float> flush();
-
-  private:
-    int n_fft;
-    int hop_length;
-    int n_fft_bins;
-
-    // Own cache for output processing
-    mtmd_audio_cache cache;
-
-    // Streaming state
-    std::vector<float> overlap_buffer;
-    std::vector<float> window_sum_buffer;
-    int                padding_to_remove;
-
-    // Working buffers for IFFT
-    std::vector<float> ifft_in;
-    std::vector<float> ifft_out;
 };
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -1,5 +1,4 @@
 #include "arg.h"
-#include "debug.h"
 #include "log.h"
 #include "common.h"
 #include "sampling.h"
@@ -89,8 +88,6 @@ struct mtmd_cli_context {
    int n_threads    = 1;
    llama_pos n_past = 0;

-    base_callback_data cb_data;
-
    mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
        model = llama_init->model();
        lctx = llama_init->context();
@@ -142,10 +139,6 @@ struct mtmd_cli_context {
        mparams.warmup           = params.warmup;
        mparams.image_min_tokens = params.image_min_tokens;
        mparams.image_max_tokens = params.image_max_tokens;
-        if (std::getenv("MTMD_DEBUG_GRAPH") != nullptr) {
-            mparams.cb_eval_user_data = &cb_data;
-            mparams.cb_eval = common_debug_cb_eval<false>;
-        }
        ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
        if (!ctx_vision.get()) {
            LOG_ERR("Failed to load vision model from %s\n", clip_path);
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -111,8 +111,6 @@ mtmd_context_params mtmd_context_params_default() {
        /* warmup            */ true,
        /* image_min_tokens  */ -1,
        /* image_max_tokens  */ -1,
-        /* cb_eval           */ nullptr,
-        /* cb_eval_user_data */ nullptr,
    };
    return params;
 }
@@ -148,6 +146,8 @@ struct mtmd_context {
    bool        tok_row_end_trail = false;
    bool        ov_img_first      = false;

+    bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
+
    // string template for slice image delimiters with row/col (idefics3)
    std::string sli_img_start_tmpl;

@@ -178,8 +178,6 @@ struct mtmd_context {
            /* image_min_tokens  */ ctx_params.image_min_tokens,
            /* image_max_tokens  */ ctx_params.image_max_tokens,
            /* warmup            */ ctx_params.warmup,
-            /* cb_eval           */ ctx_params.cb_eval,
-            /* cb_eval_user_data */ ctx_params.cb_eval_user_data,
        };

        auto res = clip_init(mmproj_fname, ctx_clip_params);
@@ -219,6 +217,7 @@ struct mtmd_context {

    void init_vision() {
        GGML_ASSERT(ctx_v != nullptr);
+        use_mrope = clip_is_mrope(ctx_v);

        projector_type proj = clip_get_projector_type(ctx_v);
        int minicpmv_version = clip_is_minicpmv(ctx_v);
@@ -267,7 +266,7 @@ struct mtmd_context {
        }

        // set boi/eoi
-        if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
+        if (proj == PROJECTOR_TYPE_GEMMA3) {
            // <start_of_image> ... (image embeddings) ... <end_of_image>
            img_beg = "<start_of_image>";
            img_end = "<end_of_image>";
@@ -284,7 +283,7 @@ struct mtmd_context {
            // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
            img_end = "[IMG_END]";

-        } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
+        } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) {
            // <|vision_start|> ... (image embeddings) ... <|vision_end|>
            img_beg = "<|vision_start|>";
            img_end = "<|vision_end|>";
@@ -331,7 +330,6 @@ struct mtmd_context {
            case PROJECTOR_TYPE_ULTRAVOX:
            case PROJECTOR_TYPE_VOXTRAL:
            case PROJECTOR_TYPE_GLMA:
-            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
                audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
                break;
            case PROJECTOR_TYPE_LFM2A:
@@ -354,9 +352,6 @@ struct mtmd_context {
            // [BEGIN_AUDIO] ... (embeddings) ...
            aud_beg = "[BEGIN_AUDIO]";

-        } else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
-            // <sound> ... (embeddings) ...
-            aud_beg = "<sound>";
        }
    }

@@ -628,7 +623,7 @@ struct mtmd_tokenizer {
                }

                mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-                if (mtmd_decode_use_mrope(ctx)) {
+                if (ctx->use_mrope) {
                    // for Qwen2VL, we need this information for M-RoPE decoding positions
                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
@@ -863,24 +858,14 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
 }

 bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
-    switch (ctx->proj_type_v()) {
-        case PROJECTOR_TYPE_GEMMA3:
-            return true;
-        default:
-            return false;
+    if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) { // ctx_v is null-checked before its projector type is queried
+        return true; // only the GEMMA3 vision projector reports non-causal decoding
     }
+    return false; // no vision context, or any other projector type
 }

 bool mtmd_decode_use_mrope(mtmd_context * ctx) {
-    switch (ctx->proj_type_v()) {
-        case PROJECTOR_TYPE_QWEN2VL:
-        case PROJECTOR_TYPE_QWEN25VL:
-        case PROJECTOR_TYPE_QWEN3VL:
-        case PROJECTOR_TYPE_GLM4V:
-            return true;
-        default:
-            return false;
-    }
+    return ctx->use_mrope; // cached flag: defaults to false and is assigned from clip_is_mrope(ctx_v) in init_vision()
 }

 bool mtmd_support_vision(mtmd_context * ctx) {
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -27,9 +27,6 @@
 * - Make sure the C API is aligned with the libllama C API (as in llama.h)
 * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
 * - Keep the API minimal, do not expose internal details unless necessary
- *
- * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
- * We encourage human contributors to ensure the quality and reliability of the codebase.
 */

 #ifdef LLAMA_SHARED
@@ -95,10 +92,6 @@ struct mtmd_context_params {
    // limit number of image tokens, only for vision models with dynamic resolution
    int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
    int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
-
-    // callback function passed over to mtmd proper
-    ggml_backend_sched_eval_callback cb_eval;
-    void * cb_eval_user_data;
 };

 MTMD_API const char * mtmd_default_marker(void);
@@ -277,12 +270,12 @@ struct bitmap {
        ptr.reset(mtmd_bitmap_init(nx, ny, data));
    }
    ~bitmap() = default;
-    uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
-    uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
-    const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); }
-    size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); }
-    std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }
-    void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); }
+    uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
+    uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
+    const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
+    size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
+    std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
+    void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
 };

 struct bitmaps {
@@ -306,8 +299,8 @@ struct input_chunks {
    input_chunks() = default;
    input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
    ~input_chunks() = default;
-    size_t size() const { return mtmd_input_chunks_size(ptr.get()); }
-    const mtmd_input_chunk * operator[](size_t idx) const {
+    size_t size() { return mtmd_input_chunks_size(ptr.get()); }
+    const mtmd_input_chunk * operator[](size_t idx) {
        return mtmd_input_chunks_get(ptr.get(), idx);
    }
 };