sync from b7516

2026-01-16 11:16:14 +08:00
parent f4ae4cc7da
commit 6ee41dd9e3
380 changed files with 18435 additions and 38806 deletions
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -152,14 +152,18 @@ struct clip_ctx {
    ggml_backend_t backend_cpu = nullptr;
    ggml_backend_buffer_ptr buf;

-
    int max_nodes = 8192;
    ggml_backend_sched_ptr sched;
    clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
    bool is_allocated = false;

+    // for debugging: debug_graph is enabled by the MTMD_DEBUG_GRAPH env var (or temporarily by clip_debug_encode)
+    bool debug_graph = false;
+    std::vector<ggml_tensor *> debug_print_tensors;
+
    clip_ctx(clip_context_params & ctx_params) {
        flash_attn_type = ctx_params.flash_attn_type;
+        debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
        backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
        if (!backend_cpu) {
            throw std::runtime_error("failed to initialize CPU backend");
@@ -200,10 +204,6 @@ struct clip_ctx {
        sched.reset(
            ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
        );
-
-        if (ctx_params.cb_eval != nullptr) {
-            ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data);
-        }
    }

    ~clip_ctx() {
@@ -239,7 +239,9 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
        n_mmproj_embd(clip_n_mmproj_embd(ctx)),
        eps(hparams.eps),
        kq_scale(1.0f / sqrtf((float)d_head)),
-        flash_attn_type(ctx->flash_attn_type) {
+        flash_attn_type(ctx->flash_attn_type),
+        debug_graph(ctx->debug_graph),
+        debug_print_tensors(ctx->debug_print_tensors) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ctx->buf_compute_meta.size(),
        /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
@@ -250,11 +252,14 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
    gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
 }

-void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
-    if (il >= 0) {
-        ggml_format_name(cur, "%s-%d", name, il);
-    } else {
-        ggml_set_name(cur, name);
+void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const { // debug hook: when debug_graph is set, adds a named copy of cur0 to the graph and records it; otherwise a no-op (cur0 itself is never renamed here)
+    if (debug_graph) {
+        ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0)); // copy cur0 into a freshly duplicated tensor rather than tagging cur0 directly
+        std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name; // "<name>_<il>" when il >= 0, plain name otherwise
+        ggml_set_name(cur, cur_name.c_str());
+        ggml_set_output(cur); // NOTE(review): presumably keeps the copy's data available after compute - confirm against ggml
+        ggml_build_forward_expand(gf, cur); // make the copy part of graph gf
+        debug_print_tensors.push_back(cur); // NOTE(review): looks like the list printed in clip_image_batch_encode - confirm this member aliases ctx->debug_print_tensors
    }
 }

@@ -783,10 +788,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            {
                builder = std::make_unique<clip_graph_siglip>(ctx, img);
            } break;
-        case PROJECTOR_TYPE_GEMMA3NV:
-            {
-                builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
-            } break;
        case PROJECTOR_TYPE_PIXTRAL:
        case PROJECTOR_TYPE_LIGHTONOCR:
            {
@@ -817,7 +818,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_QWEN2A:
        case PROJECTOR_TYPE_GLMA:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            {
                builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
            } break;
@@ -845,10 +845,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            {
                builder = std::make_unique<clip_graph_glm4v>(ctx, img);
            } break;
-        case PROJECTOR_TYPE_YOUTUVL:
-            {
-                builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
-            } break;
        default:
            GGML_ABORT("missing cgraph builder");
    }
@@ -1145,14 +1141,6 @@ struct clip_model_loader {
                        // test model (tinygemma3) has a different value, we optionally read it
                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                    } break;
-
-                case PROJECTOR_TYPE_GEMMA3NV:
-                    {
-                        // Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
-                        // Similar configuration to Gemma3
-                        hparams.n_merge = 1;  // MobileNetV5 handles resizing internally
-                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
-                    } break;
                case PROJECTOR_TYPE_QWEN2VL:
                case PROJECTOR_TYPE_QWEN25VL:
                case PROJECTOR_TYPE_QWEN3VL:
@@ -1170,20 +1158,6 @@ struct clip_model_loader {
                            LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
                        }
                    } break;
-                case PROJECTOR_TYPE_YOUTUVL:
-                    {
-                        hparams.n_merge = 2;
-                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
-                        get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
-                        std::vector<int> wa_layer_indexes_vec;
-                        get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
-                        for (auto & layer : wa_layer_indexes_vec) {
-                            hparams.wa_layer_indexes.insert(layer);
-                        }
-                        // support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens
-                        hparams.set_limit_image_tokens(1, 62500);
-                        hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
-                    } break;
                case PROJECTOR_TYPE_GLM4V:
                    {
                        hparams.rope_theta = 10000.0f;
@@ -1202,7 +1176,6 @@ struct clip_model_loader {
                case PROJECTOR_TYPE_QWEN2A:
                case PROJECTOR_TYPE_GLMA:
                case PROJECTOR_TYPE_VOXTRAL:
-                case PROJECTOR_TYPE_MUSIC_FLAMINGO:
                    {
                        bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
                                             model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
@@ -1252,14 +1225,7 @@ struct clip_model_loader {
                LOG_INF("%s: has_llava_proj:     %d\n", __func__, hparams.has_llava_projector);
                LOG_INF("%s: minicpmv_version:   %d\n", __func__, hparams.minicpmv_version);
                LOG_INF("%s: n_merge:            %d\n", __func__, hparams.n_merge);
-                LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
-                if (!hparams.wa_layer_indexes.empty()) {
-                    LOG_INF("%s: wa_layer_indexes:  ", __func__);
-                    for (auto & layer : hparams.wa_layer_indexes) {
-                        LOG_INF("%d ", layer);
-                    }
-                    LOG_INF("\n");
-                }
+                LOG_INF("%s: n_wa_pattern:       %d\n", __func__, hparams.n_wa_pattern);
                if (hparams.image_min_pixels > 0) {
                    LOG_INF("%s: image_min_pixels:   %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
                }
@@ -1341,10 +1307,6 @@ struct clip_model_loader {

        model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);

-        if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
-            hparams.n_layer = 0; // gemma3n does not use normal layer structure
-        }
-
        // layers
        model.layers.resize(hparams.n_layer);
        for (int il = 0; il < hparams.n_layer; ++il) {
@@ -1419,7 +1381,6 @@ struct clip_model_loader {
            }
        }

-
        switch (model.proj_type) {
            case PROJECTOR_TYPE_MLP:
            case PROJECTOR_TYPE_MLP_NORM:
@@ -1514,8 +1475,8 @@ struct clip_model_loader {
                    model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
                    model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
                    model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
-                    model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI));
-                    model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI));
+                    model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
+                    model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
                } break;
            case PROJECTOR_TYPE_QWEN2VL:
            case PROJECTOR_TYPE_QWEN25VL:
@@ -1532,14 +1493,6 @@ struct clip_model_loader {
                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
                } break;
-            case PROJECTOR_TYPE_YOUTUVL:
-                {
-                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);        // merger.ln_q (RMS norm)
-                    model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));  // merger.mlp.0
-                    model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
-                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));  // merger.mlp.2
-                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
-                } break;
            case PROJECTOR_TYPE_GLM4V:
                {
                    model.projection     = get_tensor(TN_MM_PROJECTOR);
@@ -1559,112 +1512,11 @@ struct clip_model_loader {
                    model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
                    model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
                } break;
-            case PROJECTOR_TYPE_GEMMA3NV:
-                {
-                    model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
-                    model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false);
-                    model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false);
-
-                    model.msfa_ffn_expand_w  = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false);
-                    model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded
-                    model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false);
-                    model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false);
-
-                    model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false);
-
-                    // Dynamically load blocks stage by stage
-                    for (int stage = 0; stage < 4; ++stage) {
-                        int blocks_found_in_stage = 0;
-
-                        for (int blk_idx = 0; ; ++blk_idx) {
-                            bool found_block = false;
-                            mobilenetv5_block block;
-
-                            // 1. Check for Edge Residual (S0)
-                            block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false);
-                            if (block.s0_conv_exp_w) {
-                                found_block = true;
-                                block.s0_bn1_w      = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false);
-                                block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false);
-                                block.s0_bn2_w      = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false);
-                            }
-                            // 2. Check for UIR (Universal Inverted Residual)
-                            else {
-                                // Check for dw_start OR pw_exp (some UIR blocks skip dw_start)
-                                block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false);
-                                block.pw_exp_w   = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false);
-
-                                if (block.dw_start_w || block.pw_exp_w) {
-                                    found_block = true;
-                                    if (block.dw_start_w) {
-                                        block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false);
-                                    }
-                                    if (block.pw_exp_w) {
-                                        block.pw_exp_bn_w   = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false);
-                                    }
-                                    block.dw_mid_w      = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false);
-                                    if (block.dw_mid_w) {
-                                        block.dw_mid_bn_w   = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false);
-                                    }
-                                    block.pw_proj_w     = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false);
-                                    if (block.pw_proj_w) {
-                                        block.pw_proj_bn_w  = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false);
-                                    }
-                                    block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
-                                }
-                            }
-
-                            // 3. Check for Attention (MQA)
-                            // Even if UIR/Edge check failed, this might be a pure attention block
-                            ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false);
-                            if (attn_q_check) {
-                                found_block = true;
-                                block.attn_q_w = attn_q_check;
-                                block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false);
-                                block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false);
-                                block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false);
-                                block.attn_k_dw_w   = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false);
-                                block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false);
-                                block.attn_v_dw_w   = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false);
-                                block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false);
-                                block.attn_norm_w   = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false);
-                                // Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check
-                                if (!block.layer_scale_w) {
-                                    block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
-                                }
-                            }
-
-                            if (found_block) {
-                                model.mobilenet_blocks.push_back(block);
-                                blocks_found_in_stage++;
-                            } else {
-                                // End of blocks for this stage
-                                break;
-                            }
-                        }
-
-                        // Track where this stage ends in the flat vector
-                        if (blocks_found_in_stage > 0) {
-                            model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1);
-                            LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1);
-                        }
-                    }
-                    model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
-                    model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
-                } break;
            case PROJECTOR_TYPE_IDEFICS3:
                {
                    model.projection = get_tensor(TN_MM_PROJECTOR);
                } break;
            case PROJECTOR_TYPE_LFM2:
-                {
-                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
-                    model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false);
-                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
-                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
-                    model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
-                    model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
-                } break;
            case PROJECTOR_TYPE_KIMIVL:
                {
                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
@@ -1724,17 +1576,6 @@ struct clip_model_loader {
                    model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
                    model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
                } break;
-            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
-                {
-                    model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
-                    model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
-                    model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
-                    model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
-                    model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
-                    model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
-                    model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
-                    model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
-                } break;
            case PROJECTOR_TYPE_INTERNVL:
                {
                    model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -1756,8 +1597,8 @@ struct clip_model_loader {
                    model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
                    model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
                    model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
-                    model.mm_boi = get_tensor(string_format(TN_TOK_BOI));
-                    model.mm_eoi = get_tensor(string_format(TN_TOK_EOI));
+                    model.mm_boi = get_tensor(string_format(TN_TOK_BOI, "weight"));
+                    model.mm_eoi = get_tensor(string_format(TN_TOK_EOI, "weight"));
                } break;
            case PROJECTOR_TYPE_LLAMA4:
                {
@@ -2107,7 +1948,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params

    try {
        clip_model_loader loader(fname);
-        bool skip_audio = false;

        if (loader.has_vision) {
            ctx_vision = new clip_ctx(ctx_params);
@@ -2117,14 +1957,10 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
                loader.warmup(*ctx_vision);
            }

-            // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
-            // we can remove this check when we implement audio support for Gemma 3N
-            skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
-
            // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
        }

-        if (loader.has_audio && !skip_audio) {
+        if (loader.has_audio) {
            ctx_audio = new clip_ctx(ctx_params);
            loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
            loader.load_tensors(*ctx_audio);
@@ -2848,57 +2684,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                // res_imgs->data[0] = *res;
                res_imgs->entries.push_back(std::move(img_f32));
            } break;
-        case PROJECTOR_TYPE_YOUTUVL:
-            {
-                const int patch_size = params.patch_size;  // typically 16
-                const int merge_size = params.n_merge;      // typically 2
-                const int align_size = patch_size * merge_size;  // 32
-
-                const int max_num_patches = params.image_max_pixels > 0 ?
-                    params.image_max_pixels / (patch_size * patch_size) : 256;
-
-                // Linear search for optimal scale to fit within max_num_patches
-                float scale = 1.0f;
-                int target_height = original_size.height;
-                int target_width = original_size.width;
-
-                auto get_scaled_image_size = [align_size](float scale, int size) -> int {
-                    float scaled_size = size * scale;
-                    // Round up to nearest multiple of align_size
-                    int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
-                    // Ensure at least one patch
-                    return std::max(align_size, aligned);
-                };
-
-                // Linear search with 0.02 step size
-                while (scale > 0.0f) {
-                    target_height = get_scaled_image_size(scale, original_size.height);
-                    target_width = get_scaled_image_size(scale, original_size.width);
-
-                    int num_patches_h = target_height / patch_size;
-                    int num_patches_w = target_width / patch_size;
-                    int num_patches = num_patches_h * num_patches_w;
-
-                    if (num_patches > max_num_patches) {
-                        scale -= 0.02f;
-                    } else {
-                        break;
-                    }
-                }
-
-                clip_image_size new_size = {target_width, target_height};
-
-                // Resize the image
-                clip_image_u8 resized;
-                img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
-
-                // Normalize to float32
-                clip_image_f32_ptr img_f32(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
-
-                // Add to results
-                res_imgs->entries.push_back(std::move(img_f32));
-            } break;

        case PROJECTOR_TYPE_IDEFICS3:
            {
@@ -2962,16 +2747,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                res_imgs->entries.push_back(std::move(img_f32));
            } break;

-        case PROJECTOR_TYPE_GEMMA3NV:
-            {
-                clip_image_u8 resized_image;
-                int sz = params.image_size;
-                img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
-                clip_image_f32_ptr img_f32(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
-                res_imgs->entries.push_back(std::move(img_f32));
-            } break;
-
        case PROJECTOR_TYPE_JANUS_PRO:
            {
                // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
@@ -3141,7 +2916,6 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_YOUTUVL:
            return (img->nx / params.patch_size) / 2;
        default:
            break;
@@ -3157,7 +2931,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_YOUTUVL:
            return (img->ny / params.patch_size) / 2;
        default:
            break;
@@ -3218,7 +2991,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_YOUTUVL:
            {
                // dynamic size (2 conv, so double patch size)
                int x_patch = img->nx / (params.patch_size * 2);
@@ -3234,12 +3006,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                int scale_factor = ctx->model.hparams.n_merge;
                n_patches /= (scale_factor * scale_factor);
            } break;
-        case PROJECTOR_TYPE_GEMMA3NV:
-            {
-                // MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution
-                // regardless of input size (see architecture description)
-                n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size;
-            } break;
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_KIMIVL:
            {
@@ -3265,7 +3031,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_QWEN2A:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            {
                n_patches = img->nx;

@@ -3334,6 +3099,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    }

    // build the inference graph
+    ctx->debug_print_tensors.clear();
    ggml_backend_sched_reset(ctx->sched.get());
    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
@@ -3351,6 +3117,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    const int pos_w = image_size_width  / patch_size;
    const int pos_h = image_size_height / patch_size;

+    const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl

    auto get_inp_tensor = [&gf](const char * name) {
        ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
@@ -3499,11 +3266,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                set_input_i32("positions", positions);
            } break;
        case PROJECTOR_TYPE_QWEN25VL:
-        case PROJECTOR_TYPE_YOUTUVL:
            {
                // pw * ph = number of tokens output by ViT after apply patch merger
                // ipw * ipw = number of vision token been processed inside ViT
-                const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
                const int merge_ratio = 2;
                const int pw  = image_size_width  / patch_size / merge_ratio;
                const int ph  = image_size_height / patch_size / merge_ratio;
@@ -3514,7 +3279,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                std::vector<int> inv_idx(ph * pw);

                if (use_window_attn) {
-                    const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
+                    const int attn_window_size = 112;
                    const int grid_window = attn_window_size / patch_size / merge_ratio;
                    int dst = 0;
                    // [num_vision_tokens, num_vision_tokens] attention mask tensor
@@ -3631,7 +3396,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                set_input_i32("patches", patches);
            } break;
        case PROJECTOR_TYPE_GEMMA3:
-        case PROJECTOR_TYPE_GEMMA3NV:
        case PROJECTOR_TYPE_IDEFICS3:
        case PROJECTOR_TYPE_INTERNVL:
        case PROJECTOR_TYPE_QWEN2A:
@@ -3639,7 +3403,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_VOXTRAL:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
        case PROJECTOR_TYPE_JANUS_PRO:
        case PROJECTOR_TYPE_COGVLM:
            {
@@ -3703,6 +3466,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        return false;
    }

+    // print debug nodes
+    if (ctx->debug_graph) {
+        LOG_INF("\n\n---\n\n");
+        LOG_INF("\n\nDebug graph:\n\n");
+        for (ggml_tensor * t : ctx->debug_print_tensors) {
+            std::vector<uint8_t> data(ggml_nbytes(t));
+            ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
+            print_tensor_shape(t);
+            print_tensor_data(t, data.data(), 3);
+        }
+    }
+
    // the last node is the embedding tensor
    ggml_tensor * embeddings = ggml_graph_node(gf, -1);

@@ -3741,19 +3516,16 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_JANUS_PRO:
-        case PROJECTOR_TYPE_YOUTUVL:
            return ctx->model.mm_1_b->ne[0];
        case PROJECTOR_TYPE_QWEN3VL:
            // main path + deepstack paths
            return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
        case PROJECTOR_TYPE_GEMMA3:
-        case PROJECTOR_TYPE_GEMMA3NV:
            return ctx->model.mm_input_proj_w->ne[0];
        case PROJECTOR_TYPE_IDEFICS3:
            return ctx->model.projection->ne[1];
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_VOXTRAL:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            return ctx->model.mm_2_w->ne[1];
        case PROJECTOR_TYPE_INTERNVL:
            return ctx->model.mm_3_w->ne[1];
@@ -3778,7 +3550,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
 }

 int clip_is_minicpmv(const struct clip_ctx * ctx) {
-    // TODO: remove this function
    if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
        return ctx->model.hparams.minicpmv_version;
    }
@@ -3786,14 +3557,24 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
 }

 bool clip_is_glm(const struct clip_ctx * ctx) {
-    // TODO: remove this function
    return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
 }

+bool clip_is_mrope(const struct clip_ctx * ctx) { // true iff proj_type() is QWEN2VL, QWEN25VL, QWEN3VL or GLM4V
+    return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
+        || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
+        || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
+        || ctx->proj_type() == PROJECTOR_TYPE_GLM4V; // NOTE(review): name suggests these are the M-RoPE projectors - confirm the list is complete
+}
+
 bool clip_is_llava(const struct clip_ctx * ctx) {
    return ctx->model.hparams.has_llava_projector;
 }

+bool clip_is_gemma3(const struct clip_ctx * ctx) { // true only when proj_type() is PROJECTOR_TYPE_GEMMA3
+    return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
+}
+
 bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
    return ctx->model.modality == CLIP_MODALITY_VISION;
 }
@@ -3803,16 +3584,10 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
 }

 bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
-    switch (ctx->proj_type()) {
-        case PROJECTOR_TYPE_ULTRAVOX:
-        case PROJECTOR_TYPE_QWEN2A:
-        case PROJECTOR_TYPE_GLMA:
-        case PROJECTOR_TYPE_VOXTRAL:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
-            return true;
-        default:
-            return false;
-    }
+    return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX // true for exactly the four projector types compared here
+        || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
+        || ctx->proj_type() == PROJECTOR_TYPE_GLMA
+        || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL; // NOTE(review): presumably must mirror the clip_graph_whisper_enc cases in clip_image_build_graph - confirm
 }

 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
@@ -3854,6 +3629,7 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
 //
 // API for debugging
 //
+
 void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
    clip_image_f32 img;
    img.nx = w;
@@ -3862,6 +3638,9 @@ void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
    for (int i = 0; i < h * w * 3; i++) {
        img.buf[i] = static_cast<float>(fill_value);
    }
+    bool cur_debug_graph = ctx->debug_graph;
+    ctx->debug_graph = true;
    clip_image_encode(ctx, 1, &img, nullptr);
+    ctx->debug_graph = cur_debug_graph;
    GGML_ASSERT(img.buf.empty() && "expected, always stop here");
 }