sync from b7516
This commit is contained in:
@@ -152,14 +152,18 @@ struct clip_ctx {
|
||||
ggml_backend_t backend_cpu = nullptr;
|
||||
ggml_backend_buffer_ptr buf;
|
||||
|
||||
|
||||
int max_nodes = 8192;
|
||||
ggml_backend_sched_ptr sched;
|
||||
clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
|
||||
bool is_allocated = false;
|
||||
|
||||
// for debugging
|
||||
bool debug_graph = false;
|
||||
std::vector<ggml_tensor *> debug_print_tensors;
|
||||
|
||||
clip_ctx(clip_context_params & ctx_params) {
|
||||
flash_attn_type = ctx_params.flash_attn_type;
|
||||
debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
|
||||
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
|
||||
if (!backend_cpu) {
|
||||
throw std::runtime_error("failed to initialize CPU backend");
|
||||
@@ -200,10 +204,6 @@ struct clip_ctx {
|
||||
sched.reset(
|
||||
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
|
||||
);
|
||||
|
||||
if (ctx_params.cb_eval != nullptr) {
|
||||
ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data);
|
||||
}
|
||||
}
|
||||
|
||||
~clip_ctx() {
|
||||
@@ -239,7 +239,9 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
|
||||
n_mmproj_embd(clip_n_mmproj_embd(ctx)),
|
||||
eps(hparams.eps),
|
||||
kq_scale(1.0f / sqrtf((float)d_head)),
|
||||
flash_attn_type(ctx->flash_attn_type) {
|
||||
flash_attn_type(ctx->flash_attn_type),
|
||||
debug_graph(ctx->debug_graph),
|
||||
debug_print_tensors(ctx->debug_print_tensors) {
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ ctx->buf_compute_meta.size(),
|
||||
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
|
||||
@@ -250,11 +252,14 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
|
||||
gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
|
||||
}
|
||||
|
||||
void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
|
||||
if (il >= 0) {
|
||||
ggml_format_name(cur, "%s-%d", name, il);
|
||||
} else {
|
||||
ggml_set_name(cur, name);
|
||||
void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const {
|
||||
if (debug_graph) {
|
||||
ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0));
|
||||
std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name;
|
||||
ggml_set_name(cur, cur_name.c_str());
|
||||
ggml_set_output(cur);
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
debug_print_tensors.push_back(cur);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -783,10 +788,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
{
|
||||
builder = std::make_unique<clip_graph_siglip>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
{
|
||||
@@ -817,7 +818,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
|
||||
} break;
|
||||
@@ -845,10 +845,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
{
|
||||
builder = std::make_unique<clip_graph_glm4v>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("missing cgraph builder");
|
||||
}
|
||||
@@ -1145,14 +1141,6 @@ struct clip_model_loader {
|
||||
// test model (tinygemma3) has a different value, we optionally read it
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
// Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
|
||||
// Similar configuration to Gemma3
|
||||
hparams.n_merge = 1; // MobileNetV5 handles resizing internally
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
@@ -1170,20 +1158,6 @@ struct clip_model_loader {
|
||||
LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
hparams.n_merge = 2;
|
||||
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
|
||||
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
|
||||
std::vector<int> wa_layer_indexes_vec;
|
||||
get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
|
||||
for (auto & layer : wa_layer_indexes_vec) {
|
||||
hparams.wa_layer_indexes.insert(layer);
|
||||
}
|
||||
// support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens
|
||||
hparams.set_limit_image_tokens(1, 62500);
|
||||
hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
{
|
||||
hparams.rope_theta = 10000.0f;
|
||||
@@ -1202,7 +1176,6 @@ struct clip_model_loader {
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
{
|
||||
bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
|
||||
model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
|
||||
@@ -1252,14 +1225,7 @@ struct clip_model_loader {
|
||||
LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
|
||||
LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
|
||||
LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
|
||||
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
|
||||
if (!hparams.wa_layer_indexes.empty()) {
|
||||
LOG_INF("%s: wa_layer_indexes: ", __func__);
|
||||
for (auto & layer : hparams.wa_layer_indexes) {
|
||||
LOG_INF("%d ", layer);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
}
|
||||
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
|
||||
if (hparams.image_min_pixels > 0) {
|
||||
LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
|
||||
}
|
||||
@@ -1341,10 +1307,6 @@ struct clip_model_loader {
|
||||
|
||||
model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
|
||||
|
||||
if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
|
||||
hparams.n_layer = 0; // gemma3n does not use normal layer structure
|
||||
}
|
||||
|
||||
// layers
|
||||
model.layers.resize(hparams.n_layer);
|
||||
for (int il = 0; il < hparams.n_layer; ++il) {
|
||||
@@ -1419,7 +1381,6 @@ struct clip_model_loader {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
switch (model.proj_type) {
|
||||
case PROJECTOR_TYPE_MLP:
|
||||
case PROJECTOR_TYPE_MLP_NORM:
|
||||
@@ -1514,8 +1475,8 @@ struct clip_model_loader {
|
||||
model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
|
||||
model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
|
||||
model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
|
||||
model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI));
|
||||
model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI));
|
||||
model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
|
||||
model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
@@ -1532,14 +1493,6 @@ struct clip_model_loader {
|
||||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
|
||||
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0
|
||||
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
|
||||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2
|
||||
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
{
|
||||
model.projection = get_tensor(TN_MM_PROJECTOR);
|
||||
@@ -1559,112 +1512,11 @@ struct clip_model_loader {
|
||||
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
||||
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
|
||||
model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false);
|
||||
model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false);
|
||||
|
||||
model.msfa_ffn_expand_w = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false);
|
||||
model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded
|
||||
model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false);
|
||||
model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false);
|
||||
|
||||
model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false);
|
||||
|
||||
// Dynamically load blocks stage by stage
|
||||
for (int stage = 0; stage < 4; ++stage) {
|
||||
int blocks_found_in_stage = 0;
|
||||
|
||||
for (int blk_idx = 0; ; ++blk_idx) {
|
||||
bool found_block = false;
|
||||
mobilenetv5_block block;
|
||||
|
||||
// 1. Check for Edge Residual (S0)
|
||||
block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false);
|
||||
if (block.s0_conv_exp_w) {
|
||||
found_block = true;
|
||||
block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false);
|
||||
block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false);
|
||||
block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false);
|
||||
}
|
||||
// 2. Check for UIR (Universal Inverted Residual)
|
||||
else {
|
||||
// Check for dw_start OR pw_exp (some UIR blocks skip dw_start)
|
||||
block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false);
|
||||
block.pw_exp_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false);
|
||||
|
||||
if (block.dw_start_w || block.pw_exp_w) {
|
||||
found_block = true;
|
||||
if (block.dw_start_w) {
|
||||
block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false);
|
||||
}
|
||||
if (block.pw_exp_w) {
|
||||
block.pw_exp_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false);
|
||||
}
|
||||
block.dw_mid_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false);
|
||||
if (block.dw_mid_w) {
|
||||
block.dw_mid_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false);
|
||||
}
|
||||
block.pw_proj_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false);
|
||||
if (block.pw_proj_w) {
|
||||
block.pw_proj_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false);
|
||||
}
|
||||
block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Check for Attention (MQA)
|
||||
// Even if UIR/Edge check failed, this might be a pure attention block
|
||||
ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false);
|
||||
if (attn_q_check) {
|
||||
found_block = true;
|
||||
block.attn_q_w = attn_q_check;
|
||||
block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false);
|
||||
block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false);
|
||||
block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false);
|
||||
block.attn_k_dw_w = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false);
|
||||
block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false);
|
||||
block.attn_v_dw_w = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false);
|
||||
block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false);
|
||||
block.attn_norm_w = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false);
|
||||
// Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check
|
||||
if (!block.layer_scale_w) {
|
||||
block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
|
||||
}
|
||||
}
|
||||
|
||||
if (found_block) {
|
||||
model.mobilenet_blocks.push_back(block);
|
||||
blocks_found_in_stage++;
|
||||
} else {
|
||||
// End of blocks for this stage
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Track where this stage ends in the flat vector
|
||||
if (blocks_found_in_stage > 0) {
|
||||
model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1);
|
||||
LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1);
|
||||
}
|
||||
}
|
||||
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
||||
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
{
|
||||
model.projection = get_tensor(TN_MM_PROJECTOR);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
{
|
||||
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
|
||||
model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false);
|
||||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
|
||||
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
|
||||
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
{
|
||||
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
|
||||
@@ -1724,17 +1576,6 @@ struct clip_model_loader {
|
||||
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
|
||||
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
{
|
||||
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
|
||||
model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
|
||||
model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
|
||||
model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
|
||||
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
|
||||
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
|
||||
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
|
||||
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
{
|
||||
model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
|
||||
@@ -1756,8 +1597,8 @@ struct clip_model_loader {
|
||||
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
|
||||
model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
|
||||
model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
|
||||
model.mm_boi = get_tensor(string_format(TN_TOK_BOI));
|
||||
model.mm_eoi = get_tensor(string_format(TN_TOK_EOI));
|
||||
model.mm_boi = get_tensor(string_format(TN_TOK_BOI, "weight"));
|
||||
model.mm_eoi = get_tensor(string_format(TN_TOK_EOI, "weight"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LLAMA4:
|
||||
{
|
||||
@@ -2107,7 +1948,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
||||
|
||||
try {
|
||||
clip_model_loader loader(fname);
|
||||
bool skip_audio = false;
|
||||
|
||||
if (loader.has_vision) {
|
||||
ctx_vision = new clip_ctx(ctx_params);
|
||||
@@ -2117,14 +1957,10 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
||||
loader.warmup(*ctx_vision);
|
||||
}
|
||||
|
||||
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
|
||||
// we can remove this check when we implement audio support for Gemma 3N
|
||||
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
|
||||
|
||||
// clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
|
||||
}
|
||||
|
||||
if (loader.has_audio && !skip_audio) {
|
||||
if (loader.has_audio) {
|
||||
ctx_audio = new clip_ctx(ctx_params);
|
||||
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
|
||||
loader.load_tensors(*ctx_audio);
|
||||
@@ -2848,57 +2684,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||
// res_imgs->data[0] = *res;
|
||||
res_imgs->entries.push_back(std::move(img_f32));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
const int patch_size = params.patch_size; // typically 16
|
||||
const int merge_size = params.n_merge; // typically 2
|
||||
const int align_size = patch_size * merge_size; // 32
|
||||
|
||||
const int max_num_patches = params.image_max_pixels > 0 ?
|
||||
params.image_max_pixels / (patch_size * patch_size) : 256;
|
||||
|
||||
// Linear search for optimal scale to fit within max_num_patches
|
||||
float scale = 1.0f;
|
||||
int target_height = original_size.height;
|
||||
int target_width = original_size.width;
|
||||
|
||||
auto get_scaled_image_size = [align_size](float scale, int size) -> int {
|
||||
float scaled_size = size * scale;
|
||||
// Round up to nearest multiple of align_size
|
||||
int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
|
||||
// Ensure at least one patch
|
||||
return std::max(align_size, aligned);
|
||||
};
|
||||
|
||||
// Linear search with 0.02 step size
|
||||
while (scale > 0.0f) {
|
||||
target_height = get_scaled_image_size(scale, original_size.height);
|
||||
target_width = get_scaled_image_size(scale, original_size.width);
|
||||
|
||||
int num_patches_h = target_height / patch_size;
|
||||
int num_patches_w = target_width / patch_size;
|
||||
int num_patches = num_patches_h * num_patches_w;
|
||||
|
||||
if (num_patches > max_num_patches) {
|
||||
scale -= 0.02f;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
clip_image_size new_size = {target_width, target_height};
|
||||
|
||||
// Resize the image
|
||||
clip_image_u8 resized;
|
||||
img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
|
||||
|
||||
// Normalize to float32
|
||||
clip_image_f32_ptr img_f32(clip_image_f32_init());
|
||||
normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
|
||||
|
||||
// Add to results
|
||||
res_imgs->entries.push_back(std::move(img_f32));
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
{
|
||||
@@ -2962,16 +2747,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||
res_imgs->entries.push_back(std::move(img_f32));
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
clip_image_u8 resized_image;
|
||||
int sz = params.image_size;
|
||||
img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
|
||||
clip_image_f32_ptr img_f32(clip_image_f32_init());
|
||||
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
|
||||
res_imgs->entries.push_back(std::move(img_f32));
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_JANUS_PRO:
|
||||
{
|
||||
// Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
|
||||
@@ -3141,7 +2916,6 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return (img->nx / params.patch_size) / 2;
|
||||
default:
|
||||
break;
|
||||
@@ -3157,7 +2931,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return (img->ny / params.patch_size) / 2;
|
||||
default:
|
||||
break;
|
||||
@@ -3218,7 +2991,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
// dynamic size (2 conv, so double patch size)
|
||||
int x_patch = img->nx / (params.patch_size * 2);
|
||||
@@ -3234,12 +3006,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
int scale_factor = ctx->model.hparams.n_merge;
|
||||
n_patches /= (scale_factor * scale_factor);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
// MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution
|
||||
// regardless of input size (see architecture description)
|
||||
n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
{
|
||||
@@ -3265,7 +3031,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
{
|
||||
n_patches = img->nx;
|
||||
|
||||
@@ -3334,6 +3099,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
}
|
||||
|
||||
// build the inference graph
|
||||
ctx->debug_print_tensors.clear();
|
||||
ggml_backend_sched_reset(ctx->sched.get());
|
||||
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
|
||||
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
|
||||
@@ -3351,6 +3117,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
const int pos_w = image_size_width / patch_size;
|
||||
const int pos_h = image_size_height / patch_size;
|
||||
|
||||
const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
|
||||
|
||||
auto get_inp_tensor = [&gf](const char * name) {
|
||||
ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
|
||||
@@ -3499,11 +3266,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
set_input_i32("positions", positions);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
// pw * ph = number of tokens output by ViT after apply patch merger
|
||||
// ipw * ipw = number of vision token been processed inside ViT
|
||||
const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
|
||||
const int merge_ratio = 2;
|
||||
const int pw = image_size_width / patch_size / merge_ratio;
|
||||
const int ph = image_size_height / patch_size / merge_ratio;
|
||||
@@ -3514,7 +3279,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
std::vector<int> inv_idx(ph * pw);
|
||||
|
||||
if (use_window_attn) {
|
||||
const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
|
||||
const int attn_window_size = 112;
|
||||
const int grid_window = attn_window_size / patch_size / merge_ratio;
|
||||
int dst = 0;
|
||||
// [num_vision_tokens, num_vision_tokens] attention mask tensor
|
||||
@@ -3631,7 +3396,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
set_input_i32("patches", patches);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
@@ -3639,7 +3403,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
case PROJECTOR_TYPE_JANUS_PRO:
|
||||
case PROJECTOR_TYPE_COGVLM:
|
||||
{
|
||||
@@ -3703,6 +3466,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
return false;
|
||||
}
|
||||
|
||||
// print debug nodes
|
||||
if (ctx->debug_graph) {
|
||||
LOG_INF("\n\n---\n\n");
|
||||
LOG_INF("\n\nDebug graph:\n\n");
|
||||
for (ggml_tensor * t : ctx->debug_print_tensors) {
|
||||
std::vector<uint8_t> data(ggml_nbytes(t));
|
||||
ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
|
||||
print_tensor_shape(t);
|
||||
print_tensor_data(t, data.data(), 3);
|
||||
}
|
||||
}
|
||||
|
||||
// the last node is the embedding tensor
|
||||
ggml_tensor * embeddings = ggml_graph_node(gf, -1);
|
||||
|
||||
@@ -3741,19 +3516,16 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_JANUS_PRO:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return ctx->model.mm_1_b->ne[0];
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
// main path + deepstack paths
|
||||
return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
return ctx->model.mm_input_proj_w->ne[0];
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
return ctx->model.projection->ne[1];
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
return ctx->model.mm_3_w->ne[1];
|
||||
@@ -3778,7 +3550,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
}
|
||||
|
||||
int clip_is_minicpmv(const struct clip_ctx * ctx) {
|
||||
// TODO: remove this function
|
||||
if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
|
||||
return ctx->model.hparams.minicpmv_version;
|
||||
}
|
||||
@@ -3786,14 +3557,24 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
|
||||
}
|
||||
|
||||
bool clip_is_glm(const struct clip_ctx * ctx) {
|
||||
// TODO: remove this function
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
|
||||
}
|
||||
|
||||
bool clip_is_mrope(const struct clip_ctx * ctx) {
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
|
||||
}
|
||||
|
||||
bool clip_is_llava(const struct clip_ctx * ctx) {
|
||||
return ctx->model.hparams.has_llava_projector;
|
||||
}
|
||||
|
||||
bool clip_is_gemma3(const struct clip_ctx * ctx) {
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
|
||||
}
|
||||
|
||||
bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
|
||||
return ctx->model.modality == CLIP_MODALITY_VISION;
|
||||
}
|
||||
@@ -3803,16 +3584,10 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
|
||||
}
|
||||
|
||||
bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
|
||||
switch (ctx->proj_type()) {
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_GLMA
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
|
||||
}
|
||||
|
||||
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
|
||||
@@ -3854,6 +3629,7 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
|
||||
//
|
||||
// API for debugging
|
||||
//
|
||||
|
||||
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
|
||||
clip_image_f32 img;
|
||||
img.nx = w;
|
||||
@@ -3862,6 +3638,9 @@ void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
|
||||
for (int i = 0; i < h * w * 3; i++) {
|
||||
img.buf[i] = static_cast<float>(fill_value);
|
||||
}
|
||||
bool cur_debug_graph = ctx->debug_graph;
|
||||
ctx->debug_graph = true;
|
||||
clip_image_encode(ctx, 1, &img, nullptr);
|
||||
ctx->debug_graph = cur_debug_graph;
|
||||
GGML_ASSERT(img.buf.empty() && "expected, always stop here");
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user