sync from b7516
This commit is contained in:
@@ -27,8 +27,6 @@ add_library(mtmd
|
||||
models/qwen3vl.cpp
|
||||
models/siglip.cpp
|
||||
models/whisper-enc.cpp
|
||||
models/mobilenetv5.cpp
|
||||
models/youtuvl.cpp
|
||||
)
|
||||
|
||||
set_target_properties(mtmd PROPERTIES
|
||||
|
||||
@@ -32,6 +32,10 @@ struct clip_graph {
|
||||
const float kq_scale;
|
||||
const clip_flash_attn_type flash_attn_type;
|
||||
|
||||
// for debugging
|
||||
const bool debug_graph;
|
||||
std::vector<ggml_tensor *> & debug_print_tensors;
|
||||
|
||||
ggml_context_ptr ctx0_ptr;
|
||||
ggml_context * ctx0;
|
||||
ggml_cgraph * gf;
|
||||
|
||||
@@ -45,14 +45,13 @@
|
||||
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
|
||||
#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"
|
||||
|
||||
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
||||
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
||||
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
|
||||
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
|
||||
#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
|
||||
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
|
||||
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
|
||||
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
|
||||
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
||||
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
||||
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
|
||||
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
|
||||
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
|
||||
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
|
||||
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
|
||||
|
||||
// audio-specific
|
||||
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
|
||||
@@ -154,47 +153,6 @@
|
||||
#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s"
|
||||
#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s"
|
||||
|
||||
// mobilenetv5 (gemma3n) definitions
|
||||
#define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight"
|
||||
#define TN_MNV5_STEM_BIAS "v.conv_stem.conv.bias"
|
||||
#define TN_MNV5_STEM_BN "v.conv_stem.bn.weight"
|
||||
|
||||
// Stage 0 Block (Edge Residual)
|
||||
#define TN_MNV5_BLK_S0_EXP_W "v.blk.%d.%d.conv_exp.weight"
|
||||
#define TN_MNV5_BLK_S0_BN1_W "v.blk.%d.%d.bn1.weight"
|
||||
#define TN_MNV5_BLK_S0_PWL_W "v.blk.%d.%d.conv_pwl.weight"
|
||||
#define TN_MNV5_BLK_S0_BN2_W "v.blk.%d.%d.bn2.weight"
|
||||
|
||||
// Stage 1+ Block (Universal Inverted Residual)
|
||||
#define TN_MNV5_BLK_DW_START_W "v.blk.%d.%d.dw_start.conv.weight"
|
||||
#define TN_MNV5_BLK_DW_START_BN "v.blk.%d.%d.dw_start.bn.weight"
|
||||
#define TN_MNV5_BLK_DW_MID_W "v.blk.%d.%d.dw_mid.conv.weight"
|
||||
#define TN_MNV5_BLK_DW_MID_BN "v.blk.%d.%d.dw_mid.bn.weight"
|
||||
#define TN_MNV5_BLK_PW_EXP_W "v.blk.%d.%d.pw_exp.conv.weight"
|
||||
#define TN_MNV5_BLK_PW_EXP_BN "v.blk.%d.%d.pw_exp.bn.weight"
|
||||
#define TN_MNV5_BLK_PW_PROJ_W "v.blk.%d.%d.pw_proj.conv.weight"
|
||||
#define TN_MNV5_BLK_PW_PROJ_BN "v.blk.%d.%d.pw_proj.bn.weight"
|
||||
#define TN_MNV5_BLK_LAYER_SCALE "v.blk.%d.%d.layer_scale.gamma"
|
||||
|
||||
// Attention Components
|
||||
#define TN_MNV5_ATTN_Q_W "v.blk.%d.%d.attn.query.proj.weight"
|
||||
#define TN_MNV5_ATTN_K_W "v.blk.%d.%d.attn.key.proj.weight"
|
||||
#define TN_MNV5_ATTN_V_W "v.blk.%d.%d.attn.value.proj.weight"
|
||||
#define TN_MNV5_ATTN_O_W "v.blk.%d.%d.attn.output.proj.weight"
|
||||
#define TN_MNV5_ATTN_K_DW "v.blk.%d.%d.attn.key.down_conv.weight"
|
||||
#define TN_MNV5_ATTN_K_NORM "v.blk.%d.%d.attn.key.norm.weight"
|
||||
#define TN_MNV5_ATTN_V_DW "v.blk.%d.%d.attn.value.down_conv.weight"
|
||||
#define TN_MNV5_ATTN_V_NORM "v.blk.%d.%d.attn.value.norm.weight"
|
||||
#define TN_MNV5_ATTN_NORM "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks
|
||||
|
||||
// MSFA
|
||||
#define TN_MNV5_MSFA_FFN_EXP_W "v.msfa.ffn.pw_exp.conv.weight"
|
||||
#define TN_MNV5_MSFA_FFN_EXP_BN "v.msfa.ffn.pw_exp.bn.weight"
|
||||
#define TN_MNV5_MSFA_FFN_PROJ_W "v.msfa.ffn.pw_proj.conv.weight"
|
||||
#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
|
||||
#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight"
|
||||
|
||||
|
||||
// align x to upper multiple of n
|
||||
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
|
||||
|
||||
@@ -212,8 +170,6 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_QWEN2VL,
|
||||
PROJECTOR_TYPE_QWEN3VL,
|
||||
PROJECTOR_TYPE_GEMMA3,
|
||||
PROJECTOR_TYPE_GEMMA3NV,
|
||||
PROJECTOR_TYPE_GEMMA3NA,
|
||||
PROJECTOR_TYPE_IDEFICS3,
|
||||
PROJECTOR_TYPE_PIXTRAL,
|
||||
PROJECTOR_TYPE_QWEN25VL,
|
||||
@@ -224,7 +180,6 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_GLMA,
|
||||
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
|
||||
PROJECTOR_TYPE_VOXTRAL,
|
||||
PROJECTOR_TYPE_MUSIC_FLAMINGO,
|
||||
PROJECTOR_TYPE_LFM2,
|
||||
PROJECTOR_TYPE_KIMIVL,
|
||||
PROJECTOR_TYPE_LIGHTONOCR,
|
||||
@@ -232,7 +187,6 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_JANUS_PRO,
|
||||
PROJECTOR_TYPE_LFM2A,
|
||||
PROJECTOR_TYPE_GLM4V,
|
||||
PROJECTOR_TYPE_YOUTUVL,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -246,8 +200,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
|
||||
{ PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"},
|
||||
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
||||
{ PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
|
||||
{ PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},
|
||||
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
|
||||
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
|
||||
{ PROJECTOR_TYPE_ULTRAVOX, "ultravox"},
|
||||
@@ -257,7 +209,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_GLMA, "glma"},
|
||||
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
|
||||
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
|
||||
{ PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
|
||||
{ PROJECTOR_TYPE_LFM2, "lfm2"},
|
||||
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
|
||||
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
|
||||
@@ -265,7 +216,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
|
||||
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},
|
||||
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
|
||||
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
|
||||
};
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
|
||||
@@ -61,7 +61,6 @@ struct clip_hparams {
|
||||
std::unordered_set<int32_t> vision_feature_layer;
|
||||
int32_t attn_window_size = 0;
|
||||
int32_t n_wa_pattern = 0;
|
||||
std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
|
||||
|
||||
// audio
|
||||
int32_t n_mel_bins = 0; // whisper preprocessor
|
||||
@@ -173,45 +172,6 @@ struct clip_layer {
|
||||
}
|
||||
};
|
||||
|
||||
// Expanded MobileNetV5 block structure for Gemma3n vision encoder
|
||||
struct mobilenetv5_block {
|
||||
// Stage 0 (Edge Residual)
|
||||
ggml_tensor * s0_conv_exp_w = nullptr;
|
||||
ggml_tensor * s0_bn1_w = nullptr;
|
||||
ggml_tensor * s0_conv_pwl_w = nullptr;
|
||||
ggml_tensor * s0_bn2_w = nullptr;
|
||||
|
||||
// Stage 1+ (Universal Inverted Residual)
|
||||
ggml_tensor * dw_start_w = nullptr;
|
||||
ggml_tensor * dw_start_bn_w = nullptr;
|
||||
|
||||
ggml_tensor * pw_exp_w = nullptr;
|
||||
ggml_tensor * pw_exp_bn_w = nullptr;
|
||||
|
||||
ggml_tensor * dw_mid_w = nullptr;
|
||||
ggml_tensor * dw_mid_bn_w = nullptr;
|
||||
|
||||
ggml_tensor * pw_proj_w = nullptr;
|
||||
ggml_tensor * pw_proj_bn_w = nullptr;
|
||||
|
||||
ggml_tensor * layer_scale_w = nullptr;
|
||||
|
||||
// Attention (MQA) components
|
||||
ggml_tensor * attn_q_w = nullptr;
|
||||
ggml_tensor * attn_k_w = nullptr;
|
||||
ggml_tensor * attn_v_w = nullptr;
|
||||
ggml_tensor * attn_o_w = nullptr;
|
||||
|
||||
// Optional downsampling/norm in attention
|
||||
ggml_tensor * attn_k_dw_w = nullptr;
|
||||
ggml_tensor * attn_k_norm_w = nullptr;
|
||||
ggml_tensor * attn_v_dw_w = nullptr;
|
||||
ggml_tensor * attn_v_norm_w = nullptr;
|
||||
|
||||
// Block norm (often present in attention blocks)
|
||||
ggml_tensor * attn_norm_w = nullptr;
|
||||
};
|
||||
|
||||
struct clip_model {
|
||||
clip_modality modality = CLIP_MODALITY_VISION;
|
||||
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
||||
@@ -328,23 +288,6 @@ struct clip_model {
|
||||
ggml_tensor * mm_input_proj_w = nullptr;
|
||||
ggml_tensor * mm_soft_emb_norm_w = nullptr;
|
||||
|
||||
// mobilenetv5 for gemma3n
|
||||
std::vector<mobilenetv5_block> mobilenet_blocks;
|
||||
std::vector<int> mobilenet_stage_ends;
|
||||
ggml_tensor * mobilenet_stem_conv_w = nullptr;
|
||||
ggml_tensor * mobilenet_stem_conv_b = nullptr;
|
||||
ggml_tensor * mobilenet_stem_norm_w = nullptr;
|
||||
ggml_tensor * mm_post_proj_norm_w = nullptr;
|
||||
|
||||
// Multi-Scale Fusion Adapter (MSFA) components
|
||||
ggml_tensor * msfa_concat_conv_w = nullptr;
|
||||
ggml_tensor * msfa_concat_norm_w = nullptr;
|
||||
ggml_tensor * msfa_ffn_expand_w = nullptr;
|
||||
ggml_tensor * msfa_ffn_project_w = nullptr;
|
||||
ggml_tensor * msfa_ffn_expand_bn = nullptr;
|
||||
ggml_tensor * msfa_ffn_project_bn = nullptr;
|
||||
|
||||
|
||||
// pixtral, glm4v
|
||||
ggml_tensor * token_embd_img_break = nullptr;
|
||||
ggml_tensor * mm_patch_merger_w = nullptr;
|
||||
@@ -376,8 +319,7 @@ struct clip_model {
|
||||
|
||||
bool audio_has_avgpool() const {
|
||||
return proj_type == PROJECTOR_TYPE_QWEN2A
|
||||
|| proj_type == PROJECTOR_TYPE_VOXTRAL
|
||||
|| proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO;
|
||||
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
|
||||
}
|
||||
|
||||
bool audio_has_stack_frames() const {
|
||||
|
||||
@@ -152,14 +152,18 @@ struct clip_ctx {
|
||||
ggml_backend_t backend_cpu = nullptr;
|
||||
ggml_backend_buffer_ptr buf;
|
||||
|
||||
|
||||
int max_nodes = 8192;
|
||||
ggml_backend_sched_ptr sched;
|
||||
clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
|
||||
bool is_allocated = false;
|
||||
|
||||
// for debugging
|
||||
bool debug_graph = false;
|
||||
std::vector<ggml_tensor *> debug_print_tensors;
|
||||
|
||||
clip_ctx(clip_context_params & ctx_params) {
|
||||
flash_attn_type = ctx_params.flash_attn_type;
|
||||
debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
|
||||
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
|
||||
if (!backend_cpu) {
|
||||
throw std::runtime_error("failed to initialize CPU backend");
|
||||
@@ -200,10 +204,6 @@ struct clip_ctx {
|
||||
sched.reset(
|
||||
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
|
||||
);
|
||||
|
||||
if (ctx_params.cb_eval != nullptr) {
|
||||
ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data);
|
||||
}
|
||||
}
|
||||
|
||||
~clip_ctx() {
|
||||
@@ -239,7 +239,9 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
|
||||
n_mmproj_embd(clip_n_mmproj_embd(ctx)),
|
||||
eps(hparams.eps),
|
||||
kq_scale(1.0f / sqrtf((float)d_head)),
|
||||
flash_attn_type(ctx->flash_attn_type) {
|
||||
flash_attn_type(ctx->flash_attn_type),
|
||||
debug_graph(ctx->debug_graph),
|
||||
debug_print_tensors(ctx->debug_print_tensors) {
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ ctx->buf_compute_meta.size(),
|
||||
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
|
||||
@@ -250,11 +252,14 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
|
||||
gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
|
||||
}
|
||||
|
||||
void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
|
||||
if (il >= 0) {
|
||||
ggml_format_name(cur, "%s-%d", name, il);
|
||||
} else {
|
||||
ggml_set_name(cur, name);
|
||||
void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const {
|
||||
if (debug_graph) {
|
||||
ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0));
|
||||
std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name;
|
||||
ggml_set_name(cur, cur_name.c_str());
|
||||
ggml_set_output(cur);
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
debug_print_tensors.push_back(cur);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -783,10 +788,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
{
|
||||
builder = std::make_unique<clip_graph_siglip>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
{
|
||||
@@ -817,7 +818,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
|
||||
} break;
|
||||
@@ -845,10 +845,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
{
|
||||
builder = std::make_unique<clip_graph_glm4v>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("missing cgraph builder");
|
||||
}
|
||||
@@ -1145,14 +1141,6 @@ struct clip_model_loader {
|
||||
// test model (tinygemma3) has a different value, we optionally read it
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
// Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
|
||||
// Similar configuration to Gemma3
|
||||
hparams.n_merge = 1; // MobileNetV5 handles resizing internally
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
@@ -1170,20 +1158,6 @@ struct clip_model_loader {
|
||||
LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
hparams.n_merge = 2;
|
||||
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
|
||||
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
|
||||
std::vector<int> wa_layer_indexes_vec;
|
||||
get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
|
||||
for (auto & layer : wa_layer_indexes_vec) {
|
||||
hparams.wa_layer_indexes.insert(layer);
|
||||
}
|
||||
// support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens
|
||||
hparams.set_limit_image_tokens(1, 62500);
|
||||
hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
{
|
||||
hparams.rope_theta = 10000.0f;
|
||||
@@ -1202,7 +1176,6 @@ struct clip_model_loader {
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
{
|
||||
bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
|
||||
model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
|
||||
@@ -1252,14 +1225,7 @@ struct clip_model_loader {
|
||||
LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
|
||||
LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
|
||||
LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
|
||||
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
|
||||
if (!hparams.wa_layer_indexes.empty()) {
|
||||
LOG_INF("%s: wa_layer_indexes: ", __func__);
|
||||
for (auto & layer : hparams.wa_layer_indexes) {
|
||||
LOG_INF("%d ", layer);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
}
|
||||
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
|
||||
if (hparams.image_min_pixels > 0) {
|
||||
LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
|
||||
}
|
||||
@@ -1341,10 +1307,6 @@ struct clip_model_loader {
|
||||
|
||||
model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
|
||||
|
||||
if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
|
||||
hparams.n_layer = 0; // gemma3n does not use normal layer structure
|
||||
}
|
||||
|
||||
// layers
|
||||
model.layers.resize(hparams.n_layer);
|
||||
for (int il = 0; il < hparams.n_layer; ++il) {
|
||||
@@ -1419,7 +1381,6 @@ struct clip_model_loader {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
switch (model.proj_type) {
|
||||
case PROJECTOR_TYPE_MLP:
|
||||
case PROJECTOR_TYPE_MLP_NORM:
|
||||
@@ -1514,8 +1475,8 @@ struct clip_model_loader {
|
||||
model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
|
||||
model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
|
||||
model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
|
||||
model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI));
|
||||
model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI));
|
||||
model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
|
||||
model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
@@ -1532,14 +1493,6 @@ struct clip_model_loader {
|
||||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
|
||||
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0
|
||||
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
|
||||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2
|
||||
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
{
|
||||
model.projection = get_tensor(TN_MM_PROJECTOR);
|
||||
@@ -1559,112 +1512,11 @@ struct clip_model_loader {
|
||||
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
||||
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
|
||||
model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false);
|
||||
model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false);
|
||||
|
||||
model.msfa_ffn_expand_w = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false);
|
||||
model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded
|
||||
model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false);
|
||||
model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false);
|
||||
|
||||
model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false);
|
||||
|
||||
// Dynamically load blocks stage by stage
|
||||
for (int stage = 0; stage < 4; ++stage) {
|
||||
int blocks_found_in_stage = 0;
|
||||
|
||||
for (int blk_idx = 0; ; ++blk_idx) {
|
||||
bool found_block = false;
|
||||
mobilenetv5_block block;
|
||||
|
||||
// 1. Check for Edge Residual (S0)
|
||||
block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false);
|
||||
if (block.s0_conv_exp_w) {
|
||||
found_block = true;
|
||||
block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false);
|
||||
block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false);
|
||||
block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false);
|
||||
}
|
||||
// 2. Check for UIR (Universal Inverted Residual)
|
||||
else {
|
||||
// Check for dw_start OR pw_exp (some UIR blocks skip dw_start)
|
||||
block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false);
|
||||
block.pw_exp_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false);
|
||||
|
||||
if (block.dw_start_w || block.pw_exp_w) {
|
||||
found_block = true;
|
||||
if (block.dw_start_w) {
|
||||
block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false);
|
||||
}
|
||||
if (block.pw_exp_w) {
|
||||
block.pw_exp_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false);
|
||||
}
|
||||
block.dw_mid_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false);
|
||||
if (block.dw_mid_w) {
|
||||
block.dw_mid_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false);
|
||||
}
|
||||
block.pw_proj_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false);
|
||||
if (block.pw_proj_w) {
|
||||
block.pw_proj_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false);
|
||||
}
|
||||
block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Check for Attention (MQA)
|
||||
// Even if UIR/Edge check failed, this might be a pure attention block
|
||||
ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false);
|
||||
if (attn_q_check) {
|
||||
found_block = true;
|
||||
block.attn_q_w = attn_q_check;
|
||||
block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false);
|
||||
block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false);
|
||||
block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false);
|
||||
block.attn_k_dw_w = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false);
|
||||
block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false);
|
||||
block.attn_v_dw_w = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false);
|
||||
block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false);
|
||||
block.attn_norm_w = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false);
|
||||
// Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check
|
||||
if (!block.layer_scale_w) {
|
||||
block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
|
||||
}
|
||||
}
|
||||
|
||||
if (found_block) {
|
||||
model.mobilenet_blocks.push_back(block);
|
||||
blocks_found_in_stage++;
|
||||
} else {
|
||||
// End of blocks for this stage
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Track where this stage ends in the flat vector
|
||||
if (blocks_found_in_stage > 0) {
|
||||
model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1);
|
||||
LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1);
|
||||
}
|
||||
}
|
||||
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
||||
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
{
|
||||
model.projection = get_tensor(TN_MM_PROJECTOR);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
{
|
||||
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
|
||||
model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false);
|
||||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
|
||||
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
|
||||
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
{
|
||||
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
|
||||
@@ -1724,17 +1576,6 @@ struct clip_model_loader {
|
||||
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
|
||||
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
{
|
||||
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
|
||||
model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
|
||||
model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
|
||||
model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
|
||||
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
|
||||
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
|
||||
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
|
||||
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
{
|
||||
model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
|
||||
@@ -1756,8 +1597,8 @@ struct clip_model_loader {
|
||||
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
|
||||
model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
|
||||
model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
|
||||
model.mm_boi = get_tensor(string_format(TN_TOK_BOI));
|
||||
model.mm_eoi = get_tensor(string_format(TN_TOK_EOI));
|
||||
model.mm_boi = get_tensor(string_format(TN_TOK_BOI, "weight"));
|
||||
model.mm_eoi = get_tensor(string_format(TN_TOK_EOI, "weight"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LLAMA4:
|
||||
{
|
||||
@@ -2107,7 +1948,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
||||
|
||||
try {
|
||||
clip_model_loader loader(fname);
|
||||
bool skip_audio = false;
|
||||
|
||||
if (loader.has_vision) {
|
||||
ctx_vision = new clip_ctx(ctx_params);
|
||||
@@ -2117,14 +1957,10 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
||||
loader.warmup(*ctx_vision);
|
||||
}
|
||||
|
||||
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
|
||||
// we can remove this check when we implement audio support for Gemma 3N
|
||||
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
|
||||
|
||||
// clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
|
||||
}
|
||||
|
||||
if (loader.has_audio && !skip_audio) {
|
||||
if (loader.has_audio) {
|
||||
ctx_audio = new clip_ctx(ctx_params);
|
||||
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
|
||||
loader.load_tensors(*ctx_audio);
|
||||
@@ -2848,57 +2684,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||
// res_imgs->data[0] = *res;
|
||||
res_imgs->entries.push_back(std::move(img_f32));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
const int patch_size = params.patch_size; // typically 16
|
||||
const int merge_size = params.n_merge; // typically 2
|
||||
const int align_size = patch_size * merge_size; // 32
|
||||
|
||||
const int max_num_patches = params.image_max_pixels > 0 ?
|
||||
params.image_max_pixels / (patch_size * patch_size) : 256;
|
||||
|
||||
// Linear search for optimal scale to fit within max_num_patches
|
||||
float scale = 1.0f;
|
||||
int target_height = original_size.height;
|
||||
int target_width = original_size.width;
|
||||
|
||||
auto get_scaled_image_size = [align_size](float scale, int size) -> int {
|
||||
float scaled_size = size * scale;
|
||||
// Round up to nearest multiple of align_size
|
||||
int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
|
||||
// Ensure at least one patch
|
||||
return std::max(align_size, aligned);
|
||||
};
|
||||
|
||||
// Linear search with 0.02 step size
|
||||
while (scale > 0.0f) {
|
||||
target_height = get_scaled_image_size(scale, original_size.height);
|
||||
target_width = get_scaled_image_size(scale, original_size.width);
|
||||
|
||||
int num_patches_h = target_height / patch_size;
|
||||
int num_patches_w = target_width / patch_size;
|
||||
int num_patches = num_patches_h * num_patches_w;
|
||||
|
||||
if (num_patches > max_num_patches) {
|
||||
scale -= 0.02f;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
clip_image_size new_size = {target_width, target_height};
|
||||
|
||||
// Resize the image
|
||||
clip_image_u8 resized;
|
||||
img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
|
||||
|
||||
// Normalize to float32
|
||||
clip_image_f32_ptr img_f32(clip_image_f32_init());
|
||||
normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
|
||||
|
||||
// Add to results
|
||||
res_imgs->entries.push_back(std::move(img_f32));
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
{
|
||||
@@ -2962,16 +2747,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||
res_imgs->entries.push_back(std::move(img_f32));
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
clip_image_u8 resized_image;
|
||||
int sz = params.image_size;
|
||||
img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
|
||||
clip_image_f32_ptr img_f32(clip_image_f32_init());
|
||||
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
|
||||
res_imgs->entries.push_back(std::move(img_f32));
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_JANUS_PRO:
|
||||
{
|
||||
// Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
|
||||
@@ -3141,7 +2916,6 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return (img->nx / params.patch_size) / 2;
|
||||
default:
|
||||
break;
|
||||
@@ -3157,7 +2931,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return (img->ny / params.patch_size) / 2;
|
||||
default:
|
||||
break;
|
||||
@@ -3218,7 +2991,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
// dynamic size (2 conv, so double patch size)
|
||||
int x_patch = img->nx / (params.patch_size * 2);
|
||||
@@ -3234,12 +3006,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
int scale_factor = ctx->model.hparams.n_merge;
|
||||
n_patches /= (scale_factor * scale_factor);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
// MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution
|
||||
// regardless of input size (see architecture description)
|
||||
n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
{
|
||||
@@ -3265,7 +3031,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
{
|
||||
n_patches = img->nx;
|
||||
|
||||
@@ -3334,6 +3099,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
}
|
||||
|
||||
// build the inference graph
|
||||
ctx->debug_print_tensors.clear();
|
||||
ggml_backend_sched_reset(ctx->sched.get());
|
||||
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
|
||||
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
|
||||
@@ -3351,6 +3117,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
const int pos_w = image_size_width / patch_size;
|
||||
const int pos_h = image_size_height / patch_size;
|
||||
|
||||
const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
|
||||
|
||||
auto get_inp_tensor = [&gf](const char * name) {
|
||||
ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
|
||||
@@ -3499,11 +3266,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
set_input_i32("positions", positions);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
// pw * ph = number of tokens output by ViT after apply patch merger
|
||||
// ipw * ipw = number of vision token been processed inside ViT
|
||||
const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
|
||||
const int merge_ratio = 2;
|
||||
const int pw = image_size_width / patch_size / merge_ratio;
|
||||
const int ph = image_size_height / patch_size / merge_ratio;
|
||||
@@ -3514,7 +3279,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
std::vector<int> inv_idx(ph * pw);
|
||||
|
||||
if (use_window_attn) {
|
||||
const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
|
||||
const int attn_window_size = 112;
|
||||
const int grid_window = attn_window_size / patch_size / merge_ratio;
|
||||
int dst = 0;
|
||||
// [num_vision_tokens, num_vision_tokens] attention mask tensor
|
||||
@@ -3631,7 +3396,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
set_input_i32("patches", patches);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
@@ -3639,7 +3403,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
case PROJECTOR_TYPE_JANUS_PRO:
|
||||
case PROJECTOR_TYPE_COGVLM:
|
||||
{
|
||||
@@ -3703,6 +3466,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
return false;
|
||||
}
|
||||
|
||||
// print debug nodes
|
||||
if (ctx->debug_graph) {
|
||||
LOG_INF("\n\n---\n\n");
|
||||
LOG_INF("\n\nDebug graph:\n\n");
|
||||
for (ggml_tensor * t : ctx->debug_print_tensors) {
|
||||
std::vector<uint8_t> data(ggml_nbytes(t));
|
||||
ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
|
||||
print_tensor_shape(t);
|
||||
print_tensor_data(t, data.data(), 3);
|
||||
}
|
||||
}
|
||||
|
||||
// the last node is the embedding tensor
|
||||
ggml_tensor * embeddings = ggml_graph_node(gf, -1);
|
||||
|
||||
@@ -3741,19 +3516,16 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_JANUS_PRO:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return ctx->model.mm_1_b->ne[0];
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
// main path + deepstack paths
|
||||
return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
return ctx->model.mm_input_proj_w->ne[0];
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
return ctx->model.projection->ne[1];
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
return ctx->model.mm_3_w->ne[1];
|
||||
@@ -3778,7 +3550,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
}
|
||||
|
||||
int clip_is_minicpmv(const struct clip_ctx * ctx) {
|
||||
// TODO: remove this function
|
||||
if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
|
||||
return ctx->model.hparams.minicpmv_version;
|
||||
}
|
||||
@@ -3786,14 +3557,24 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
|
||||
}
|
||||
|
||||
bool clip_is_glm(const struct clip_ctx * ctx) {
|
||||
// TODO: remove this function
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
|
||||
}
|
||||
|
||||
bool clip_is_mrope(const struct clip_ctx * ctx) {
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
|
||||
}
|
||||
|
||||
bool clip_is_llava(const struct clip_ctx * ctx) {
|
||||
return ctx->model.hparams.has_llava_projector;
|
||||
}
|
||||
|
||||
bool clip_is_gemma3(const struct clip_ctx * ctx) {
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
|
||||
}
|
||||
|
||||
bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
|
||||
return ctx->model.modality == CLIP_MODALITY_VISION;
|
||||
}
|
||||
@@ -3803,16 +3584,10 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
|
||||
}
|
||||
|
||||
bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
|
||||
switch (ctx->proj_type()) {
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_GLMA
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
|
||||
}
|
||||
|
||||
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
|
||||
@@ -3854,6 +3629,7 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
|
||||
//
|
||||
// API for debugging
|
||||
//
|
||||
|
||||
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
|
||||
clip_image_f32 img;
|
||||
img.nx = w;
|
||||
@@ -3862,6 +3638,9 @@ void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
|
||||
for (int i = 0; i < h * w * 3; i++) {
|
||||
img.buf[i] = static_cast<float>(fill_value);
|
||||
}
|
||||
bool cur_debug_graph = ctx->debug_graph;
|
||||
ctx->debug_graph = true;
|
||||
clip_image_encode(ctx, 1, &img, nullptr);
|
||||
ctx->debug_graph = cur_debug_graph;
|
||||
GGML_ASSERT(img.buf.empty() && "expected, always stop here");
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "mtmd.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
@@ -38,8 +37,6 @@ struct clip_context_params {
|
||||
int image_min_tokens;
|
||||
int image_max_tokens;
|
||||
bool warmup;
|
||||
ggml_backend_sched_eval_callback cb_eval;
|
||||
void * cb_eval_user_data;
|
||||
};
|
||||
|
||||
struct clip_init_result {
|
||||
@@ -107,9 +104,9 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct
|
||||
|
||||
int clip_is_minicpmv(const struct clip_ctx * ctx);
|
||||
bool clip_is_glm(const struct clip_ctx * ctx);
|
||||
bool clip_is_mrope(const struct clip_ctx * ctx);
|
||||
bool clip_is_llava(const struct clip_ctx * ctx);
|
||||
// note for contributor: this clip_is_(model) pattern is deprecated
|
||||
// do NOT add new functions like this
|
||||
bool clip_is_gemma3(const struct clip_ctx * ctx);
|
||||
|
||||
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
|
||||
|
||||
|
||||
@@ -1,451 +0,0 @@
|
||||
#include "models.h"
|
||||
|
||||
// Helpers for MobileNetV5 Blocks
|
||||
// RMS Norm 2D - normalizes over channels for each spatial position
|
||||
// RMS-normalizes a feature map and optionally scales it by `weight`.
//
// inp    : feature map, laid out as [W, H, C, B] per the original comment
// weight : optional scale tensor; skipped when null
// eps    : epsilon forwarded to ggml_rms_norm
//
// Axes 0 and 2 are swapped (and the result made contiguous) before the norm,
// and swapped back afterwards, so the returned tensor has the input's layout.
ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) {
    // bring axis 2 to the front so the norm runs along it
    ggml_tensor * normed = ggml_rms_norm(
        ctx0,
        ggml_cont(ctx0, ggml_permute(ctx0, inp, 2, 1, 0, 3)),
        eps);

    if (weight != nullptr) {
        normed = ggml_mul(ctx0, normed, weight);
    }

    // undo the axis swap
    return ggml_cont(ctx0, ggml_permute(ctx0, normed, 2, 1, 0, 3));
}
|
||||
|
||||
// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF
|
||||
// Applies asymmetric "SAME" padding ahead of a strided convolution.
//
// For each spatial axis the output size is ceil(in / stride) and the total
// padding is max(0, (out - 1) * stride + (kernel - 1) * dilation + 1 - in).
// The floor half goes to the left/top side, the remainder to the right/bottom.
// Width is taken from ne[0] and height from ne[1]; dims 2 and 3 are never padded.
// Returns `inp` unchanged when neither axis needs padding.
ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) {
    // total padding required along a single axis
    const auto total_pad = [](int64_t in_size, int kernel, int stride, int dilation) -> int64_t {
        const int64_t out_size = (in_size + stride - 1) / stride; // ceil division
        const int64_t needed   = (out_size - 1) * stride + (kernel - 1) * dilation + 1 - in_size;
        return needed > 0 ? needed : (int64_t) 0;
    };

    const int64_t total_h = total_pad(inp->ne[1], kernel_h, stride_h, dilation_h);
    const int64_t total_w = total_pad(inp->ne[0], kernel_w, stride_w, dilation_w);

    if (total_h <= 0 && total_w <= 0) {
        return inp;
    }

    const int top    = static_cast<int>(total_h / 2);
    const int bottom = static_cast<int>(total_h - top);
    const int left   = static_cast<int>(total_w / 2);
    const int right  = static_cast<int>(total_w - left);

    // ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3)
    return ggml_pad_ext(ctx0, inp,
        left, right,   // dim 0: width
        top,  bottom,  // dim 1: height
        0, 0,          // dim 2: untouched
        0, 0);         // dim 3: untouched
}
|
||||
|
||||
|
||||
// Edge Residual Block (Stage 0)
|
||||
// Edge Residual block (stage 0): 3x3 expansion conv -> norm -> GELU ->
// 1x1 pointwise-linear conv -> norm, with an optional skip connection.
//
// inp    : input feature map ([W, H, C, B] per the surrounding comments)
// block  : weights; s0_bn1_w / s0_bn2_w are optional and skipped when null
// stride : 2 selects the downsampling path (SAME padding + unpadded conv);
//          any other value uses a symmetric padding of 1
//
// The skip connection is added only for stride 1 when width, height and
// channel count of input and output all agree.
ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
    ggml_tensor * cur = inp;

    // 1. Expansion conv (3x3)
    if (stride == 2) {
        // downsampling: replicate Conv2dSame(kernel=3, stride=2)
        cur = pad_same_2d(cur, 3, 3, stride, stride);
        cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1);
    } else {
        // regular block: replicate Conv2d(kernel=3, stride=1, padding=1)
        cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1);
    }

    // norm + activation
    if (block.s0_bn1_w) {
        cur = rms_norm_2d(cur, block.s0_bn1_w);
    }
    cur = ggml_gelu(ctx0, cur);

    // 2. Pointwise linear conv (1x1, stride 1, no padding)
    cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1);
    if (block.s0_bn2_w) {
        cur = rms_norm_2d(cur, block.s0_bn2_w);
    }

    // 3. Residual connection.
    // Height is checked as well as width and channels, mirroring
    // build_inverted_residual, so a mismatched map is never handed to ggml_add.
    const bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]);
    const bool same_channel = (inp->ne[2] == cur->ne[2]);
    if (stride == 1 && same_spatial && same_channel) {
        cur = ggml_add(ctx0, cur, inp);
    }

    return cur;
}
|
||||
|
||||
// Universal Inverted Residual Block (Stage 1+)
|
||||
// Universal Inverted Residual block (stage 1+).
//
// Every sub-stage is optional and runs only when its weight is present:
//   depthwise start -> 1x1 expansion (+GELU) -> depthwise mid (+GELU)
//   -> 1x1 projection -> layer scale -> skip connection.
// `stride` is applied only in the depthwise-mid conv; the depthwise-start conv
// always runs with stride 1. The skip connection is added when input and
// output agree in dims 0, 1 and 2.
ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
    ggml_tensor * out = inp;

    // 1. depthwise start: stride 1, symmetric padding of kernel/2
    if (block.dw_start_w != nullptr) {
        const int ksize = block.dw_start_w->ne[0]; // 3 or 5
        const int half  = ksize / 2;
        out = ggml_conv_2d_dw(ctx0, block.dw_start_w, out, 1, 1, half, half, 1, 1);
        if (block.dw_start_bn_w != nullptr) {
            out = rms_norm_2d(out, block.dw_start_bn_w);
        }
    }

    // 2. pointwise expansion: 1x1 conv, stride 1, no padding
    if (block.pw_exp_w != nullptr) {
        out = ggml_conv_2d_direct(ctx0, block.pw_exp_w, out, 1, 1, 0, 0, 1, 1);
        if (block.pw_exp_bn_w != nullptr) {
            out = rms_norm_2d(out, block.pw_exp_bn_w);
        }
        out = ggml_gelu(ctx0, out);
    }

    // 3. depthwise mid: the only place where `stride` is used
    if (block.dw_mid_w != nullptr) {
        const int  ksize      = block.dw_mid_w->ne[0]; // 3 or 5
        const bool downsample = stride > 1;

        if (downsample) {
            // asymmetric SAME padding is applied explicitly, so the conv itself is unpadded
            out = pad_same_2d(out, ksize, ksize, stride, stride);
        }
        const int conv_pad = downsample ? 0 : ksize / 2;
        out = ggml_conv_2d_dw(ctx0, block.dw_mid_w, out, stride, stride, conv_pad, conv_pad, 1, 1);

        if (block.dw_mid_bn_w != nullptr) {
            out = rms_norm_2d(out, block.dw_mid_bn_w);
        }
        out = ggml_gelu(ctx0, out);
    }

    // 4. pointwise projection: 1x1 conv, no activation
    if (block.pw_proj_w != nullptr) {
        out = ggml_conv_2d_direct(ctx0, block.pw_proj_w, out, 1, 1, 0, 0, 1, 1);
        if (block.pw_proj_bn_w != nullptr) {
            out = rms_norm_2d(out, block.pw_proj_bn_w);
        }
    }

    // optional layer scale
    if (block.layer_scale_w != nullptr) {
        out = ggml_mul(ctx0, out, block.layer_scale_w);
    }

    // 5. skip connection, only when the shapes line up
    const bool shapes_agree =
        inp->ne[0] == out->ne[0] &&
        inp->ne[1] == out->ne[1] &&
        inp->ne[2] == out->ne[2];
    if (shapes_agree) {
        out = ggml_add(ctx0, out, inp);
    }

    return out;
}
|
||||
|
||||
// Attention Block (MQA)
|
||||
// Multi-query attention block operating on a 2D feature map.
//
// Q comes from a 1x1 conv on the (optionally normalized) input. K and V each
// go through an optional stride-2 depthwise conv (+ optional norm) and then a
// 1x1 conv. K and V carry a single head of size D = k->ne[2]; the number of
// query heads is q->ne[2] / D. The attention result is reshaped back to the
// input's spatial size, projected with attn_o_w and, when dims 0 and 2 of input
// and output agree, optionally layer-scaled and added to the block input.
ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) {
    ggml_tensor * x = inp;

    // pre-norm
    if (block.attn_norm_w != nullptr) {
        x = rms_norm_2d(x, block.attn_norm_w, 1e-6f);
    }

    // 1. queries
    ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, x, 1, 1, 0, 0, 1, 1);

    // Shared K/V pre-processing: SAME-padded stride-2 depthwise conv followed by
    // an optional norm. Falls back to the normalized input when no depthwise
    // weight is present.
    const auto reduce_kv = [this, x](ggml_tensor * dw_w, ggml_tensor * norm_w) -> ggml_tensor * {
        if (dw_w == nullptr) {
            return x;
        }
        const int ksize = dw_w->ne[0]; // usually 3
        ggml_tensor * t = pad_same_2d(x, ksize, ksize, 2, 2);
        t = ggml_conv_2d_dw(ctx0, dw_w, t, 2, 2, 0, 0, 1, 1); // padding already applied
        if (norm_w != nullptr) {
            t = rms_norm_2d(t, norm_w, 1e-6f);
        }
        return t;
    };

    // 2. keys
    ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w,
        reduce_kv(block.attn_k_dw_w, block.attn_k_norm_w), 1, 1, 0, 0, 1, 1);

    // 3. values
    ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w,
        reduce_kv(block.attn_v_dw_w, block.attn_v_norm_w), 1, 1, 0, 0, 1, 1);

    const int width    = x->ne[0];
    const int height   = x->ne[1];
    const int batch    = x->ne[3];
    const int head_dim = k->ne[2];
    const int n_head   = q->ne[2] / head_dim;
    const int n_query  = width * height;

    // Q: [W, H, D*n_head, B] -> [D, N, n_head, B]
    q = ggml_reshape_3d(ctx0, q, n_query, head_dim * n_head, batch);
    q = ggml_reshape_4d(ctx0, q, n_query, head_dim, n_head, batch);
    q = ggml_cont(ctx0, ggml_permute(ctx0, q, 1, 0, 2, 3));

    const int kv_width  = k->ne[0];
    const int kv_height = k->ne[1];
    const int n_key     = kv_width * kv_height;

    // K: [Wk, Hk, D, B] -> [D, M, 1, B]
    k = ggml_reshape_3d(ctx0, k, n_key, head_dim, batch);
    k = ggml_reshape_4d(ctx0, k, n_key, head_dim, 1, batch);
    k = ggml_cont(ctx0, ggml_permute(ctx0, k, 1, 0, 2, 3));

    // V: [Wk, Hk, D, B] -> [M, D, 1, B]
    v = ggml_reshape_3d(ctx0, v, n_key, head_dim, batch);
    v = ggml_reshape_4d(ctx0, v, n_key, head_dim, 1, batch);
    v = ggml_cont(ctx0, v);

    // softmax(Q K^T / sqrt(D)) V
    const float kq_factor = 1.0f / sqrtf((float) head_dim);

    ggml_tensor * attn = ggml_mul_mat(ctx0, k, q);
    attn = ggml_scale(ctx0, attn, kq_factor);
    attn = ggml_soft_max(ctx0, attn);

    ggml_tensor * mixed = ggml_mul_mat(ctx0, v, attn);
    mixed = ggml_cont(ctx0, ggml_permute(ctx0, mixed, 1, 0, 2, 3));

    // back to a 2D feature map
    mixed = ggml_reshape_3d(ctx0, mixed, n_query, head_dim * n_head, batch);
    mixed = ggml_reshape_4d(ctx0, mixed, width, height, head_dim * n_head, batch);
    mixed = ggml_cont(ctx0, mixed);

    // output projection
    ggml_tensor * out = ggml_conv_2d_direct(ctx0, block.attn_o_w, mixed, 1, 1, 0, 0, 1, 1);

    // layer scale + residual, only when dims 0 and 2 agree with the block input
    const bool shapes_agree = inp->ne[0] == out->ne[0] && inp->ne[2] == out->ne[2];
    if (!shapes_agree) {
        return out;
    }
    if (block.layer_scale_w != nullptr) {
        out = ggml_mul(ctx0, out, block.layer_scale_w);
    }
    return ggml_add(ctx0, out, inp);
}
|
||||
|
||||
// Assembles the MobileNetV5 vision graph:
//   1. stem conv (SAME-padded 3x3, stride 2) + optional bias/norm + GELU
//   2. the block stack (edge-residual / attention / inverted-residual, chosen
//      per block by which weights are present), collecting selected outputs
//   3. multi-scale fusion (MSFA): nearest upscale to the first collected map,
//      channel concat, 1x1 FFN, average pool down to 16 wide, final norm
//   4. projection: flatten to tokens, scale by sqrt(C), RMS norm (+ weight),
//      linear projection, RMS norm (+ weight)
// The last tensor is expanded into `gf`, which is returned.
ggml_cgraph * clip_graph_mobilenetv5::build() {
    ggml_tensor * raw = build_inp_raw();

    // 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2))
    ggml_tensor * cur = pad_same_2d(raw, 3, 3, 2, 2);
    cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding already applied
    if (model.mobilenet_stem_conv_b != nullptr) {
        cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b);
    }
    if (model.mobilenet_stem_norm_w != nullptr) {
        cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w);
    }
    cur = ggml_gelu(ctx0, cur);

    // 2. Blocks
    const int    n_blocks   = model.mobilenet_blocks.size();
    const auto & stage_ends = model.mobilenet_stage_ends;

    // block 0 and every block right after a stage end open a new stage
    const auto starts_stage = [&stage_ends](int idx) -> bool {
        if (idx == 0) {
            return true;
        }
        for (int last : stage_ends) {
            if (idx == last + 1) {
                return true;
            }
        }
        return false;
    };

    // With four or more stage ends, the third and fourth are tapped;
    // otherwise only the very last block is.
    const auto feeds_fusion = [&stage_ends, n_blocks](int idx) -> bool {
        if (stage_ends.size() >= 4) {
            return idx == stage_ends[2] || idx == stage_ends[3];
        }
        return idx == n_blocks - 1;
    };

    std::vector<ggml_tensor*> taps;
    for (int idx = 0; idx < n_blocks; ++idx) {
        const auto & blk    = model.mobilenet_blocks[idx];
        const int    stride = starts_stage(idx) ? 2 : 1; // downsample on the first block of a stage

        if (blk.s0_conv_exp_w) {
            cur = build_edge_residual(cur, blk, stride);
        } else if (blk.attn_q_w) {
            cur = build_mobilenet_attn(cur, blk);
        } else {
            cur = build_inverted_residual(cur, blk, stride);
        }

        if (feeds_fusion(idx)) {
            taps.push_back(cur);
        }
    }

    // 3. Multi-Scale Fusion Adapter (MSFA)
    if (!taps.empty()) {
        // A. the first tapped map defines the reference resolution
        const int high_res_w = taps[0]->ne[0];
        const int high_res_h = taps[0]->ne[1];

        // B. bring every tapped map up to that resolution
        std::vector<ggml_tensor*> aligned;
        for (ggml_tensor * feat : taps) {
            const int feat_w = feat->ne[0];
            const int feat_h = feat->ne[1];

            const bool is_smaller = feat_w < high_res_w || feat_h < high_res_h;
            if (is_smaller) {
                // integer factor derived from the width ratio; the assert rejects fractional ratios
                const int factor = high_res_w / feat_w;
                GGML_ASSERT(high_res_w % feat_w == 0);
                feat = ggml_upscale(ctx0, feat, factor, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);
            }
            aligned.push_back(feat);
        }

        // C. concatenate along the channel dim (dim 2)
        cur = aligned[0];
        for (size_t j = 1; j < aligned.size(); ++j) {
            cur = ggml_concat(ctx0, cur, aligned[j], 2);
        }

        // D. FFN: 1x1 expand -> norm -> GELU -> 1x1 project -> norm
        if (model.msfa_ffn_expand_w != nullptr) {
            cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1);
            if (model.msfa_ffn_expand_bn != nullptr) {
                cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn);
            }
            cur = ggml_gelu(ctx0, cur);
        }
        if (model.msfa_ffn_project_w != nullptr) {
            cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1);
            if (model.msfa_ffn_project_bn != nullptr) {
                cur = rms_norm_2d(cur, model.msfa_ffn_project_bn);
            }
        }

        // E. average-pool down to the fixed output resolution
        const int target_out_res = 16;
        const int current_w      = cur->ne[0];
        if (current_w > target_out_res) {
            const int s = current_w / target_out_res;
            GGML_ASSERT(current_w % target_out_res == 0);
            // kernel = stride = s, no padding
            cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0);
        }

        // F. final norm
        if (model.msfa_concat_norm_w != nullptr) {
            cur = rms_norm_2d(cur, model.msfa_concat_norm_w);
        }
    }

    // 4. Gemma 3n multimodal projection (embedder)
    // `cur` is [Width, Height, Channels, Batch] here, per the original comment
    const int W = cur->ne[0];
    const int H = cur->ne[1];
    const int C = cur->ne[2];
    const int B = cur->ne[3];

    GGML_ASSERT(C == hparams.n_embd);

    // flatten to [Channels, Tokens, Batch]
    cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B]
    cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B]
    cur = ggml_cont(ctx0, cur);
    cur = ggml_reshape_3d(ctx0, cur, C, W*H, B);
    cur = ggml_cont(ctx0, cur);

    // feature scaling by sqrt(hidden size)
    // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5
    cur = ggml_scale(ctx0, cur, sqrtf((float) C));

    // soft embedding norm: always normalize, multiply only when a weight is loaded
    cur = ggml_rms_norm(ctx0, cur, 1e-6f);
    if (model.mm_soft_emb_norm_w != nullptr) {
        cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
    }

    // linear projection into the text embedding space (no bias)
    if (model.mm_input_proj_w != nullptr) {
        cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
    }

    // post-projection norm; the weight is optional
    cur = ggml_rms_norm(ctx0, cur, 1e-6f);
    if (model.mm_post_proj_norm_w != nullptr) {
        cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w);
    }

    ggml_build_forward_expand(gf, cur);
    return gf;
}
|
||||
@@ -2,11 +2,6 @@
|
||||
|
||||
#include "../clip-graph.h"
|
||||
|
||||
/*
|
||||
* IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
|
||||
* We encourage human contributors to ensure the quality and reliability of the codebase.
|
||||
*/
|
||||
|
||||
struct clip_graph_siglip : clip_graph {
|
||||
clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
@@ -27,11 +22,6 @@ struct clip_graph_qwen3vl : clip_graph {
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_youtuvl : clip_graph {
|
||||
clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_minicpmv : clip_graph {
|
||||
clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
@@ -76,36 +66,3 @@ struct clip_graph_glm4v : clip_graph {
|
||||
clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_mobilenetv5 : clip_graph {
|
||||
clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
||||
ggml_tensor * rms_norm_2d(
|
||||
ggml_tensor * inp,
|
||||
ggml_tensor * weight,
|
||||
float eps = 1e-6f);
|
||||
|
||||
ggml_tensor* pad_same_2d(
|
||||
ggml_tensor* inp,
|
||||
int kernel_h,
|
||||
int kernel_w,
|
||||
int stride_h,
|
||||
int stride_w,
|
||||
int dilation_h = 1,
|
||||
int dilation_w = 1);
|
||||
|
||||
ggml_tensor * build_edge_residual(
|
||||
ggml_tensor * inp,
|
||||
const mobilenetv5_block & block,
|
||||
int stride);
|
||||
|
||||
ggml_tensor * build_inverted_residual(
|
||||
ggml_tensor * inp,
|
||||
const mobilenetv5_block & block,
|
||||
int stride);
|
||||
|
||||
ggml_tensor * build_mobilenet_attn(
|
||||
ggml_tensor * inp,
|
||||
const mobilenetv5_block & block);
|
||||
};
|
||||
|
||||
@@ -50,15 +50,10 @@ ggml_cgraph * clip_graph_siglip::build() {
|
||||
const int scale_factor = model.hparams.n_merge;
|
||||
cur = build_patch_merge_permute(cur, scale_factor);
|
||||
|
||||
// projection, in LFM2-VL input norm is optional
|
||||
if (model.mm_input_norm_w) {
|
||||
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
|
||||
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
|
||||
}
|
||||
|
||||
if (model.mm_input_norm_b) {
|
||||
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
|
||||
}
|
||||
// projection
|
||||
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
|
||||
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
|
||||
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
|
||||
@@ -86,15 +86,6 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
|
||||
FFN_GELU_ERF,
|
||||
-1);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
|
||||
// projector
|
||||
cur = build_ffn(cur,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_2_w, model.mm_2_b,
|
||||
FFN_GELU_ERF,
|
||||
-1);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_GLMA) {
|
||||
cur = ggml_norm(ctx0, cur, hparams.eps);
|
||||
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
|
||||
|
||||
@@ -1,179 +0,0 @@
|
||||
#include "models.h"
|
||||
|
||||
ggml_cgraph * clip_graph_youtuvl::build() {
|
||||
GGML_ASSERT(model.class_embedding == nullptr);
|
||||
const int batch_size = 1;
|
||||
const bool use_window_attn = !hparams.wa_layer_indexes.empty();
|
||||
const int n_pos = n_patches;
|
||||
const int num_position_ids = n_pos * 4;
|
||||
const int m = 2;
|
||||
const int Wp = n_patches_x;
|
||||
const int Hp = n_patches_y;
|
||||
const int Hm = Hp / m;
|
||||
const int Wm = Wp / m;
|
||||
norm_type norm_t = NORM_TYPE_NORMAL;
|
||||
|
||||
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
|
||||
|
||||
ggml_tensor * inp = build_inp_raw();
|
||||
|
||||
// change conv3d to linear
|
||||
// reshape and permute to get patches, permute from (patch_size, m, Wm, patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm)
|
||||
{
|
||||
inp = ggml_reshape_4d(
|
||||
ctx0, inp,
|
||||
Wm * m * patch_size, m * patch_size, Hm, 3);
|
||||
inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
|
||||
inp = ggml_cont_4d(
|
||||
ctx0, inp,
|
||||
m * patch_size * 3, Wm, m * patch_size, Hm);
|
||||
|
||||
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
|
||||
inp = ggml_cont_4d(
|
||||
ctx0, inp,
|
||||
m * patch_size * 3, patch_size, m, Hm * Wm);
|
||||
|
||||
inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
|
||||
inp = ggml_cont_4d(
|
||||
ctx0, inp,
|
||||
patch_size, 3, patch_size, Hm * Wm * m * m);
|
||||
|
||||
inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
|
||||
inp = ggml_cont_3d(
|
||||
ctx0, inp,
|
||||
3*patch_size* patch_size, Hm * Wm * m * m, 1);
|
||||
}
|
||||
inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
|
||||
|
||||
if (model.patch_bias) {
|
||||
inp = ggml_add(ctx0, inp, model.patch_bias);
|
||||
}
|
||||
|
||||
inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
|
||||
|
||||
ggml_tensor * inpL = inp;
|
||||
ggml_tensor * window_mask = nullptr;
|
||||
ggml_tensor * window_idx = nullptr;
|
||||
ggml_tensor * inv_window_idx = nullptr;
|
||||
|
||||
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
|
||||
ggml_set_name(positions, "positions");
|
||||
ggml_set_input(positions);
|
||||
|
||||
// pre-layernorm
|
||||
if (model.pre_ln_w) {
|
||||
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
|
||||
}
|
||||
if (use_window_attn) {
|
||||
inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
|
||||
ggml_set_name(inv_window_idx, "inv_window_idx");
|
||||
ggml_set_input(inv_window_idx);
|
||||
// mask for window attention
|
||||
window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
|
||||
ggml_set_name(window_mask, "window_mask");
|
||||
ggml_set_input(window_mask);
|
||||
|
||||
// if flash attn is used, we need to pad the mask and cast to f16
|
||||
if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
|
||||
window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
|
||||
}
|
||||
|
||||
// inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
|
||||
GGML_ASSERT(batch_size == 1);
|
||||
inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
|
||||
inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
|
||||
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
|
||||
}
|
||||
|
||||
// loop over layers
|
||||
for (int il = 0; il < n_layer; il++) {
|
||||
const auto & layer = model.layers[il];
|
||||
const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true;
|
||||
|
||||
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
|
||||
|
||||
// layernorm1
|
||||
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
|
||||
// self-attention
|
||||
{
|
||||
ggml_tensor * Qcur = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
|
||||
ggml_tensor * Kcur = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
|
||||
ggml_tensor * Vcur = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
|
||||
|
||||
Qcur = ggml_rope_multi(
|
||||
ctx0, Qcur, positions, nullptr,
|
||||
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
|
||||
Kcur = ggml_rope_multi(
|
||||
ctx0, Kcur, positions, nullptr,
|
||||
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
|
||||
|
||||
ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
|
||||
|
||||
cur = build_attn(layer.o_w, layer.o_b,
|
||||
Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
|
||||
}
|
||||
// re-add the layer input, e.g., residual
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
|
||||
inpL = cur; // inpL = residual, cur = hidden_states
|
||||
|
||||
// layernorm2
|
||||
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
|
||||
|
||||
// ffn
|
||||
cur = build_ffn(cur,
|
||||
layer.ff_up_w, layer.ff_up_b,
|
||||
nullptr, nullptr,
|
||||
layer.ff_down_w, layer.ff_down_b,
|
||||
hparams.ffn_op, il);
|
||||
|
||||
// residual 2
|
||||
cur = ggml_add(ctx0, inpL, cur);
|
||||
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
ggml_tensor * embeddings = inpL;
|
||||
if (use_window_attn) {
|
||||
const int spatial_merge_unit = 4;
|
||||
window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit);
|
||||
ggml_set_name(window_idx, "window_idx");
|
||||
ggml_set_input(window_idx);
|
||||
GGML_ASSERT(batch_size == 1);
|
||||
embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit);
|
||||
embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
|
||||
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size);
|
||||
cb(embeddings, "window_order_restored", -1);
|
||||
}
|
||||
|
||||
// post-layernorm (part of Siglip2VisionTransformer, applied after encoder)
|
||||
if (model.post_ln_w) {
|
||||
embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
|
||||
}
|
||||
|
||||
// Now apply merger (VLPatchMerger):
|
||||
// 1. Apply RMS norm (ln_q in VLPatchMerger)
|
||||
embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
|
||||
cb(embeddings, "merger_normed", -1);
|
||||
|
||||
// 2. First reshape for spatial merge (merge 2x2 patches)
|
||||
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
|
||||
cb(embeddings, "merger_reshaped", -1);
|
||||
|
||||
embeddings = build_ffn(embeddings,
|
||||
model.mm_0_w, model.mm_0_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
FFN_GELU,
|
||||
-1);
|
||||
ggml_build_forward_expand(gf, embeddings);
|
||||
|
||||
return gf;
|
||||
}
|
||||
@@ -9,250 +9,207 @@
|
||||
#include <fstream>
|
||||
#include <algorithm>
|
||||
|
||||
// some of the code here is copied from whisper.cpp
|
||||
// most of the code here is copied from whisper.cpp
|
||||
|
||||
constexpr bool DEBUG = false;
|
||||
|
||||
void mtmd_audio_cache::fill_sin_cos_table(int n) {
|
||||
sin_vals.resize(n);
|
||||
cos_vals.resize(n);
|
||||
for (int i = 0; i < n; i++) {
|
||||
double theta = (2 * M_PI * i) / n;
|
||||
sin_vals[i] = sinf(theta);
|
||||
cos_vals[i] = cosf(theta);
|
||||
}
|
||||
}
|
||||
struct mtmd_audio_mel_filters {
|
||||
int32_t n_mel;
|
||||
int32_t n_fft;
|
||||
|
||||
void mtmd_audio_cache::fill_hann_window(int length, bool periodic) {
|
||||
hann_window.resize(length);
|
||||
int offset = -1;
|
||||
if (periodic) {
|
||||
offset = 0;
|
||||
}
|
||||
for (int i = 0; i < length; i++) {
|
||||
hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
|
||||
}
|
||||
}
|
||||
std::vector<float> data;
|
||||
};
|
||||
|
||||
void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel,
|
||||
int n_fft,
|
||||
int sample_rate,
|
||||
float fmin,
|
||||
float fmax,
|
||||
bool slaney_area_norm,
|
||||
float scale) {
|
||||
GGML_ASSERT(n_mel > 0 && n_fft > 1);
|
||||
if (fmax <= 0.0f) {
|
||||
fmax = 0.5f * sample_rate;
|
||||
}
|
||||
// note: this global cache is shared among all preprocessors
|
||||
// if we want to use multiple preprocessors at the same time,
|
||||
// we will need to enclose it in the preprocessor class in the future
|
||||
static struct mtmd_audio_global_cache {
|
||||
// precomputed sin/cos table for FFT
|
||||
std::vector<float> sin_vals;
|
||||
std::vector<float> cos_vals;
|
||||
|
||||
// Slaney scale (matches librosa default)
|
||||
const double min_log_hz = 1000.0;
|
||||
const double lin_slope = 3 / 200.;
|
||||
const double min_log_mel = min_log_hz * lin_slope;
|
||||
const double log_step = log(6.4) / 27.0;
|
||||
auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
|
||||
return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
|
||||
};
|
||||
auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
|
||||
return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
|
||||
};
|
||||
// hann window
|
||||
std::vector<float> hann_window;
|
||||
|
||||
// infer N_fft from n_fft_bins
|
||||
const double bin_hz_step = double(sample_rate) / double(n_fft);
|
||||
// mel filter bank
|
||||
mtmd_audio_mel_filters filters;
|
||||
|
||||
// mel grid: n_mel + 2 edges
|
||||
const double m_lo = hz_to_mel(fmin);
|
||||
const double m_hi = hz_to_mel(fmax);
|
||||
std::vector<double> mel_pts(n_mel + 2);
|
||||
for (int i = 0; i < n_mel + 2; ++i) {
|
||||
mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1));
|
||||
}
|
||||
|
||||
// convert to Hz
|
||||
std::vector<double> hz_pts(n_mel + 2);
|
||||
for (int i = 0; i < n_mel + 2; ++i) {
|
||||
hz_pts[i] = mel_to_hz(mel_pts[i]);
|
||||
}
|
||||
|
||||
const int n_fft_bins = n_fft / 2 + 1;
|
||||
|
||||
// filterbank
|
||||
std::vector<float> out(n_mel * n_fft_bins, 0);
|
||||
for (int m = 0; m < n_mel; ++m) {
|
||||
const double f_left = hz_pts[m];
|
||||
const double f_center = hz_pts[m + 1];
|
||||
const double f_right = hz_pts[m + 2];
|
||||
|
||||
const double denom_l = std::max(1e-30, f_center - f_left);
|
||||
const double denom_r = std::max(1e-30, f_right - f_center);
|
||||
const double enorm = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
|
||||
|
||||
for (int k = 0; k < n_fft_bins; ++k) {
|
||||
const double f = k * bin_hz_step;
|
||||
double w = 0.0;
|
||||
if (f >= f_left && f <= f_center) {
|
||||
w = (f - f_left) / denom_l;
|
||||
} else if (f > f_center && f <= f_right) {
|
||||
w = (f_right - f) / denom_r;
|
||||
}
|
||||
out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
|
||||
void fill_sin_cos_table(int n) {
|
||||
sin_vals.resize(n);
|
||||
cos_vals.resize(n);
|
||||
for (int i = 0; i < n; i++) {
|
||||
double theta = (2 * M_PI * i) / n;
|
||||
sin_vals[i] = sinf(theta);
|
||||
cos_vals[i] = cosf(theta);
|
||||
}
|
||||
}
|
||||
|
||||
filters.n_mel = n_mel;
|
||||
filters.n_fft = n_fft;
|
||||
filters.data = std::move(out);
|
||||
void fill_hann_window(int length, bool periodic) {
|
||||
hann_window.resize(length);
|
||||
int offset = -1;
|
||||
if (periodic) {
|
||||
offset = 0;
|
||||
}
|
||||
for (int i = 0; i < length; i++) {
|
||||
hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
|
||||
}
|
||||
}
|
||||
|
||||
if (DEBUG) { // debug
|
||||
for (size_t i = 0; i < filters.data.size(); ++i) {
|
||||
if (filters.data[i] != 0.0f) {
|
||||
printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f);
|
||||
// Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
|
||||
// n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
|
||||
void fill_mel_filterbank_matrix(
|
||||
int n_mel,
|
||||
int n_fft,
|
||||
int sample_rate, // e.g. 16000
|
||||
float fmin = 0.0f, // e.g. 0.0
|
||||
float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
|
||||
bool slaney_area_norm = true,
|
||||
float scale = 1.0f // optional extra scaling; use 1.0f/1000.0f to mimic your code
|
||||
) {
|
||||
GGML_ASSERT(n_mel > 0 && n_fft > 1);
|
||||
if (fmax <= 0.0f) {
|
||||
fmax = 0.5f * sample_rate;
|
||||
}
|
||||
|
||||
// Slaney scale (matches librosa default)
|
||||
const double min_log_hz = 1000.0;
|
||||
const double lin_slope = 3 / 200.;
|
||||
const double min_log_mel = min_log_hz * lin_slope;
|
||||
const double log_step = log(6.4) / 27.0;
|
||||
auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
|
||||
return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
|
||||
};
|
||||
auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
|
||||
return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
|
||||
};
|
||||
|
||||
// infer N_fft from n_fft_bins
|
||||
const double bin_hz_step = double(sample_rate) / double(n_fft);
|
||||
|
||||
// mel grid: n_mel + 2 edges
|
||||
const double m_lo = hz_to_mel(fmin);
|
||||
const double m_hi = hz_to_mel(fmax);
|
||||
std::vector<double> mel_pts(n_mel + 2);
|
||||
for (int i = 0; i < n_mel + 2; ++i) {
|
||||
mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1));
|
||||
}
|
||||
|
||||
// convert to Hz
|
||||
std::vector<double> hz_pts(n_mel + 2);
|
||||
for (int i = 0; i < n_mel + 2; ++i) {
|
||||
hz_pts[i] = mel_to_hz(mel_pts[i]);
|
||||
}
|
||||
|
||||
const int n_fft_bins = n_fft / 2 + 1;
|
||||
|
||||
// filterbank
|
||||
std::vector<float> out(n_mel * n_fft_bins, 0);
|
||||
for (int m = 0; m < n_mel; ++m) {
|
||||
const double f_left = hz_pts[m];
|
||||
const double f_center = hz_pts[m + 1];
|
||||
const double f_right = hz_pts[m + 2];
|
||||
|
||||
const double denom_l = std::max(1e-30, f_center - f_left);
|
||||
const double denom_r = std::max(1e-30, f_right - f_center);
|
||||
const double enorm = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
|
||||
|
||||
for (int k = 0; k < n_fft_bins; ++k) {
|
||||
const double f = k * bin_hz_step;
|
||||
double w = 0.0;
|
||||
if (f >= f_left && f <= f_center) {
|
||||
w = (f - f_left) / denom_l;
|
||||
} else if (f > f_center && f <= f_right) {
|
||||
w = (f_right - f) / denom_r;
|
||||
}
|
||||
out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
|
||||
}
|
||||
}
|
||||
|
||||
filters.n_mel = n_mel;
|
||||
filters.n_fft = n_fft;
|
||||
filters.data = std::move(out);
|
||||
|
||||
if (DEBUG) { // debug
|
||||
for (size_t i = 0; i < filters.data.size(); ++i) {
|
||||
if (filters.data[i] != 0.0f) {
|
||||
printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} g_cache;
|
||||
|
||||
// Unified DFT implementation for both forward and inverse transforms
|
||||
// Template parameters:
|
||||
// Inverse: false = DFT with exp(-2πi·k·n/N), no scaling
|
||||
// true = IDFT with exp(+2πi·k·n/N), scales by 1/N
|
||||
// RealInput: true = input is real-valued (stride 1), avoids imaginary computations
|
||||
// false = input is complex-valued (interleaved real/imag, stride 2)
|
||||
template <bool Inverse, bool RealInput>
|
||||
static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, float * out) {
|
||||
const int n_sin_cos_vals = cache.sin_vals.size();
|
||||
const int sin_cos_step = n_sin_cos_vals / N;
|
||||
|
||||
constexpr float sign = Inverse ? 1.0f : -1.0f;
|
||||
const float scale = Inverse ? (1.0f / N) : 1.0f;
|
||||
// naive Discrete Fourier Transform
|
||||
// input is real-valued
|
||||
// output is complex-valued
|
||||
static void dft(const float * in, int N, float * out) {
|
||||
const int n_sin_cos_vals = g_cache.sin_vals.size();
|
||||
const int sin_cos_step = n_sin_cos_vals / N;
|
||||
|
||||
for (int k = 0; k < N; k++) {
|
||||
float re = 0;
|
||||
float im = 0;
|
||||
|
||||
for (int n = 0; n < N; n++) {
|
||||
int idx = (k * n * sin_cos_step) % n_sin_cos_vals;
|
||||
float cos_val = cache.cos_vals[idx];
|
||||
float sin_val = cache.sin_vals[idx];
|
||||
|
||||
if constexpr (RealInput) {
|
||||
// Real input: in_im = 0, simplifies to:
|
||||
// re += in_re * cos_val
|
||||
// im += sign * in_re * sin_val
|
||||
float in_re = in[n];
|
||||
re += in_re * cos_val;
|
||||
im += sign * in_re * sin_val;
|
||||
} else {
|
||||
float in_re = in[n * 2 + 0];
|
||||
float in_im = in[n * 2 + 1];
|
||||
// (a + bi) * (cos + sign*i*sin) = (a*cos - sign*b*sin) + (sign*a*sin + b*cos)i
|
||||
re += in_re * cos_val - sign * in_im * sin_val;
|
||||
im += sign * in_re * sin_val + in_im * cos_val;
|
||||
}
|
||||
int idx = (k * n * sin_cos_step) % (n_sin_cos_vals); // t = 2*M_PI*k*n/N
|
||||
re += in[n] * g_cache.cos_vals[idx]; // cos(t)
|
||||
im -= in[n] * g_cache.sin_vals[idx]; // sin(t)
|
||||
}
|
||||
|
||||
out[k * 2 + 0] = re * scale;
|
||||
out[k * 2 + 1] = im * scale;
|
||||
out[k*2 + 0] = re;
|
||||
out[k*2 + 1] = im;
|
||||
}
|
||||
}
|
||||
|
||||
// Cooley-Tukey FFT/IFFT unified implementation
|
||||
// Template parameters:
|
||||
// Inverse: false = FFT with exp(-2πi·k/N), no scaling
|
||||
// true = IFFT with exp(+2πi·k/N), scales by 0.5 at each level
|
||||
// RealInput: true = input is real-valued (stride 1)
|
||||
// false = input is complex-valued (interleaved real/imag, stride 2)
|
||||
template <bool Inverse, bool RealInput>
|
||||
static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) {
|
||||
const int n_sin_cos_vals = cache.sin_vals.size();
|
||||
|
||||
// Cooley-Tukey FFT
|
||||
// poor man's implementation - use something better
|
||||
// input is real-valued
|
||||
// output is complex-valued
|
||||
static void fft(float * in, int N, float * out) {
|
||||
const int n_sin_cos_vals = g_cache.sin_vals.size();
|
||||
if (N == 1) {
|
||||
out[0] = in[0];
|
||||
if constexpr (RealInput) {
|
||||
out[1] = 0.0f;
|
||||
} else {
|
||||
out[1] = in[1];
|
||||
}
|
||||
out[1] = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
const int half_N = N / 2;
|
||||
if (N - half_N * 2 == 1) {
|
||||
// Odd N: fall back to DFT
|
||||
dft_impl<Inverse, RealInput>(cache, in, N, out);
|
||||
if (N - half_N*2 == 1) {
|
||||
dft(in, N, out);
|
||||
return;
|
||||
}
|
||||
|
||||
// Split into even and odd
|
||||
if constexpr (RealInput) {
|
||||
// Real input: stride is 1, copy only real values
|
||||
float * even = in + N;
|
||||
for (int i = 0; i < half_N; ++i) {
|
||||
even[i] = in[2 * i];
|
||||
}
|
||||
float * even_fft = out + 2 * N;
|
||||
fft_impl<Inverse, true>(cache, even, half_N, even_fft);
|
||||
|
||||
float * odd = even;
|
||||
for (int i = 0; i < half_N; ++i) {
|
||||
odd[i] = in[2 * i + 1];
|
||||
}
|
||||
float * odd_fft = even_fft + N;
|
||||
fft_impl<Inverse, true>(cache, odd, half_N, odd_fft);
|
||||
} else {
|
||||
// Complex input: stride is 2, copy complex pairs
|
||||
float * even = in + N * 2;
|
||||
for (int i = 0; i < half_N; ++i) {
|
||||
even[i * 2 + 0] = in[2 * i * 2 + 0];
|
||||
even[i * 2 + 1] = in[2 * i * 2 + 1];
|
||||
}
|
||||
float * even_fft = out + 2 * N;
|
||||
fft_impl<Inverse, false>(cache, even, half_N, even_fft);
|
||||
|
||||
float * odd = even;
|
||||
for (int i = 0; i < half_N; ++i) {
|
||||
odd[i * 2 + 0] = in[(2 * i + 1) * 2 + 0];
|
||||
odd[i * 2 + 1] = in[(2 * i + 1) * 2 + 1];
|
||||
}
|
||||
float * odd_fft = even_fft + N;
|
||||
fft_impl<Inverse, false>(cache, odd, half_N, odd_fft);
|
||||
float* even = in + N;
|
||||
for (int i = 0; i < half_N; ++i) {
|
||||
even[i]= in[2*i];
|
||||
}
|
||||
float* even_fft = out + 2 * N;
|
||||
fft(even, half_N, even_fft);
|
||||
|
||||
float * even_fft = out + 2 * N;
|
||||
float * odd_fft = even_fft + N;
|
||||
float* odd = even;
|
||||
for (int i = 0; i < half_N; ++i) {
|
||||
odd[i] = in[2*i + 1];
|
||||
}
|
||||
float* odd_fft = even_fft + N;
|
||||
fft(odd, half_N, odd_fft);
|
||||
|
||||
const int sin_cos_step = n_sin_cos_vals / N;
|
||||
|
||||
constexpr float sign = Inverse ? 1.0f : -1.0f;
|
||||
constexpr float scale = Inverse ? 0.5f : 1.0f;
|
||||
|
||||
for (int k = 0; k < half_N; k++) {
|
||||
int idx = k * sin_cos_step; // t = 2*M_PI*k/N
|
||||
float re = cache.cos_vals[idx];
|
||||
float im = sign * cache.sin_vals[idx];
|
||||
int idx = k * sin_cos_step; // t = 2*M_PI*k/N
|
||||
float re = g_cache.cos_vals[idx]; // cos(t)
|
||||
float im = -g_cache.sin_vals[idx]; // sin(t)
|
||||
|
||||
float re_odd = odd_fft[2 * k + 0];
|
||||
float im_odd = odd_fft[2 * k + 1];
|
||||
float re_odd = odd_fft[2*k + 0];
|
||||
float im_odd = odd_fft[2*k + 1];
|
||||
|
||||
out[2 * k + 0] = scale * (even_fft[2 * k + 0] + re * re_odd - im * im_odd);
|
||||
out[2 * k + 1] = scale * (even_fft[2 * k + 1] + re * im_odd + im * re_odd);
|
||||
out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd;
|
||||
out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd;
|
||||
|
||||
out[2 * (k + half_N) + 0] = scale * (even_fft[2 * k + 0] - re * re_odd + im * im_odd);
|
||||
out[2 * (k + half_N) + 1] = scale * (even_fft[2 * k + 1] - re * im_odd - im * re_odd);
|
||||
out[2*(k + half_N) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd;
|
||||
out[2*(k + half_N) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd;
|
||||
}
|
||||
}
|
||||
|
||||
// Forward FFT for real input (used by mel spectrogram)
|
||||
static void fft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
|
||||
fft_impl<false, true>(cache, in, N, out);
|
||||
}
|
||||
|
||||
// Inverse FFT for complex input
|
||||
static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
|
||||
fft_impl<true, false>(cache, in, N, out);
|
||||
}
|
||||
|
||||
struct filter_params {
|
||||
int32_t n_mel;
|
||||
int32_t n_fft_bins;
|
||||
@@ -265,27 +222,20 @@ struct filter_params {
|
||||
bool norm_per_feature = false;
|
||||
};
|
||||
|
||||
static void log_mel_spectrogram_worker_thread(int ith,
|
||||
const float * hann,
|
||||
const std::vector<float> & samples,
|
||||
int n_samples,
|
||||
int frame_size,
|
||||
int frame_step,
|
||||
int n_threads,
|
||||
const filter_params & params,
|
||||
const mtmd_audio_cache & cache,
|
||||
mtmd_audio_mel & out) {
|
||||
static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
|
||||
int n_samples, int frame_size, int frame_step, int n_threads,
|
||||
const filter_params & params, mtmd_audio_mel & out) {
|
||||
std::vector<float> fft_in(frame_size * 2, 0.0);
|
||||
std::vector<float> fft_out(frame_size * 2 * 2 * 2);
|
||||
|
||||
int n_fft_bins = params.n_fft_bins;
|
||||
int i = ith;
|
||||
|
||||
const auto & filters = cache.filters;
|
||||
const auto & filters = g_cache.filters;
|
||||
|
||||
// make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist
|
||||
GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2));
|
||||
GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size());
|
||||
GGML_ASSERT(g_cache.sin_vals.size() == g_cache.cos_vals.size());
|
||||
// calculate FFT only when fft_in are not all zero
|
||||
for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) {
|
||||
const int offset = i * frame_step;
|
||||
@@ -301,7 +251,7 @@ static void log_mel_spectrogram_worker_thread(int ith,
|
||||
}
|
||||
|
||||
// FFT
|
||||
fft(cache, fft_in.data(), frame_size, fft_out.data());
|
||||
fft(fft_in.data(), frame_size, fft_out.data());
|
||||
|
||||
// Calculate modulus^2 of complex numbers
|
||||
// Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting.
|
||||
@@ -348,7 +298,6 @@ static bool log_mel_spectrogram(
|
||||
const int n_samples_in,
|
||||
const int n_threads,
|
||||
const filter_params & params,
|
||||
const mtmd_audio_cache & cache,
|
||||
mtmd_audio_mel & out) {
|
||||
//const int64_t t_start_us = ggml_time_us();
|
||||
|
||||
@@ -356,9 +305,9 @@ static bool log_mel_spectrogram(
|
||||
int n_samples = n_samples_in;
|
||||
|
||||
// Hann window
|
||||
const float * hann = cache.hann_window.data();
|
||||
const int frame_size = (params.n_fft_bins - 1) * 2;
|
||||
const int frame_step = params.hop_length;
|
||||
const float * hann = g_cache.hann_window.data();
|
||||
const int frame_size = (params.n_fft_bins - 1) * 2;
|
||||
const int frame_step = params.hop_length;
|
||||
|
||||
// Padding
|
||||
std::vector<float> samples_padded;
|
||||
@@ -386,9 +335,9 @@ static bool log_mel_spectrogram(
|
||||
|
||||
// preemphasis
|
||||
if (params.preemph) {
|
||||
const int pad_amount = frame_size / 2;
|
||||
const int pad_amount = frame_size / 2;
|
||||
const float preemph = 0.97f;
|
||||
float prev = samples_padded[pad_amount];
|
||||
float prev = samples_padded[pad_amount];
|
||||
for (int i = pad_amount + 1; i + pad_amount < n_samples; ++i) {
|
||||
float cur = samples_padded[i];
|
||||
samples_padded[i] = cur - preemph * prev;
|
||||
@@ -423,14 +372,14 @@ static bool log_mel_spectrogram(
|
||||
{
|
||||
std::vector<std::thread> workers(n_threads - 1);
|
||||
for (int iw = 0; iw < n_threads - 1; ++iw) {
|
||||
workers[iw] =
|
||||
std::thread(log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded), n_samples,
|
||||
frame_size, frame_step, n_threads, std::cref(params), std::cref(cache), std::ref(out));
|
||||
workers[iw] = std::thread(
|
||||
log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded),
|
||||
n_samples, frame_size, frame_step, n_threads,
|
||||
std::cref(params), std::ref(out));
|
||||
}
|
||||
|
||||
// main thread
|
||||
log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params,
|
||||
cache, out);
|
||||
log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params, out);
|
||||
for (int iw = 0; iw < n_threads - 1; ++iw) {
|
||||
workers[iw].join();
|
||||
}
|
||||
@@ -455,7 +404,7 @@ static bool log_mel_spectrogram(
|
||||
|
||||
for (int j = 0; j < effective_n_len; ++j) {
|
||||
auto &value = out.data[i * out.n_len + j];
|
||||
value = (value - mean) / mstd;
|
||||
value = (value - mean) / mstd;
|
||||
}
|
||||
|
||||
// pad the rest with zeros
|
||||
@@ -501,14 +450,18 @@ static bool log_mel_spectrogram(
|
||||
//
|
||||
|
||||
void mtmd_audio_preprocessor_whisper::initialize() {
|
||||
cache.fill_sin_cos_table(hparams.audio_n_fft);
|
||||
cache.fill_hann_window(hparams.audio_window_len, true);
|
||||
cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
|
||||
g_cache.fill_sin_cos_table(hparams.audio_n_fft);
|
||||
g_cache.fill_hann_window(hparams.audio_window_len, true);
|
||||
g_cache.fill_mel_filterbank_matrix(
|
||||
hparams.n_mel_bins,
|
||||
hparams.audio_n_fft,
|
||||
hparams.audio_sample_rate);
|
||||
}
|
||||
|
||||
bool mtmd_audio_preprocessor_whisper::preprocess(const float * samples,
|
||||
size_t n_samples,
|
||||
std::vector<mtmd_audio_mel> & output) {
|
||||
bool mtmd_audio_preprocessor_whisper::preprocess(
|
||||
const float * samples,
|
||||
size_t n_samples,
|
||||
std::vector<mtmd_audio_mel> & output) {
|
||||
if (n_samples == 0) {
|
||||
// empty audio
|
||||
return false;
|
||||
@@ -518,7 +471,7 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
|
||||
// if input is too short, pad with zeros
|
||||
// this is to avoid potential issues with stage1/2 padding in log_mel_spectrogram
|
||||
// TODO: maybe handle this better
|
||||
size_t min_samples = (size_t) hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin
|
||||
size_t min_samples = (size_t)hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin
|
||||
if (n_samples < min_samples) {
|
||||
smpl.resize(min_samples, 0.0f);
|
||||
std::memcpy(smpl.data(), samples, n_samples * sizeof(float));
|
||||
@@ -533,19 +486,22 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
|
||||
params.hop_length = hparams.audio_hop_len;
|
||||
params.sample_rate = hparams.audio_sample_rate;
|
||||
params.center_padding = false;
|
||||
params.preemph = 0.0f; // disabled
|
||||
params.preemph = 0.0f; // disabled
|
||||
params.use_natural_log = false;
|
||||
params.norm_per_feature = false;
|
||||
|
||||
// make sure the cache is initialized
|
||||
GGML_ASSERT(!cache.sin_vals.empty());
|
||||
GGML_ASSERT(!cache.cos_vals.empty());
|
||||
GGML_ASSERT(!cache.filters.data.empty());
|
||||
// make sure the global cache is initialized
|
||||
GGML_ASSERT(!g_cache.sin_vals.empty());
|
||||
GGML_ASSERT(!g_cache.cos_vals.empty());
|
||||
GGML_ASSERT(!g_cache.filters.data.empty());
|
||||
|
||||
mtmd_audio_mel out_full;
|
||||
bool ok = log_mel_spectrogram(samples, n_samples,
|
||||
4, // n_threads
|
||||
params, cache, out_full);
|
||||
bool ok = log_mel_spectrogram(
|
||||
samples,
|
||||
n_samples,
|
||||
4, // n_threads
|
||||
params,
|
||||
out_full);
|
||||
if (!ok) {
|
||||
return false;
|
||||
}
|
||||
@@ -556,21 +512,21 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
|
||||
printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len);
|
||||
}
|
||||
const size_t frames_per_chunk = 3000;
|
||||
GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk);
|
||||
for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) {
|
||||
int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off);
|
||||
if ((size_t) n_len < frames_per_chunk) {
|
||||
break; // last uncomplete chunk will always be a padded chunk, safe to ignore
|
||||
GGML_ASSERT((size_t)out_full.n_len > frames_per_chunk);
|
||||
for (size_t off = 0; off < (size_t)out_full.n_len; off += frames_per_chunk) {
|
||||
int n_len = std::min(frames_per_chunk, (size_t)out_full.n_len - off);
|
||||
if ((size_t)n_len < frames_per_chunk) {
|
||||
break; // last uncomplete chunk will always be a padded chunk, safe to ignore
|
||||
}
|
||||
|
||||
mtmd_audio_mel out_chunk;
|
||||
out_chunk.n_len = n_len;
|
||||
out_chunk.n_mel = out_full.n_mel;
|
||||
out_chunk.n_len_org = out_full.n_mel; // unused
|
||||
out_chunk.n_len_org = out_full.n_mel; // unused
|
||||
out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len);
|
||||
|
||||
for (int i = 0; i < out_full.n_mel; i++) {
|
||||
auto src = out_full.data.begin() + i * out_full.n_len + off;
|
||||
auto src = out_full.data.begin() + i*out_full.n_len + off;
|
||||
out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk);
|
||||
}
|
||||
|
||||
@@ -585,14 +541,18 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
|
||||
//
|
||||
|
||||
void mtmd_audio_preprocessor_conformer::initialize() {
|
||||
cache.fill_sin_cos_table(hparams.audio_n_fft);
|
||||
cache.fill_hann_window(hparams.audio_window_len, true);
|
||||
cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
|
||||
g_cache.fill_sin_cos_table(hparams.audio_n_fft);
|
||||
g_cache.fill_hann_window(hparams.audio_window_len, true);
|
||||
g_cache.fill_mel_filterbank_matrix(
|
||||
hparams.n_mel_bins,
|
||||
hparams.audio_n_fft,
|
||||
hparams.audio_sample_rate);
|
||||
}
|
||||
|
||||
bool mtmd_audio_preprocessor_conformer::preprocess(const float * samples,
|
||||
size_t n_samples,
|
||||
std::vector<mtmd_audio_mel> & output) {
|
||||
bool mtmd_audio_preprocessor_conformer::preprocess(
|
||||
const float * samples,
|
||||
size_t n_samples,
|
||||
std::vector<mtmd_audio_mel> & output) {
|
||||
// empty audio
|
||||
if (n_samples == 0) {
|
||||
return false;
|
||||
@@ -609,15 +569,18 @@ bool mtmd_audio_preprocessor_conformer::preprocess(const float *
|
||||
params.use_natural_log = true;
|
||||
params.norm_per_feature = true;
|
||||
|
||||
// make sure the cache is initialized
|
||||
GGML_ASSERT(!cache.sin_vals.empty());
|
||||
GGML_ASSERT(!cache.cos_vals.empty());
|
||||
GGML_ASSERT(!cache.filters.data.empty());
|
||||
// make sure the global cache is initialized
|
||||
GGML_ASSERT(!g_cache.sin_vals.empty());
|
||||
GGML_ASSERT(!g_cache.cos_vals.empty());
|
||||
GGML_ASSERT(!g_cache.filters.data.empty());
|
||||
|
||||
mtmd_audio_mel out_full;
|
||||
bool ok = log_mel_spectrogram(samples, n_samples,
|
||||
4, // n_threads
|
||||
params, cache, out_full);
|
||||
bool ok = log_mel_spectrogram(
|
||||
samples,
|
||||
n_samples,
|
||||
4, // n_threads
|
||||
params,
|
||||
out_full);
|
||||
if (!ok) {
|
||||
return false;
|
||||
}
|
||||
@@ -625,106 +588,3 @@ bool mtmd_audio_preprocessor_conformer::preprocess(const float *
|
||||
output.push_back(std::move(out_full));
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// mtmd_audio_streaming_istft implementation
|
||||
//
|
||||
|
||||
mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length) :
|
||||
n_fft(n_fft),
|
||||
hop_length(hop_length),
|
||||
n_fft_bins(n_fft / 2 + 1),
|
||||
overlap_buffer(n_fft, 0.0f),
|
||||
window_sum_buffer(n_fft, 0.0f),
|
||||
padding_to_remove((n_fft - hop_length) / 2),
|
||||
ifft_in(n_fft * 2 * 4, 0.0f), // extra space for recursive IFFT
|
||||
ifft_out(n_fft * 2 * 4, 0.0f) {
|
||||
cache.fill_sin_cos_table(n_fft);
|
||||
cache.fill_hann_window(n_fft, true);
|
||||
}
|
||||
|
||||
void mtmd_audio_streaming_istft::reset() {
|
||||
std::fill(overlap_buffer.begin(), overlap_buffer.end(), 0.0f);
|
||||
std::fill(window_sum_buffer.begin(), window_sum_buffer.end(), 0.0f);
|
||||
padding_to_remove = (n_fft - hop_length) / 2;
|
||||
}
|
||||
|
||||
std::vector<float> mtmd_audio_streaming_istft::process_frame(const float * frame_spectrum) {
|
||||
std::vector<float> output(hop_length);
|
||||
|
||||
// copy frequencies
|
||||
for (int j = 0; j < n_fft_bins; j++) {
|
||||
ifft_in[j * 2 + 0] = frame_spectrum[j * 2 + 0];
|
||||
ifft_in[j * 2 + 1] = frame_spectrum[j * 2 + 1];
|
||||
}
|
||||
|
||||
// mirror negative frequencies
|
||||
for (int j = 1; j < n_fft_bins - 1; j++) {
|
||||
int mirror_idx = n_fft - j;
|
||||
ifft_in[mirror_idx * 2 + 0] = ifft_in[j * 2 + 0];
|
||||
ifft_in[mirror_idx * 2 + 1] = -ifft_in[j * 2 + 1]; // conjugate
|
||||
}
|
||||
|
||||
ifft(cache, ifft_in.data(), n_fft, ifft_out.data());
|
||||
|
||||
// update window sum and overlap buffer
|
||||
for (int j = 0; j < n_fft; j++) {
|
||||
window_sum_buffer[j] += cache.hann_window[j] * cache.hann_window[j];
|
||||
overlap_buffer[j] += ifft_out[j * 2] * cache.hann_window[j];
|
||||
}
|
||||
|
||||
// extract hop_length samples with normalization
|
||||
for (int i = 0; i < hop_length; i++) {
|
||||
if (window_sum_buffer[i] > 1e-8f) {
|
||||
output[i] = overlap_buffer[i] / window_sum_buffer[i];
|
||||
} else {
|
||||
output[i] = overlap_buffer[i];
|
||||
}
|
||||
}
|
||||
|
||||
// shift buffers left by hop_length
|
||||
std::copy(overlap_buffer.begin() + hop_length, overlap_buffer.end(), overlap_buffer.begin());
|
||||
std::fill(overlap_buffer.end() - hop_length, overlap_buffer.end(), 0.0f);
|
||||
|
||||
std::copy(window_sum_buffer.begin() + hop_length, window_sum_buffer.end(), window_sum_buffer.begin());
|
||||
std::fill(window_sum_buffer.end() - hop_length, window_sum_buffer.end(), 0.0f);
|
||||
|
||||
// Remove padding if needed
|
||||
int to_remove = std::min(padding_to_remove, (int) output.size());
|
||||
padding_to_remove -= to_remove;
|
||||
output.erase(output.begin(), output.begin() + to_remove);
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
std::vector<float> mtmd_audio_streaming_istft::flush() {
|
||||
std::vector<float> output;
|
||||
|
||||
// Extract remaining samples from overlap buffer
|
||||
// Continue until we've extracted all meaningful samples
|
||||
int remaining = n_fft - hop_length;
|
||||
while (remaining > 0) {
|
||||
int chunk_size = std::min(remaining, hop_length);
|
||||
|
||||
for (int i = 0; i < chunk_size; i++) {
|
||||
float sample;
|
||||
if (window_sum_buffer[i] > 1e-8f) {
|
||||
sample = overlap_buffer[i] / window_sum_buffer[i];
|
||||
} else {
|
||||
sample = overlap_buffer[i];
|
||||
}
|
||||
output.push_back(sample);
|
||||
}
|
||||
|
||||
// Shift buffers
|
||||
std::copy(overlap_buffer.begin() + chunk_size, overlap_buffer.end(), overlap_buffer.begin());
|
||||
std::fill(overlap_buffer.end() - chunk_size, overlap_buffer.end(), 0.0f);
|
||||
|
||||
std::copy(window_sum_buffer.begin() + chunk_size, window_sum_buffer.end(), window_sum_buffer.begin());
|
||||
std::fill(window_sum_buffer.end() - chunk_size, window_sum_buffer.end(), 0.0f);
|
||||
|
||||
remaining -= chunk_size;
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
@@ -17,38 +17,6 @@ struct mtmd_audio_mel {
|
||||
std::vector<float> data;
|
||||
};
|
||||
|
||||
// Mel filterbank storage: two dimensions plus the flat coefficient buffer.
//
// The integer members are zero-initialized so that a default-constructed
// instance (e.g. the one embedded by value in mtmd_audio_cache) never exposes
// indeterminate values before the filterbank has been filled.
// The struct remains an aggregate, brace-initialization keeps working.
struct mtmd_audio_mel_filters {
    int32_t n_mel = 0;
    int32_t n_fft = 0;

    std::vector<float> data; // filter coefficients, empty until filled
};
|
||||
|
||||
// cache for audio processing, each processor instance owns its own cache
|
||||
struct mtmd_audio_cache {
|
||||
std::vector<float> sin_vals;
|
||||
std::vector<float> cos_vals;
|
||||
|
||||
std::vector<float> hann_window;
|
||||
|
||||
mtmd_audio_mel_filters filters;
|
||||
|
||||
void fill_sin_cos_table(int n);
|
||||
|
||||
void fill_hann_window(int length, bool periodic);
|
||||
|
||||
// Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
|
||||
// n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
|
||||
void fill_mel_filterbank_matrix(int n_mel,
|
||||
int n_fft,
|
||||
int sample_rate, // e.g. 16000
|
||||
float fmin = 0.0f, // e.g. 0.0
|
||||
float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
|
||||
bool slaney_area_norm = true,
|
||||
float scale = 1.0f // optional extra scaling
|
||||
);
|
||||
};
|
||||
|
||||
struct mtmd_audio_preprocessor {
|
||||
const clip_hparams & hparams;
|
||||
|
||||
@@ -63,51 +31,10 @@ struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
|
||||
mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
|
||||
void initialize() override;
|
||||
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
|
||||
|
||||
private:
|
||||
mtmd_audio_cache cache;
|
||||
};
|
||||
|
||||
struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
|
||||
mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
|
||||
void initialize() override;
|
||||
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
|
||||
|
||||
private:
|
||||
mtmd_audio_cache cache;
|
||||
};
|
||||
|
||||
//
|
||||
// streaming ISTFT - converts spectrogram frames back to audio one frame at a time
|
||||
//
|
||||
struct mtmd_audio_streaming_istft {
|
||||
mtmd_audio_streaming_istft(int n_fft, int hop_length);
|
||||
|
||||
// reset streaming state
|
||||
void reset();
|
||||
|
||||
// process a single STFT frame (streaming)
|
||||
// frame_spectrum: [n_fft_bins x 2] interleaved real/imag
|
||||
// returns: up to hop_length samples
|
||||
std::vector<float> process_frame(const float * frame_spectrum);
|
||||
|
||||
// flush remaining samples at end of stream
|
||||
std::vector<float> flush();
|
||||
|
||||
private:
|
||||
int n_fft;
|
||||
int hop_length;
|
||||
int n_fft_bins;
|
||||
|
||||
// Own cache for output processing
|
||||
mtmd_audio_cache cache;
|
||||
|
||||
// Streaming state
|
||||
std::vector<float> overlap_buffer;
|
||||
std::vector<float> window_sum_buffer;
|
||||
int padding_to_remove;
|
||||
|
||||
// Working buffers for IFFT
|
||||
std::vector<float> ifft_in;
|
||||
std::vector<float> ifft_out;
|
||||
};
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
#include "arg.h"
|
||||
#include "debug.h"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
@@ -89,8 +88,6 @@ struct mtmd_cli_context {
|
||||
int n_threads = 1;
|
||||
llama_pos n_past = 0;
|
||||
|
||||
base_callback_data cb_data;
|
||||
|
||||
mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
|
||||
model = llama_init->model();
|
||||
lctx = llama_init->context();
|
||||
@@ -142,10 +139,6 @@ struct mtmd_cli_context {
|
||||
mparams.warmup = params.warmup;
|
||||
mparams.image_min_tokens = params.image_min_tokens;
|
||||
mparams.image_max_tokens = params.image_max_tokens;
|
||||
if (std::getenv("MTMD_DEBUG_GRAPH") != nullptr) {
|
||||
mparams.cb_eval_user_data = &cb_data;
|
||||
mparams.cb_eval = common_debug_cb_eval<false>;
|
||||
}
|
||||
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
|
||||
if (!ctx_vision.get()) {
|
||||
LOG_ERR("Failed to load vision model from %s\n", clip_path);
|
||||
|
||||
@@ -111,8 +111,6 @@ mtmd_context_params mtmd_context_params_default() {
|
||||
/* warmup */ true,
|
||||
/* image_min_tokens */ -1,
|
||||
/* image_max_tokens */ -1,
|
||||
/* cb_eval */ nullptr,
|
||||
/* cb_eval_user_data */ nullptr,
|
||||
};
|
||||
return params;
|
||||
}
|
||||
@@ -148,6 +146,8 @@ struct mtmd_context {
|
||||
bool tok_row_end_trail = false;
|
||||
bool ov_img_first = false;
|
||||
|
||||
bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
|
||||
|
||||
// string template for slice image delimiters with row/col (idefics3)
|
||||
std::string sli_img_start_tmpl;
|
||||
|
||||
@@ -178,8 +178,6 @@ struct mtmd_context {
|
||||
/* image_min_tokens */ ctx_params.image_min_tokens,
|
||||
/* image_max_tokens */ ctx_params.image_max_tokens,
|
||||
/* warmup */ ctx_params.warmup,
|
||||
/* cb_eval */ ctx_params.cb_eval,
|
||||
/* cb_eval_user_data */ ctx_params.cb_eval_user_data,
|
||||
};
|
||||
|
||||
auto res = clip_init(mmproj_fname, ctx_clip_params);
|
||||
@@ -219,6 +217,7 @@ struct mtmd_context {
|
||||
|
||||
void init_vision() {
|
||||
GGML_ASSERT(ctx_v != nullptr);
|
||||
use_mrope = clip_is_mrope(ctx_v);
|
||||
|
||||
projector_type proj = clip_get_projector_type(ctx_v);
|
||||
int minicpmv_version = clip_is_minicpmv(ctx_v);
|
||||
@@ -267,7 +266,7 @@ struct mtmd_context {
|
||||
}
|
||||
|
||||
// set boi/eoi
|
||||
if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
|
||||
if (proj == PROJECTOR_TYPE_GEMMA3) {
|
||||
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
||||
img_beg = "<start_of_image>";
|
||||
img_end = "<end_of_image>";
|
||||
@@ -284,7 +283,7 @@ struct mtmd_context {
|
||||
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
|
||||
img_end = "[IMG_END]";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
|
||||
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) {
|
||||
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
||||
img_beg = "<|vision_start|>";
|
||||
img_end = "<|vision_end|>";
|
||||
@@ -331,7 +330,6 @@ struct mtmd_context {
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||
break;
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
@@ -354,9 +352,6 @@ struct mtmd_context {
|
||||
// [BEGIN_AUDIO] ... (embeddings) ...
|
||||
aud_beg = "[BEGIN_AUDIO]";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
|
||||
// <sound> ... (embeddings) ...
|
||||
aud_beg = "<sound>";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -628,7 +623,7 @@ struct mtmd_tokenizer {
|
||||
}
|
||||
|
||||
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
||||
if (mtmd_decode_use_mrope(ctx)) {
|
||||
if (ctx->use_mrope) {
|
||||
// for Qwen2VL, we need this information for M-RoPE decoding positions
|
||||
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
|
||||
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
|
||||
@@ -863,24 +858,14 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
|
||||
}
|
||||
|
||||
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
|
||||
switch (ctx->proj_type_v()) {
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool mtmd_decode_use_mrope(mtmd_context * ctx) {
|
||||
switch (ctx->proj_type_v()) {
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
return ctx->use_mrope;
|
||||
}
|
||||
|
||||
bool mtmd_support_vision(mtmd_context * ctx) {
|
||||
|
||||
@@ -27,9 +27,6 @@
|
||||
* - Make sure the C API is aligned with the libllama C API (as in llama.h)
|
||||
* - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
|
||||
* - Keep the API minimal, do not expose internal details unless necessary
|
||||
*
|
||||
* IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
|
||||
* We encourage human contributors to ensure the quality and reliability of the codebase.
|
||||
*/
|
||||
|
||||
#ifdef LLAMA_SHARED
|
||||
@@ -95,10 +92,6 @@ struct mtmd_context_params {
|
||||
// limit number of image tokens, only for vision models with dynamic resolution
|
||||
int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
|
||||
int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
|
||||
|
||||
// callback function passed over to mtmd proper
|
||||
ggml_backend_sched_eval_callback cb_eval;
|
||||
void * cb_eval_user_data;
|
||||
};
|
||||
|
||||
MTMD_API const char * mtmd_default_marker(void);
|
||||
@@ -277,12 +270,12 @@ struct bitmap {
|
||||
ptr.reset(mtmd_bitmap_init(nx, ny, data));
|
||||
}
|
||||
~bitmap() = default;
|
||||
uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
|
||||
uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
|
||||
const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); }
|
||||
size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); }
|
||||
std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }
|
||||
void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); }
|
||||
uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
|
||||
uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
|
||||
const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
|
||||
size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
|
||||
std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
|
||||
void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
|
||||
};
|
||||
|
||||
struct bitmaps {
|
||||
@@ -306,8 +299,8 @@ struct input_chunks {
|
||||
input_chunks() = default;
|
||||
input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
|
||||
~input_chunks() = default;
|
||||
size_t size() const { return mtmd_input_chunks_size(ptr.get()); }
|
||||
const mtmd_input_chunk * operator[](size_t idx) const {
|
||||
size_t size() { return mtmd_input_chunks_size(ptr.get()); }
|
||||
const mtmd_input_chunk * operator[](size_t idx) {
|
||||
return mtmd_input_chunks_get(ptr.get(), idx);
|
||||
}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user