model: EmbeddingGemma Adding Support for SentenceTransformers Dense Modules (#16367)

* model: EmbeddingGemma sentence-transformers dense linear projections support * model: add support for EmbeddingGemma SentenceTransformers dense linear projections Adding support for the Dense modules used in EmbeddingGemma models. EmbeddingGemma is a SentenceTransformers model with additional modules beyond the base Transformer backbone. See: https://developers.googleblog.com/en/gemma-explained-embeddinggemma-architecture-and-recipe/ * model: add support for EmbeddingGemma SentenceTransformers dense linear projections - converting model with dense-layers is optional - introduced dense config params * Update convert_hf_to_gguf.py Co-authored-by: Daniel Bevenius <daniel.bevenius@gmail.com> * fixed formatting issues * Update src/llama-graph.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * - removed pooling_type_opt, always allow overriding pooling_type - asserts checking dense features dims * fix python lint * fix ubuntu gcc build warning * - fixed thread-safety test - moved asserts to load_hparams * - tidying up code - simplifying graph-context expecting both dense weights * minor : add TODO --------- Co-authored-by: Daniel Bevenius <daniel.bevenius@gmail.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-10-09 08:39:18 +02:00
parent 12bbc3fa50
commit e08db42595
12 changed files with 170 additions and 7 deletions
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -219,6 +219,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },

    { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
+    // sentence-transformers dense modules feature dims
+    { LLM_KV_DENSE_2_FEAT_IN,        "%s.dense_2_feat_in"  },
+    { LLM_KV_DENSE_2_FEAT_OUT,       "%s.dense_2_feat_out"  },
+    { LLM_KV_DENSE_3_FEAT_IN,        "%s.dense_3_feat_in"   },
+    { LLM_KV_DENSE_3_FEAT_OUT,       "%s.dense_3_feat_out"  },

    { LLM_KV_TOKENIZER_MODEL,                "tokenizer.ggml.model"                    },
    { LLM_KV_TOKENIZER_PRE,                  "tokenizer.ggml.pre"                      },
@@ -1071,6 +1076,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_DENSE_2_OUT,     "dense_2" },
+            { LLM_TENSOR_DENSE_3_OUT,     "dense_3" },
            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
@@ -2281,6 +2288,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_OUTPUT,                     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_CLS,                        {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_CLS_OUT,                    {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DENSE_2_OUT,                {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
+    {LLM_TENSOR_DENSE_3_OUT,                {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
    {LLM_TENSOR_OUTPUT_NORM,                {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
    {LLM_TENSOR_DEC_OUTPUT_NORM,            {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
    {LLM_TENSOR_ENC_OUTPUT_NORM,            {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -271,6 +271,12 @@ enum llm_kv {
    LLM_KV_TOKENIZER_PREFIX_ID,
    LLM_KV_TOKENIZER_SUFFIX_ID,
    LLM_KV_TOKENIZER_MIDDLE_ID,
+
+    // sentence-transformers dense layers in and out features
+    LLM_KV_DENSE_2_FEAT_IN,
+    LLM_KV_DENSE_2_FEAT_OUT,
+    LLM_KV_DENSE_3_FEAT_IN,
+    LLM_KV_DENSE_3_FEAT_OUT,
 };

 enum llm_tensor {
@@ -278,6 +284,8 @@ enum llm_tensor {
    LLM_TENSOR_TOKEN_EMBD_NORM,
    LLM_TENSOR_TOKEN_TYPES,
    LLM_TENSOR_POS_EMBD,
+    LLM_TENSOR_DENSE_2_OUT,
+    LLM_TENSOR_DENSE_3_OUT,
    LLM_TENSOR_OUTPUT,
    LLM_TENSOR_OUTPUT_NORM,
    LLM_TENSOR_ROPE_FREQS,
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2346,6 +2346,12 @@ llama_context * llama_init_from_model(
        return nullptr;
    }

+    if (params.pooling_type != model->hparams.pooling_type) {
+        //user-specified pooling-type is different from the model default
+        LLAMA_LOG_WARN("%s: model default pooling_type is [%d], but [%d] was specified\n", __func__,
+                       model->hparams.pooling_type, params.pooling_type);
+    }
+
    try {
        auto * ctx = new llama_context(*model, params);
        return ctx;
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1853,6 +1853,23 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
    return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
 }

+void llm_graph_context::build_dense_out(
+    ggml_tensor * dense_2,
+    ggml_tensor * dense_3) const {
+    if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
+        return;
+    }
+    ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
+    GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");
+
+    cur = ggml_mul_mat(ctx0, dense_2, cur);
+    cur = ggml_mul_mat(ctx0, dense_3, cur);
+    cb(cur, "result_embd_pooled", -1);
+    res->t_embd_pooled = cur;
+    ggml_build_forward_expand(gf, cur);
+}
+
+
 void llm_graph_context::build_pooling(
        ggml_tensor * cls,
        ggml_tensor * cls_b,
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -814,6 +814,14 @@ struct llm_graph_context {
            ggml_tensor * cls_b,
            ggml_tensor * cls_out,
            ggml_tensor * cls_out_b) const;
+
+    //
+    // dense (out)
+    //
+
+    void build_dense_out(
+            ggml_tensor * dense_2,
+            ggml_tensor * dense_3) const;
 };

 // TODO: better name
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -169,6 +169,12 @@ struct llama_hparams {
    uint32_t laurel_rank  = 64;
    uint32_t n_embd_altup = 256;

+    // needed for sentence-transformers dense layers
+    uint32_t dense_2_feat_in  = 0;  // in_features of the 2_Dense
+    uint32_t dense_2_feat_out = 0;  // out_features of the 2_Dense
+    uint32_t dense_3_feat_in  = 0;  // in_features of the 3_Dense
+    uint32_t dense_3_feat_out = 0;  // out_features of the 3_Dense
+
    // xIELU
    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_n;
    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_p;
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1218,12 +1218,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                hparams.set_swa_pattern(6);

                hparams.causal_attn = false; // embeddings do not use causal attention
-                hparams.rope_freq_base_train_swa  = 10000.0f;
+                hparams.rope_freq_base_train_swa = 10000.0f;
                hparams.rope_freq_scale_train_swa = 1.0f;

-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_POOLING_TYPE,                hparams.pooling_type);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+                //applied only if model converted with --sentence-transformers-dense-modules
+                ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
+                ml.get_key(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out, false);
+                ml.get_key(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in, false);
+                ml.get_key(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out, false);
+
+                GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
+                GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");

                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_0_3B; break;
@@ -3686,6 +3695,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,   "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

+                    // Dense linear weights
+                    dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.dense_2_feat_out}, TENSOR_NOT_REQUIRED);
+                    dense_3_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_3_OUT, "weight"), {hparams.dense_3_feat_in, n_embd}, TENSOR_NOT_REQUIRED);
+
+
                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

@@ -19893,6 +19907,12 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
    // add on pooling layer
    llm->build_pooling(cls, cls_b, cls_out, cls_out_b);

+    // if the gguf model was converted with --sentence-transformers-dense-modules
+    // there will be two additional dense projection layers
+    // dense linear projections are applied after pooling
+    // TODO: move reranking logic here and generalize
+    llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
+
    return llm->res->get_gf();
 }

--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -438,6 +438,12 @@ struct llama_model {

    std::vector<llama_layer> layers;

+    //Dense linear projections for SentenceTransformers models like embeddinggemma
+    // For Sentence Transformers models structure see
+    // https://sbert.net/docs/sentence_transformer/usage/custom_models.html#structure-of-sentence-transformer-models
+    struct ggml_tensor * dense_2_out_layers = nullptr;
+    struct ggml_tensor * dense_3_out_layers = nullptr;
+
    llama_model_params params;

    // gguf metadata