model: add support for EmbeddingGemma SentenceTransformers Dense modules (#16367)
* model: EmbeddingGemma sentence-transformers dense linear projections support

* model: add support for EmbeddingGemma SentenceTransformers dense linear projections

  Adding support for the Dense modules used in EmbeddingGemma models.
  EmbeddingGemma is a SentenceTransformers model with additional modules beyond the base Transformer backbone.
  See: https://developers.googleblog.com/en/gemma-explained-embeddinggemma-architecture-and-recipe/

* model: add support for EmbeddingGemma SentenceTransformers dense linear projections
  - converting a model with dense layers is optional
  - introduced dense config params

* Update convert_hf_to_gguf.py

  Co-authored-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* fixed formatting issues

* Update src/llama-graph.cpp

  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* - removed pooling_type_opt, always allow overriding pooling_type
  - asserts checking dense feature dims

* fix python lint

* fix ubuntu gcc build warning

* - fixed thread-safety test
  - moved asserts to load_hparams

* - tidying up code
  - simplifying graph-context expecting both dense weights

* minor: add TODO

---------

Co-authored-by: Daniel Bevenius <daniel.bevenius@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
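For orientation, here is a rough sketch of how the SentenceTransformers module stack of EmbeddingGemma maps onto the pieces added by this change. The module names 2_Dense and 3_Dense come from the hparams comments below; the surrounding module indices and the final normalization step follow the usual sentence-transformers layout and are assumptions, not something this diff establishes.

// sketch only: assumed EmbeddingGemma SentenceTransformers stack -> llama.cpp mapping
//
//   0_Transformer -> existing embedding-Gemma backbone (unchanged by this change)
//   1_Pooling     -> llm_graph_context::build_pooling()
//   2_Dense       -> GGUF tensor "dense_2", KV %s.dense_2_feat_in / %s.dense_2_feat_out
//   3_Dense       -> GGUF tensor "dense_3", KV %s.dense_3_feat_in / %s.dense_3_feat_out
//   Normalize     -> assumed to be applied by the caller; no tensor is stored for it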
@@ -219,6 +219,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },

    { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
    // sentence-transformers dense modules feature dims
    { LLM_KV_DENSE_2_FEAT_IN,  "%s.dense_2_feat_in" },
    { LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" },
    { LLM_KV_DENSE_3_FEAT_IN,  "%s.dense_3_feat_in" },
    { LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" },

    { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
    { LLM_KV_TOKENIZER_PRE,   "tokenizer.ggml.pre" },
@@ -1071,6 +1076,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
    { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT,      "output" },
    { LLM_TENSOR_DENSE_2_OUT, "dense_2" },
    { LLM_TENSOR_DENSE_3_OUT, "dense_3" },
    { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
@@ -2281,6 +2288,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_OUTPUT,          {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_CLS,             {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_CLS_OUT,         {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_DENSE_2_OUT,     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
    {LLM_TENSOR_DENSE_3_OUT,     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
    {LLM_TENSOR_OUTPUT_NORM,     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
    {LLM_TENSOR_DEC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
    {LLM_TENSOR_ENC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
@@ -271,6 +271,12 @@ enum llm_kv {
    LLM_KV_TOKENIZER_PREFIX_ID,
    LLM_KV_TOKENIZER_SUFFIX_ID,
    LLM_KV_TOKENIZER_MIDDLE_ID,

    // sentence-transformers dense layers in and out features
    LLM_KV_DENSE_2_FEAT_IN,
    LLM_KV_DENSE_2_FEAT_OUT,
    LLM_KV_DENSE_3_FEAT_IN,
    LLM_KV_DENSE_3_FEAT_OUT,
};

enum llm_tensor {
@@ -278,6 +284,8 @@ enum llm_tensor {
    LLM_TENSOR_TOKEN_EMBD_NORM,
    LLM_TENSOR_TOKEN_TYPES,
    LLM_TENSOR_POS_EMBD,
    LLM_TENSOR_DENSE_2_OUT,
    LLM_TENSOR_DENSE_3_OUT,
    LLM_TENSOR_OUTPUT,
    LLM_TENSOR_OUTPUT_NORM,
    LLM_TENSOR_ROPE_FREQS,
@@ -2346,6 +2346,12 @@ llama_context * llama_init_from_model(
        return nullptr;
    }

    if (params.pooling_type != model->hparams.pooling_type) {
        // user-specified pooling-type is different from the model default
        LLAMA_LOG_WARN("%s: model default pooling_type is [%d], but [%d] was specified\n", __func__,
                model->hparams.pooling_type, params.pooling_type);
    }

    try {
        auto * ctx = new llama_context(*model, params);
        return ctx;
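For context, a minimal caller-side sketch of how the pooling-type override and the pooled (dense-projected) embeddings are exercised from the public llama.h API. This is not part of the commit; the model path, prompt, and buffer sizes are illustrative, and error handling is omitted.

// sketch: request pooled embeddings from an EmbeddingGemma GGUF (illustrative only)
#include "llama.h"
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("embeddinggemma.gguf", mparams);

    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings   = true;                    // we want embeddings, not logits
    cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN; // differing from the model default logs the warning above
    llama_context * ctx = llama_init_from_model(model, cparams);

    // tokenize a single input and place it on sequence 0
    const llama_vocab * vocab = llama_model_get_vocab(model);
    const char * text = "hello embedding world";
    std::vector<llama_token> tokens(128);
    const int n_tok = llama_tokenize(vocab, text, (int) strlen(text), tokens.data(), (int) tokens.size(), true, false);
    tokens.resize(n_tok);

    llama_batch batch = llama_batch_init(n_tok, 0, 1);
    for (int i = 0; i < n_tok; ++i) {
        batch.token   [batch.n_tokens] = tokens[i];
        batch.pos     [batch.n_tokens] = i;
        batch.n_seq_id[batch.n_tokens] = 1;
        batch.seq_id  [batch.n_tokens][0] = 0;
        batch.logits  [batch.n_tokens] = true; // request output for every token so pooling sees all of them
        batch.n_tokens++;
    }
    llama_decode(ctx, batch);

    // with the dense modules present, this is the dense_3 output; still n_embd floats per sequence
    const float * emb = llama_get_embeddings_seq(ctx, 0);
    printf("n_embd = %d, emb[0] = %f\n", llama_model_n_embd(model), emb ? emb[0] : 0.0f);

    llama_batch_free(batch);
    llama_free(ctx);
    llama_model_free(model);
    return 0;
}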
@@ -1853,6 +1853,23 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
    return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
}

void llm_graph_context::build_dense_out(
        ggml_tensor * dense_2,
        ggml_tensor * dense_3) const {
    if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
        return;
    }
    ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
    GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");

    cur = ggml_mul_mat(ctx0, dense_2, cur);
    cur = ggml_mul_mat(ctx0, dense_3, cur);
    cb(cur, "result_embd_pooled", -1);
    res->t_embd_pooled = cur;
    ggml_build_forward_expand(gf, cur);
}

void llm_graph_context::build_pooling(
        ggml_tensor * cls,
        ggml_tensor * cls_b,
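To make the shapes in build_dense_out above concrete, here is a short sketch of how the pooled embedding flows through the two projections, assuming the ggml convention that ggml_mul_mat(a, b) with a of shape [k, m] and b of shape [k, n] yields [m, n] (shapes written as ne0 x ne1); the weight shapes are the ones created in load_tensors below.

// sketch only: shape flow through build_dense_out
//
//   cur (pooled)                     : [n_embd,           n_seqs]
//   dense_2                          : [n_embd,           dense_2_feat_out]
//   ggml_mul_mat(ctx0, dense_2, cur) : [dense_2_feat_out, n_seqs]
//   dense_3                          : [dense_3_feat_in,  n_embd]   (the second mul_mat requires dense_3_feat_in == dense_2_feat_out)
//   ggml_mul_mat(ctx0, dense_3, cur) : [n_embd,           n_seqs]   -> stored as res->t_embd_pooled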
@@ -814,6 +814,14 @@ struct llm_graph_context {
            ggml_tensor * cls_b,
            ggml_tensor * cls_out,
            ggml_tensor * cls_out_b) const;

    //
    // dense (out)
    //

    void build_dense_out(
            ggml_tensor * dense_2,
            ggml_tensor * dense_3) const;
};

// TODO: better name
@@ -169,6 +169,12 @@ struct llama_hparams {
    uint32_t laurel_rank  = 64;
    uint32_t n_embd_altup = 256;

    // needed for sentence-transformers dense layers
    uint32_t dense_2_feat_in  = 0; // in_features of the 2_Dense
    uint32_t dense_2_feat_out = 0; // out_features of the 2_Dense
    uint32_t dense_3_feat_in  = 0; // in_features of the 3_Dense
    uint32_t dense_3_feat_out = 0; // out_features of the 3_Dense

    // xIELU
    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_n;
    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_p;
@@ -1218,12 +1218,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                hparams.set_swa_pattern(6);

                hparams.causal_attn = false; // embeddings do not use causal attention
                hparams.rope_freq_base_train_swa  = 10000.0f;
                hparams.rope_freq_scale_train_swa = 1.0f;

                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);

                // applied only if model converted with --sentence-transformers-dense-modules
                ml.get_key(LLM_KV_DENSE_2_FEAT_IN,  hparams.dense_2_feat_in,  false);
                ml.get_key(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out, false);
                ml.get_key(LLM_KV_DENSE_3_FEAT_IN,  hparams.dense_3_feat_in,  false);
                ml.get_key(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out, false);

                GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
                GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");

                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_0_3B; break;
@@ -3686,6 +3695,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                }

                // Dense linear weights
                dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.dense_2_feat_out}, TENSOR_NOT_REQUIRED);
                dense_3_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_3_OUT, "weight"), {hparams.dense_3_feat_in, n_embd}, TENSOR_NOT_REQUIRED);

                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
@@ -19893,6 +19907,12 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
    // add on pooling layer
    llm->build_pooling(cls, cls_b, cls_out, cls_out_b);

    // if the gguf model was converted with --sentence-transformers-dense-modules
    // there will be two additional dense projection layers
    // dense linear projections are applied after pooling
    // TODO: move reranking logic here and generalize
    llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);

    return llm->res->get_gf();
}
@@ -438,6 +438,12 @@ struct llama_model {

    std::vector<llama_layer> layers;

    // Dense linear projections for SentenceTransformers models like embeddinggemma
    // For Sentence Transformers models structure see
    // https://sbert.net/docs/sentence_transformer/usage/custom_models.html#structure-of-sentence-transformer-models
    struct ggml_tensor * dense_2_out_layers = nullptr;
    struct ggml_tensor * dense_3_out_layers = nullptr;

    llama_model_params params;

    // gguf metadata