Initialize the project; model provided by the ModelHub XC community
Model: shibing624/text2vec-bge-large-chinese
Source: Original Platform

.gitattributes (vendored, new file)
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

1_Pooling/config.json (new file)
@@ -0,0 +1,7 @@
{
  "word_embedding_dimension": 1024,
  "pooling_mode_cls_token": false,
  "pooling_mode_mean_tokens": true,
  "pooling_mode_max_tokens": false,
  "pooling_mode_mean_sqrt_len_tokens": false
}
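
This pooling config selects mean pooling over the 1024-dim token embeddings. For illustration, an equivalent module could be built directly (a sketch assuming the sentence-transformers `models` API):

```python
from sentence_transformers import models

# Mirrors 1_Pooling/config.json: mean-pool 1024-dim token embeddings
pooling = models.Pooling(
    word_embedding_dimension=1024,
    pooling_mode_cls_token=False,
    pooling_mode_mean_tokens=True,
    pooling_mode_max_tokens=False,
)
```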

README.md (new file)
@@ -0,0 +1,184 @@
---
pipeline_tag: sentence-similarity
license: apache-2.0
tags:
- text2vec
- feature-extraction
- sentence-similarity
- transformers
- sentence-transformers
- onnx
language:
- zh
metrics:
- spearmanr
library_name: sentence-transformers
---

# shibing624/text2vec-bge-large-chinese

This is a CoSENT (Cosine Sentence) model: shibing624/text2vec-bge-large-chinese.

It maps sentences to a 1024-dimensional dense vector space and can be used for tasks
like sentence embeddings, text matching or semantic search.

- training dataset: https://huggingface.co/datasets/shibing624/nli-zh-all/tree/main/text2vec-base-chinese-paraphrase-dataset
- base model: https://huggingface.co/BAAI/bge-large-zh-noinstruct
- max_seq_length: 256
- best epoch: 4
- sentence embedding dim: 1024

## Evaluation

For an automated evaluation of this model, see the *Evaluation Benchmark*: [text2vec](https://github.com/shibing624/text2vec)

### Release Models

- Chinese matching evaluation results for the models released by this project:

| Arch | BaseModel | Model | ATEC | BQ | LCQMC | PAWSX | STS-B | SOHU-dd | SOHU-dc | Avg | QPS |
|:-----|:----------|:------|:----:|:--:|:-----:|:-----:|:-----:|:-------:|:-------:|:---:|:---:|
| Word2Vec | word2vec | [w2v-light-tencent-chinese](https://ai.tencent.com/ailab/nlp/en/download.html) | 20.00 | 31.49 | 59.46 | 2.57 | 55.78 | 55.04 | 20.70 | 35.03 | 23769 |
| SBERT | xlm-roberta-base | [sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2) | 18.42 | 38.52 | 63.96 | 10.14 | 78.90 | 63.01 | 52.28 | 46.46 | 3138 |
| CoSENT | hfl/chinese-macbert-base | [shibing624/text2vec-base-chinese](https://huggingface.co/shibing624/text2vec-base-chinese) | 31.93 | 42.67 | 70.16 | 17.21 | 79.30 | 70.27 | 50.42 | 51.61 | 3008 |
| CoSENT | hfl/chinese-lert-large | [GanymedeNil/text2vec-large-chinese](https://huggingface.co/GanymedeNil/text2vec-large-chinese) | 32.61 | 44.59 | 69.30 | 14.51 | 79.44 | 73.01 | 59.04 | 53.12 | 2092 |
| CoSENT | nghuyong/ernie-3.0-base-zh | [shibing624/text2vec-base-chinese-sentence](https://huggingface.co/shibing624/text2vec-base-chinese-sentence) | 43.37 | 61.43 | 73.48 | 38.90 | 78.25 | 70.60 | 53.08 | 59.87 | 3089 |
| CoSENT | nghuyong/ernie-3.0-base-zh | [shibing624/text2vec-base-chinese-paraphrase](https://huggingface.co/shibing624/text2vec-base-chinese-paraphrase) | 44.89 | 63.58 | 74.24 | 40.90 | 78.93 | 76.70 | 63.30 | **63.08** | 3066 |
| CoSENT | sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 | [shibing624/text2vec-base-multilingual](https://huggingface.co/shibing624/text2vec-base-multilingual) | 32.39 | 50.33 | 65.64 | 32.56 | 74.45 | 68.88 | 51.17 | 53.67 | 3138 |
| CoSENT | BAAI/bge-large-zh-noinstruct | [shibing624/text2vec-bge-large-chinese](https://huggingface.co/shibing624/text2vec-bge-large-chinese) | 38.41 | 61.34 | 71.72 | 35.15 | 76.44 | 71.81 | 63.15 | 59.72 | 844 |

Notes:
- Evaluation metric: Spearman coefficient (a minimal scoring sketch follows this list)
- The `shibing624/text2vec-base-chinese` model was trained with the CoSENT method on the Chinese STS-B data, based on `hfl/chinese-macbert-base`, and performs well on the Chinese STS-B test set. Run [examples/training_sup_text_matching_model.py](https://github.com/shibing624/text2vec/blob/master/examples/training_sup_text_matching_model.py) to train such a model. The model files are on the HF model hub; recommended for general Chinese semantic matching tasks.
- The `shibing624/text2vec-base-chinese-sentence` model was trained with the CoSENT method, based on `nghuyong/ernie-3.0-base-zh`, on the manually curated Chinese STS dataset [shibing624/nli-zh-all/text2vec-base-chinese-sentence-dataset](https://huggingface.co/datasets/shibing624/nli-zh-all/tree/main/text2vec-base-chinese-sentence-dataset), and performs well on the Chinese NLI test sets. Run [examples/training_sup_text_matching_model_jsonl_data.py](https://github.com/shibing624/text2vec/blob/master/examples/training_sup_text_matching_model_jsonl_data.py) to train such a model. The model files are on the HF model hub; recommended for Chinese s2s (sentence vs. sentence) semantic matching tasks.
- The `shibing624/text2vec-base-chinese-paraphrase` model was trained with the CoSENT method, based on `nghuyong/ernie-3.0-base-zh`, on the manually curated Chinese STS dataset [shibing624/nli-zh-all/text2vec-base-chinese-paraphrase-dataset](https://huggingface.co/datasets/shibing624/nli-zh-all/tree/main/text2vec-base-chinese-paraphrase-dataset), which adds s2p (sentence-to-paraphrase) data relative to [shibing624/nli-zh-all/text2vec-base-chinese-sentence-dataset](https://huggingface.co/datasets/shibing624/nli-zh-all/tree/main/text2vec-base-chinese-sentence-dataset) to strengthen long-text representation. It reaches SOTA on the Chinese NLI test sets. Run [examples/training_sup_text_matching_model_jsonl_data.py](https://github.com/shibing624/text2vec/blob/master/examples/training_sup_text_matching_model_jsonl_data.py) to train such a model. The model files are on the HF model hub; recommended for Chinese s2p (sentence vs. paragraph) semantic matching tasks.
- The `shibing624/text2vec-base-multilingual` model was trained with the CoSENT method, based on `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`, on the manually curated multilingual STS dataset [shibing624/nli-zh-all/text2vec-base-multilingual-dataset](https://huggingface.co/datasets/shibing624/nli-zh-all/tree/main/text2vec-base-multilingual-dataset), and improves over the base model on Chinese and English test sets. Run [examples/training_sup_text_matching_model_jsonl_data.py](https://github.com/shibing624/text2vec/blob/master/examples/training_sup_text_matching_model_jsonl_data.py) to train such a model. The model files are on the HF model hub; recommended for multilingual semantic matching tasks.
- The `shibing624/text2vec-bge-large-chinese` model was trained with the CoSENT method, based on `BAAI/bge-large-zh-noinstruct`, on the manually curated Chinese STS dataset [shibing624/nli-zh-all/text2vec-base-chinese-paraphrase-dataset](https://huggingface.co/datasets/shibing624/nli-zh-all/tree/main/text2vec-base-chinese-paraphrase-dataset). It improves over the base model on Chinese test sets, most notably in short-text discrimination. Run [examples/training_sup_text_matching_model_jsonl_data.py](https://github.com/shibing624/text2vec/blob/master/examples/training_sup_text_matching_model_jsonl_data.py) to train such a model. The model files are on the HF model hub; recommended for Chinese s2s (sentence vs. sentence) semantic matching tasks.
- `w2v-light-tencent-chinese` is a Word2Vec model built from Tencent word vectors; it loads on CPU and suits literal Chinese text matching and cold-start scenarios with little data.
- All pretrained models can be loaded through transformers, e.g. the MacBERT model: `--model_name hfl/chinese-macbert-base`, or the RoBERTa model: `--model_name uer/roberta-medium-wwm-chinese-cluecorpussmall`
- To gauge robustness, the SOHU test sets, which none of the models were trained on, were added to measure generalization; to make the models usable out of the box, all collected Chinese matching datasets were used for training, and the datasets have also been uploaded to HF datasets ([links below](#数据集))
- Experiments on Chinese matching tasks show the best pooling choices are `EncoderType.FIRST_LAST_AVG` and `EncoderType.MEAN`; their prediction quality differs only marginally
- To reproduce the Chinese matching evaluation results, download the Chinese matching datasets into `examples/data` and run [tests/model_spearman.py](https://github.com/shibing624/text2vec/blob/master/tests/model_spearman.py)
- QPS was measured on GPU: Tesla V100 with 32 GB memory
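
As a reference for the metric above, a minimal scoring sketch (assuming `scipy` is installed; the scores and labels here are made-up placeholders):

```python
from scipy.stats import spearmanr

# Hypothetical model cosine scores and gold similarity labels for the same pairs
cos_scores = [0.91, 0.12, 0.57, 0.78]
gold_labels = [5, 0, 2, 4]

corr, _ = spearmanr(cos_scores, gold_labels)
print(f"Spearman: {corr:.4f}")  # rank correlation in [-1, 1]
```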

Model training experiment report: [experiment report](https://github.com/shibing624/text2vec/blob/master/docs/model_report.md)

## Usage (text2vec)

Using this model becomes easy when you have [text2vec](https://github.com/shibing624/text2vec) installed:

```
pip install -U text2vec
```

Then you can use the model like this:

```python
from text2vec import SentenceModel

sentences = ['如何更换花呗绑定银行卡', '花呗更改绑定银行卡']

model = SentenceModel('shibing624/text2vec-bge-large-chinese')
embeddings = model.encode(sentences)
print(embeddings)
```
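
To turn these embeddings into a similarity score, one option is plain cosine similarity over the returned array (a minimal sketch, assuming `model.encode` returns a numpy array as above):

```python
import numpy as np

# embeddings has shape (2, 1024): one vector per input sentence
a, b = embeddings[0], embeddings[1]
cosine = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
print(f"cosine similarity: {cosine:.4f}")  # values near 1.0 indicate near-paraphrases
```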

## Usage (HuggingFace Transformers)

Without [text2vec](https://github.com/shibing624/text2vec), you can use the model like this:

First, you pass your input through the transformer model, then you apply the right pooling operation on top of the contextualized word embeddings.

Install transformers:

```
pip install transformers
```

Then load the model and predict:

```python
from transformers import BertTokenizer, BertModel
import torch


# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Load model from HuggingFace Hub
tokenizer = BertTokenizer.from_pretrained('shibing624/text2vec-bge-large-chinese')
model = BertModel.from_pretrained('shibing624/text2vec-bge-large-chinese')
sentences = ['如何更换花呗绑定银行卡', '花呗更改绑定银行卡']

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
print("Sentence embeddings:")
print(sentence_embeddings)
```

## Usage (sentence-transformers)

[sentence-transformers](https://github.com/UKPLab/sentence-transformers) is a popular library to compute dense vector representations for sentences.

Install sentence-transformers:

```
pip install -U sentence-transformers
```

Then load the model and predict:

```python
from sentence_transformers import SentenceTransformer

m = SentenceTransformer("shibing624/text2vec-bge-large-chinese")
sentences = ['如何更换花呗绑定银行卡', '花呗更改绑定银行卡']

sentence_embeddings = m.encode(sentences)
print("Sentence embeddings:")
print(sentence_embeddings)
```
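
Pairwise scores can also be computed from these embeddings with the library's utility helper (a sketch using `sentence_transformers.util`; exact helpers may vary across versions):

```python
from sentence_transformers import util

# Full pairwise cosine matrix; entry [0][1] compares the two sentences above
scores = util.cos_sim(sentence_embeddings, sentence_embeddings)
print(scores[0][1])
```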

## Full Model Architecture

```
CoSENT(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_mean_tokens': True})
)
```

## Intended uses

Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures
the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.

By default, input text longer than 256 word pieces is truncated; a sketch of enforcing this explicitly follows.
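
For example (a minimal sketch reusing the HF tokenizer and sentences from the Transformers section; `max_length` here is an explicit choice matching the training `max_seq_length`):

```python
# Truncate explicitly to the 256-word-piece training limit
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    max_length=256,  # inputs longer than this are cut off
    return_tensors='pt',
)
```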

## Training procedure

### Pre-training

We use the pretrained https://huggingface.co/BAAI/bge-large-zh-noinstruct model.
Please refer to the model card for more detailed information about the pre-training procedure.

### Fine-tuning

We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity for each
possible sentence pair in the batch.
We then apply a rank loss that compares true pairs against false pairs; a sketch of this loss follows.
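
A minimal sketch of such a rank loss in the CoSENT style (an illustrative PyTorch version of the stated objective, not the exact training code; the scale factor of 20 and binary labels are assumptions):

```python
import torch

def cosent_rank_loss(cos_scores: torch.Tensor, labels: torch.Tensor, scale: float = 20.0) -> torch.Tensor:
    """cos_scores: (N,) cosine similarities, one per sentence pair in the batch.
    labels: (N,) 1 for true (similar) pairs, 0 for false pairs."""
    s = cos_scores * scale
    # Penalize every case where a false pair outranks a true pair:
    # collect s_i - s_j for all (i, j) with label_i < label_j.
    diff = s[:, None] - s[None, :]                   # (N, N) score differences
    active = labels[:, None] < labels[None, :]       # (false, true) index pairs
    diff = diff.masked_fill(~active, float('-inf'))  # inactive pairs contribute 0
    diff = torch.cat([torch.zeros(1, device=s.device), diff.flatten()])
    return torch.logsumexp(diff, dim=0)              # log(1 + sum exp(s_i - s_j))
```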

## Citing & Authors

This model was trained by [text2vec](https://github.com/shibing624/text2vec).

If you find this model helpful, feel free to cite:

```bibtex
@software{text2vec,
  author = {Ming Xu},
  title = {text2vec: A Tool for Text to Vector},
  year = {2023},
  url = {https://github.com/shibing624/text2vec},
}
```

config.json (new file)
@@ -0,0 +1,40 @@
{
  "_name_or_path": "shibing624/text2vec-bge-large-chinese",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "directionality": "bidi",
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.31.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}
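
This is a BERT-large-sized encoder (24 layers, 16 heads, hidden size 1024) with the 21128-token Chinese BERT vocab. A quick way to inspect these values (a sketch assuming transformers is installed):

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained('shibing624/text2vec-bge-large-chinese')
print(config.num_hidden_layers, config.num_attention_heads, config.hidden_size)
# 24 16 1024
```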

eval_results.txt (new file)
@@ -0,0 +1,2 @@
eval_pearson = 0.5551856961372815
eval_spearman = 0.5470897314949114

model.safetensors (new file)
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7e6b8b127408144cb6b627895f2da831ef0bb6b232c2f8bf6f6bed47133454d9
size 1302134568

onnx/config.json (new file)
@@ -0,0 +1,40 @@
{
  "_attn_implementation_autoset": true,
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "directionality": "bidi",
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

onnx/model.onnx (new file)
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:15714229468fee8abaf63499d512fa49ebfa2f8089a60c8823e65ec3ea19404b
size 1298440934
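
Since an ONNX export ships with the repo, inference can bypass PyTorch entirely (a sketch assuming `onnxruntime` and that the exported graph takes the standard BERT inputs; check `session.get_inputs()` for the exact names):

```python
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('shibing624/text2vec-bge-large-chinese')
session = ort.InferenceSession('onnx/model.onnx')  # assumes a local checkout

enc = tokenizer(['如何更换花呗绑定银行卡'], padding=True, truncation=True,
                max_length=256, return_tensors='np')
outputs = session.run(None, dict(enc))  # input names assumed to match tokenizer keys
token_embeddings = outputs[0]           # (batch, seq_len, 1024)

# Mean-pool with the attention mask, as in the README's PyTorch example
mask = enc['attention_mask'][..., None].astype(np.float32)
sentence_embedding = (token_embeddings * mask).sum(1) / np.clip(mask.sum(1), 1e-9, None)
```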

onnx/special_tokens_map.json (new file)
@@ -0,0 +1,37 @@
{
  "cls_token": {
    "content": "[CLS]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "[MASK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "[PAD]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "[SEP]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "[UNK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}

onnx/tokenizer.json (new file, 21292 lines)
File diff suppressed because it is too large

onnx/tokenizer_config.json (new file)
@@ -0,0 +1,65 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "101": {
      "content": "[CLS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "102": {
      "content": "[SEP]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "103": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": true,
  "extra_special_tokens": {},
  "mask_token": "[MASK]",
  "max_length": 256,
  "model_max_length": 512,
  "never_split": null,
  "pad_to_multiple_of": null,
  "pad_token": "[PAD]",
  "pad_token_type_id": 0,
  "padding_side": "right",
  "sep_token": "[SEP]",
  "stride": 0,
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "truncation_side": "right",
  "truncation_strategy": "longest_first",
  "unk_token": "[UNK]"
}

onnx/vocab.txt (new file, 21128 lines)
File diff suppressed because it is too large

pytorch_model.bin (new file)
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9a15a7e95aa1c02184301c06d8bfd93921b9a007515e8f864432d19ffbd5e500
size 1302221545

special_tokens_map.json (new file)
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}

tokenizer.json (new file, 21292 lines)
File diff suppressed because it is too large

tokenizer_config.json (new file)
@@ -0,0 +1,15 @@
{
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": true,
  "mask_token": "[MASK]",
  "model_max_length": 512,
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
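
Per this config, Chinese text is lower-cased and split per character (`tokenize_chinese_chars`). A quick check (a sketch assuming transformers is installed):

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('shibing624/text2vec-bge-large-chinese')
print(tokenizer.tokenize('如何更换花呗绑定银行卡'))
# each Chinese character becomes its own token
```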

training_progress_scores.csv (new file)
@@ -0,0 +1,6 @@
global_step,train_loss,eval_spearman,eval_pearson
6217,1.187222957611084,0.5114652838626846,0.4941802088748416
12434,0.5630741119384766,0.529277507678798,0.5185274121904764
18651,0.4950772523880005,0.4936530600465713,0.4916064648614785
24868,0.06126270815730095,0.5470897314949114,0.5551856961372815
31085,0.09893279522657394,0.5114652838626846,0.5117443179915746