初始化项目，由ModelHub XC社区提供模型

Model: AITeamVN/Vietnamese_Embedding Source: Original Platform
2026-05-14 12:24:49 +08:00
commit 8d60b5e81a
19 changed files with 619 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,3 @@
+onnx/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+onnx/model.onnx filter=lfs diff=lfs merge=lfs -text
+onnx/model.onnx_data filter=lfs diff=lfs merge=lfs -text
--- a/1_Pooling/config.json
+++ b/1_Pooling/config.json
@@ -0,0 +1,10 @@
+{
+  "word_embedding_dimension": 1024,
+  "pooling_mode_cls_token": true,
+  "pooling_mode_mean_tokens": false,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false,
+  "pooling_mode_weightedmean_tokens": false,
+  "pooling_mode_lasttoken": false,
+  "include_prompt": true
+}
--- a/README.md
+++ b/README.md
@@ -0,0 +1,93 @@
+---
+license: apache-2.0
+language:
+- vi
+base_model:
+- BAAI/bge-m3
+pipeline_tag: sentence-similarity
+library_name: sentence-transformers
+tags:
+- Embedding
+- onnx
+---
+
+ 
+## Model Card: Vietnamese_Embedding
+
+Vietnamese_Embedding is an embedding model fine-tuned from the BGE-M3 model (https://huggingface.co/BAAI/bge-m3) to enhance retrieval capabilities for Vietnamese.
+
+* The model was trained on approximately 300,000 triplets of queries, positive documents, and negative documents for Vietnamese.
+* The model was trained with a maximum sequence length of 2048.
+
+## Model Details
+
+### Model Description
+- **Model Type:** Sentence Transformer
+- **Base model:** [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3)
+- **Maximum Sequence Length:** 2048 tokens
+- **Output Dimensionality:** 1024 dimensions
+- **Similarity Function:** Dot product Similarity
+- **Language:** Vietnamese
+- **Licence:** Apache 2.0
+
+## Usage
+
+```python
+from sentence_transformers import SentenceTransformer
+import torch
+
+model = SentenceTransformer("AITeamVN/Vietnamese_Embedding")
+model.max_seq_length = 2048
+sentences_1 = ["Trí tuệ nhân tạo là gì", "Lợi ích của giấc ngủ"]
+sentences_2 = ["Trí tuệ nhân tạo là công nghệ giúp máy móc suy nghĩ và học hỏi như con người. Nó hoạt động bằng cách thu thập dữ liệu, nhận diện mẫu và đưa ra quyết định.", 
+               "Giấc ngủ giúp cơ thể và não bộ nghỉ ngơi, hồi phục năng lượng và cải thiện trí nhớ. Ngủ đủ giấc giúp tinh thần tỉnh táo và làm việc hiệu quả hơn."]
+query_embedding = model.encode(sentences_1)
+doc_embeddings = model.encode(sentences_2)
+similarity = query_embedding @ doc_embeddings.T
+print(similarity)
+
+'''
+array([[0.66212064, 0.33066642],
+       [0.25866613, 0.5865289 ]], dtype=float32)
+'''
+```
+
+
+### Evaluation:
+
+- Dataset: Entire training dataset of Legal Zalo 2021. Our model was not trained on this dataset.
+
+| Model                | Accuracy@1 | Accuracy@3 | Accuracy@5 | Accuracy@10  |  MRR@10 |
+|----------------------|------------|------------|------------|-------------|--------------|
+| Vietnamese_Reranker          | 0.7944     | 0.9324    | 0.9537     | 0.9740     | 0.8672       | 
+| Vietnamese_Embedding_v2         | 0.7262     | 0.8927     | 0.9268     | 0.9578     | 0.8149       | 
+| Vietnamese_Embedding  (public)          | 0.7274     | 0.8992     | 0.9305     | 0.9568     | 0.8181       | 
+| Vietnamese-bi-encoder (BKAI)         | 0.7109     | 0.8680     | 0.9014     | 0.9299      | 0.7951       | 
+| BGE-M3 | 0.5682     | 0.7728     | 0.8382     | 0.8921      | 0.6822       |
+
+Vietnamese_Reranker and Vietnamese_Embedding_v2 were trained on 1,100,000 triplets.
+
+Although Vietnamese_Embedding_v2 scores slightly lower on the legal domain, it was trained on a much larger dataset in this phase, so it performs very well on other domains.
+
+You can access the two models via these links: [Vietnamese_Embedding_v2](https://huggingface.co/AITeamVN/Vietnamese_Embedding_v2), [Vietnamese_Reranker](https://huggingface.co/AITeamVN/Vietnamese_Reranker)
+
+You can reproduce the evaluation results by running `python evaluation_model.py` (the data can be downloaded from Kaggle).
+
+## Contact
+
+Email: nguyennhotrung3004@gmail.com
+
+**Developer**
+
+Members: Nguyễn Nho Trung, Nguyễn Nhật Quang, Nguyen Van Huy
+
+## Citation
+
+```Plaintext
+@misc{Vietnamese_Embedding,
+  title={Vietnamese_Embedding: Embedding model in Vietnamese language.},
+  author={Nguyen Nho Trung and Nguyen Nhat Quang and Nguyen Van Huy},
+  year={2025},
+  publisher={Huggingface},
+} 
+```
--- a/config.json
+++ b/config.json
@@ -0,0 +1,28 @@
+{
+  "_name_or_path": "/AITeamVN/bge_vi_2048",
+  "architectures": [
+    "XLMRobertaModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 8194,
+  "model_type": "xlm-roberta",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "output_past": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
+}
--- a/config_sentence_transformers.json
+++ b/config_sentence_transformers.json
@@ -0,0 +1,9 @@
+{
+  "__version__": {
+    "sentence_transformers": "2.6.1",
+    "transformers": "4.49.0",
+    "pytorch": "2.6.0+cu124"
+  },
+  "prompts": {},
+  "default_prompt_name": null
+}
--- a/evaluation_model.py
+++ b/evaluation_model.py
@@ -0,0 +1,191 @@
+import numpy as np
+import torch
+import json
+import pandas as pd
+from tqdm import tqdm
+from typing import List, Dict, Tuple, Set, Union, Optional
+from langchain.docstore.document import Document
+from langchain_community.vectorstores import FAISS
+from langchain_community.vectorstores.faiss import DistanceStrategy
+from langchain_core.embeddings.embeddings import Embeddings
+from FlagEmbedding import BGEM3FlagModel
+
+def setup_gpu_info() -> None:
+    print(f"Số lượng GPU khả dụng: {torch.cuda.device_count()}")
+    print(f"GPU hiện tại: {torch.cuda.current_device()}")
+    print(f"Tên GPU: {torch.cuda.get_device_name(0)}")
+
+def load_model(model_name: str, use_fp16: bool = False) -> BGEM3FlagModel:
+    return BGEM3FlagModel(model_name, use_fp16=use_fp16)
+
+def load_json_file(file_path: str) -> dict:
+    with open(file_path, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+def load_jsonl_file(file_path: str) -> List[Dict]:
+    corpus = []
+    with open(file_path, "r", encoding="utf-8") as file:
+        for line in file:
+            data = json.loads(line.strip())
+            corpus.append(data)
+    return corpus
+
+def extract_corpus_from_legal_documents(legal_data: dict) -> List[Dict]:
+    corpus = []
+    for document in legal_data:
+        for article in document['articles']:
+            chunk = {
+                "law_id": document['law_id'],
+                "article_id": article['article_id'],
+                "title": article['title'],
+                "text": article['title'] + '\n' + article['text'] 
+            }
+            corpus.append(chunk)
+    return corpus
+
+def convert_corpus_to_documents(corpus: List[Dict[str, str]]) -> List[Document]:
+    documents = []
+    for i in tqdm(range(len(corpus)), desc="Converting corpus to documents"):
+        context = corpus[i]['text']
+        metadata = {
+            'law_id': corpus[i]['law_id'],
+            'article_id': corpus[i]['article_id'],
+            'title': corpus[i]['title']
+        }
+        documents.append(Document(page_content=context, metadata=metadata))
+    return documents
+
+class CustomEmbedding(Embeddings):
+    """Custom embedding class that uses the BGEM3FlagModel."""
+    
+    def __init__(self, model: BGEM3FlagModel, batch_size: int = 1): 
+        self.model = model
+        self.batch_size = batch_size
+        
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        embeddings = []
+        for i in tqdm(range(0, len(texts), self.batch_size), desc="Embedding documents"):
+            batch_texts = texts[i:i+self.batch_size]  
+            batch_embeddings = self._get_batch_embeddings(batch_texts)
+            embeddings.extend(batch_embeddings)
+            torch.cuda.empty_cache()
+        return np.vstack(embeddings) 
+
+    def embed_query(self, text: str) -> List[float]:
+        embedding = self.model.encode(text, max_length=256)['dense_vecs']
+        return embedding
+
+    def _get_batch_embeddings(self, texts: List[str]) -> List[List[float]]:
+        with torch.no_grad():
+            outputs = self.model.encode(texts, batch_size=self.batch_size, max_length=2048)['dense_vecs']
+        batch_embeddings = outputs
+        del outputs
+        return batch_embeddings
+
+
+class VectorDB:
+    """Vector database for document retrieval."""
+    
+    def __init__(
+        self,
+        documents: List[Document],
+        embedding: Embeddings,
+        vector_db=FAISS,
+        index_path: Optional[str] = None
+    ) -> None:
+        self.vector_db = vector_db
+        self.embedding = embedding
+        self.index_path = index_path
+        self.db = self._build_db(documents)
+
+    def _build_db(self, documents: List[Document]):
+        if self.index_path:
+            db = self.vector_db.load_local(
+                self.index_path, 
+                self.embedding, 
+                allow_dangerous_deserialization=True
+            )
+        else:
+            db = self.vector_db.from_documents(
+                documents=documents, 
+                embedding=self.embedding, 
+                distance_strategy=DistanceStrategy.DOT_PRODUCT
+            )
+        return db
+    
+    def get_retriever(self, search_type: str = "similarity", search_kwargs: dict = {"k": 10}):
+        retriever = self.db.as_retriever(search_type=search_type, search_kwargs=search_kwargs)
+        return retriever
+    
+    def save_local(self, folder_path: str) -> None:
+        self.db.save_local(folder_path)
+
+
+def process_sample(sample: dict, retriever) -> List[int]:
+    question = sample['question']
+    docs = retriever.invoke(question)
+    retrieved_article_full_ids = [
+        docs[i].metadata['law_id'] + "#" + docs[i].metadata['article_id'] 
+        for i in range(len(docs))
+    ]
+    indexes = []
+    for article in sample['relevant_articles']:
+        article_full_id = article['law_id'] + "#" + article['article_id']
+        if article_full_id in retrieved_article_full_ids:
+            idx = retrieved_article_full_ids.index(article_full_id) + 1
+            indexes.append(idx)
+        else:
+            indexes.append(0)       
+    return indexes
+
+def calculate_metrics(all_indexes: List[List[int]], num_samples: int, selected_keys: Set[str]) -> Dict[str, float]:
+    count = [len(indexes) for indexes in all_indexes]
+    result = {}
+    
+    for thres in [1, 3, 5, 10, 100]:
+        found = [[y for y in x if 0 < y <= thres] for x in all_indexes]
+        found_count = [len(x) for x in found]
+        acc = sum(1 for i in range(num_samples) if found_count[i] > 0) / num_samples
+        rec = sum(found_count[i] / count[i] for i in range(num_samples)) / num_samples
+        pre = sum(found_count[i] / thres for i in range(num_samples)) / num_samples
+        mrr = sum(1 / min(x) if x else 0 for x in found) / num_samples
+
+        if f"Accuracy@{thres}" in selected_keys:
+            result[f"Accuracy@{thres}"] = acc
+        if f"MRR@{thres}" in selected_keys:
+            result[f"MRR@{thres}"] = mrr
+            
+    return result
+
+
+def save_results(result: Dict[str, float], output_path: str) -> None:
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(result, f, indent=4, ensure_ascii=False)
+    print(f"Results saved to {output_path}")
+
+
+def main():
+    setup_gpu_info()
+    model = load_model('AITeamVN/Vietnamese_Embedding', use_fp16=False)
+    samples = load_json_file('zalo_kaggle/train_question_answer.json')['items']
+    legal_data = load_json_file('zalo_kaggle/legal_corpus.json')
+    
+    corpus = extract_corpus_from_legal_documents(legal_data)
+    documents = convert_corpus_to_documents(corpus)
+    embedding = CustomEmbedding(model, batch_size=1)  # Increased batch size for efficiency time
+    vectordb = VectorDB(
+        documents=documents,
+        embedding=embedding,
+        vector_db=FAISS,
+        index_path=None
+    )
+    retriever = vectordb.get_retriever(search_type="similarity", search_kwargs={"k": 100})
+    all_indexes = []
+    for sample in tqdm(samples, desc="Processing samples"):
+        all_indexes.append(process_sample(sample, retriever))
+    selected_keys = {"Accuracy@1", "Accuracy@3", "Accuracy@5", "Accuracy@10", "MRR@10", "Accuracy@100"}
+    result = calculate_metrics(all_indexes, len(samples), selected_keys)
+    print(result)
+    save_results(result, "zalo_kaggle/Vietnamese_Embedding.json")
+if __name__ == "__main__":
+    main()
--- a/model.safetensors
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f2debafdf03659e8273022a3e902b94deec73cd20c2b7262ab7e21630163f6d
+size 2271064456
--- a/modules.json
+++ b/modules.json
@@ -0,0 +1,20 @@
+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]
--- a/onnx/config.json
+++ b/onnx/config.json
@@ -0,0 +1,29 @@
+{
+  "_attn_implementation_autoset": true,
+  "architectures": [
+    "XLMRobertaModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "export_model_type": "transformer",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 8194,
+  "model_type": "xlm-roberta",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "output_past": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.51.3",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
+}
--- a/onnx/model.onnx
+++ b/onnx/model.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8abc48c79bda16d715a7ea838f91b21a97bf22c8f80ad1d11d7701f78ecbad1d
+size 745085
--- a/onnx/model.onnx_data
+++ b/onnx/model.onnx_data
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4de2a8bc780de9b4f6dda97e2f403ea93a40b29db05ebee065a92ccf69884bf
+size 2266886160
--- a/onnx/special_tokens_map.json
+++ b/onnx/special_tokens_map.json
@@ -0,0 +1,51 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
--- a/onnx/tokenizer.json
+++ b/onnx/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bf8afbfd11306bd872018c53bfdf2e160a56f8edbcf49933324404791c148d3
+size 17082900
--- a/onnx/tokenizer_config.json
+++ b/onnx/tokenizer_config.json
@@ -0,0 +1,56 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 8192,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}
--- a/sentence_bert_config.json
+++ b/sentence_bert_config.json
@@ -0,0 +1,4 @@
+{
+  "max_seq_length": 8192,
+  "do_lower_case": false
+}
--- a/sentencepiece.bpe.model
+++ b/sentencepiece.bpe.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -0,0 +1,51 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
--- a/tokenizer.json
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b74659c780d49afad7a7b9799868f75cbd3014fb6c34956e85a793028d38094a
+size 17098251
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -0,0 +1,56 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 8192,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}