初始化项目,由ModelHub XC社区提供模型

Model: AITeamVN/Vietnamese_Embedding
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-05-14 12:24:49 +08:00
commit 8d60b5e81a
19 changed files with 619 additions and 0 deletions

3
.gitattributes vendored Normal file
View File

@@ -0,0 +1,3 @@
onnx/tokenizer.json filter=lfs diff=lfs merge=lfs -text
onnx/model.onnx filter=lfs diff=lfs merge=lfs -text
onnx/model.onnx_data filter=lfs diff=lfs merge=lfs -text

10
1_Pooling/config.json Normal file
View File

@@ -0,0 +1,10 @@
{
"word_embedding_dimension": 1024,
"pooling_mode_cls_token": true,
"pooling_mode_mean_tokens": false,
"pooling_mode_max_tokens": false,
"pooling_mode_mean_sqrt_len_tokens": false,
"pooling_mode_weightedmean_tokens": false,
"pooling_mode_lasttoken": false,
"include_prompt": true
}

93
README.md Normal file
View File

@@ -0,0 +1,93 @@
---
license: apache-2.0
language:
- vi
base_model:
- BAAI/bge-m3
pipeline_tag: sentence-similarity
library_name: sentence-transformers
tags:
- Embedding
- onnx
---
## Model Card: Vietnamese_Embedding
Vietnamese_Embedding is an embedding model fine-tuned from the BGE-M3 model (https://huggingface.co/BAAI/bge-m3) to enhance retrieval capabilities for Vietnamese.
* The model was trained on approximately 300,000 triplets of queries, positive documents, and negative documents for Vietnamese.
* The model was trained with a maximum sequence length of 2048.
## Model Details
### Model Description
- **Model Type:** Sentence Transformer
- **Base model:** [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3)
- **Maximum Sequence Length:** 2048 tokens
- **Output Dimensionality:** 1024 dimensions
- **Similarity Function:** Dot product Similarity
- **Language:** Vietnamese
- **Licence:** Apache 2.0
## Usage
```python
from sentence_transformers import SentenceTransformer
import torch
model = SentenceTransformer("AITeamVN/Vietnamese_Embedding")
model.max_seq_length = 2048
sentences_1 = ["Trí tuệ nhân tạo là gì", "Lợi ích của giấc ngủ"]
sentences_2 = ["Trí tuệ nhân tạo là công nghệ giúp máy móc suy nghĩ và học hỏi như con người. Nó hoạt động bằng cách thu thập dữ liệu, nhận diện mẫu và đưa ra quyết định.",
"Giấc ngủ giúp cơ thể và não bộ nghỉ ngơi, hồi phục năng lượng và cải thiện trí nhớ. Ngủ đủ giấc giúp tinh thần tỉnh táo và làm việc hiệu quả hơn."]
query_embedding = model.encode(sentences_1)
doc_embeddings = model.encode(sentences_2)
similarity = query_embedding @ doc_embeddings.T
print(similarity)
'''
array([[0.66212064, 0.33066642],
[0.25866613, 0.5865289 ]], dtype=float32)
'''
```
### Evaluation:
- Dataset: Entire training dataset of Legal Zalo 2021. Our model was not trained on this dataset.
| Model | Accuracy@1 | Accuracy@3 | Accuracy@5 | Accuracy@10 | MRR@10 |
|----------------------|------------|------------|------------|-------------|--------------|
| Vietnamese_Reranker | 0.7944 | 0.9324 | 0.9537 | 0.9740 | 0.8672 |
| Vietnamese_Embedding_v2 | 0.7262 | 0.8927 | 0.9268 | 0.9578 | 0.8149 |
| Vietnamese_Embedding (public) | 0.7274 | 0.8992 | 0.9305 | 0.9568 | 0.8181 |
| Vietnamese-bi-encoder (BKAI) | 0.7109 | 0.8680 | 0.9014 | 0.9299 | 0.7951 |
| BGE-M3 | 0.5682 | 0.7728 | 0.8382 | 0.8921 | 0.6822 |
Vietnamese_Reranker and Vietnamese_Embedding_v2 were trained on 1,100,000 triplets.
Although Vietnamese_Embedding_v2 scores slightly lower on the legal domain, the data used in this phase is much larger, so it performs very well on other domains.
You can access the two models via these links: [Vietnamese_Embedding_v2](https://huggingface.co/AITeamVN/Vietnamese_Embedding_v2), [Vietnamese_Reranker](https://huggingface.co/AITeamVN/Vietnamese_Reranker)
You can reproduce the evaluation results by running `python evaluation_model.py` (data downloaded from Kaggle).
## Contact
Email: nguyennhotrung3004@gmail.com
**Developer**
Member: Nguyễn Nho Trung, Nguyễn Nhật Quang, Nguyen Van Huy
## Citation
```Plaintext
@misc{Vietnamese_Embedding,
title={Vietnamese_Embedding: Embedding model in Vietnamese language.},
author={Nguyen Nho Trung and Nguyen Nhat Quang and Nguyen Van Huy},
year={2025},
publisher={Huggingface},
}
```

28
config.json Normal file
View File

@@ -0,0 +1,28 @@
{
"_name_or_path": "/AITeamVN/bge_vi_2048",
"architectures": [
"XLMRobertaModel"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"classifier_dropout": null,
"eos_token_id": 2,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 8194,
"model_type": "xlm-roberta",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"output_past": true,
"pad_token_id": 1,
"position_embedding_type": "absolute",
"torch_dtype": "float32",
"transformers_version": "4.49.0",
"type_vocab_size": 1,
"use_cache": true,
"vocab_size": 250002
}

View File

@@ -0,0 +1,9 @@
{
"__version__": {
"sentence_transformers": "2.6.1",
"transformers": "4.49.0",
"pytorch": "2.6.0+cu124"
},
"prompts": {},
"default_prompt_name": null
}

191
evaluation_model.py Normal file
View File

@@ -0,0 +1,191 @@
import numpy as np
import torch
import json
import pandas as pd
from tqdm import tqdm
from typing import List, Dict, Tuple, Set, Union, Optional
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.faiss import DistanceStrategy
from langchain_core.embeddings.embeddings import Embeddings
from FlagEmbedding import BGEM3FlagModel
def setup_gpu_info() -> None:
    """Print a short summary of the CUDA devices visible to PyTorch.

    Always prints the number of visible devices. When CUDA is available it
    also prints the index and the name of the current device. When CUDA is
    not available a notice is printed and the function returns early,
    because ``torch.cuda.current_device()`` raises on such machines.
    """
    print(f"Số lượng GPU khả dụng: {torch.cuda.device_count()}")
    if not torch.cuda.is_available():
        # Querying the current device without CUDA would raise, so stop here.
        print("Không tìm thấy GPU CUDA.")
        return
    current_device = torch.cuda.current_device()
    print(f"GPU hiện tại: {current_device}")
    # Report the name of the device that was just printed, not always device 0.
    print(f"Tên GPU: {torch.cuda.get_device_name(current_device)}")
def load_model(model_name: str, use_fp16: bool = False) -> BGEM3FlagModel:
    """Create a ``BGEM3FlagModel`` for the given model name.

    Args:
        model_name: Identifier passed through to ``BGEM3FlagModel``.
        use_fp16: Forwarded as the ``use_fp16`` keyword argument.

    Returns:
        The constructed ``BGEM3FlagModel`` instance.
    """
    flag_model = BGEM3FlagModel(model_name, use_fp16=use_fp16)
    return flag_model
def load_json_file(file_path: str) -> dict:
    """Read ``file_path`` as UTF-8 text and return its parsed JSON content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The value decoded from the file's top-level JSON document.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
def load_jsonl_file(file_path: str) -> List[Dict]:
    """Load a JSON Lines file (one JSON document per line).

    Blank and whitespace-only lines are skipped, so a file with empty
    separator lines or trailing blank lines does not fail to parse.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        The decoded records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # json.loads("") raises; an empty line carries no record.
                continue
            records.append(json.loads(stripped))
    return records
def extract_corpus_from_legal_documents(legal_data: dict) -> List[Dict]:
    """Flatten legal documents into one chunk per article.

    Iterates over ``legal_data``; each element must provide ``'law_id'`` and
    an ``'articles'`` sequence whose items provide ``'article_id'``,
    ``'title'`` and ``'text'``.

    Returns:
        A list of dicts with keys ``law_id``, ``article_id``, ``title`` and
        ``text``, where ``text`` is the title and the article text joined by
        a newline.
    """
    return [
        {
            "law_id": law['law_id'],
            "article_id": entry['article_id'],
            "title": entry['title'],
            "text": entry['title'] + '\n' + entry['text'],
        }
        for law in legal_data
        for entry in law['articles']
    ]
def convert_corpus_to_documents(corpus: List[Dict[str, str]]) -> List[Document]:
    """Wrap every corpus chunk in a ``Document``.

    The chunk's ``'text'`` becomes ``page_content``; ``'law_id'``,
    ``'article_id'`` and ``'title'`` are copied into the metadata. Progress
    is reported through a ``tqdm`` bar.
    """
    documents = []
    for chunk in tqdm(corpus, desc="Converting corpus to documents"):
        page_text = chunk['text']
        chunk_metadata = {
            'law_id': chunk['law_id'],
            'article_id': chunk['article_id'],
            'title': chunk['title'],
        }
        documents.append(Document(page_content=page_text, metadata=chunk_metadata))
    return documents
class CustomEmbedding(Embeddings):
    """Custom embedding class that uses the BGEM3FlagModel.

    Only the ``'dense_vecs'`` entry of the model's ``encode`` output is used.
    Documents and queries are encoded with separate maximum lengths.
    """

    def __init__(
        self,
        model: BGEM3FlagModel,
        batch_size: int = 1,
        doc_max_length: int = 2048,
        query_max_length: int = 256,
    ):
        """Store the model and the encoding settings.

        Args:
            model: Model whose ``encode`` method produces the embeddings.
            batch_size: Number of texts encoded per call; must be >= 1.
            doc_max_length: ``max_length`` passed to ``encode`` for documents.
            query_max_length: ``max_length`` passed to ``encode`` for queries.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            # A zero step makes range() raise and a negative step yields no
            # batches at all, so reject both up front with a clear message.
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        self.model = model
        self.batch_size = batch_size
        self.doc_max_length = doc_max_length
        self.query_max_length = query_max_length

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts`` in batches of ``self.batch_size``.

        Returns:
            An empty list when ``texts`` is empty. Otherwise the per-batch
            results stacked with ``np.vstack``, which is a NumPy array (one
            row per text) rather than a nested Python list.
        """
        if not texts:
            # np.vstack raises on an empty sequence of arrays.
            return []
        embeddings = []
        for start in tqdm(range(0, len(texts), self.batch_size), desc="Embedding documents"):
            batch_texts = texts[start:start + self.batch_size]
            embeddings.extend(self._get_batch_embeddings(batch_texts))
            # Release cached, unused GPU memory between batches.
            torch.cuda.empty_cache()
        return np.vstack(embeddings)

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string with ``self.query_max_length``.

        Returns the ``'dense_vecs'`` value exactly as produced by the model
        (presumably a NumPy vector, not a Python list; confirm against the
        FlagEmbedding documentation).
        """
        return self.model.encode(text, max_length=self.query_max_length)['dense_vecs']

    def _get_batch_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Encode one batch of documents and return its dense vectors."""
        with torch.no_grad():
            return self.model.encode(
                texts,
                batch_size=self.batch_size,
                max_length=self.doc_max_length,
            )['dense_vecs']
class VectorDB:
    """Vector database for document retrieval."""

    def __init__(
        self,
        documents: List[Document],
        embedding: Embeddings,
        vector_db=FAISS,
        index_path: Optional[str] = None
    ) -> None:
        """Build the store from ``documents`` or load it from ``index_path``.

        Args:
            documents: Documents to index. Ignored when ``index_path`` is
                given, because the store is then loaded from disk instead.
            embedding: Embedding implementation handed to the vector store.
            vector_db: Vector store class providing ``from_documents`` and
                ``load_local``.
            index_path: Optional folder of a previously saved index.
        """
        self.vector_db = vector_db
        self.embedding = embedding
        self.index_path = index_path
        self.db = self._build_db(documents)

    def _build_db(self, documents: List[Document]):
        """Return the underlying store, loaded from disk or freshly built."""
        if self.index_path:
            # SECURITY: allow_dangerous_deserialization=True permits
            # pickle-based loading, which can execute arbitrary code. Only
            # point index_path at index folders you created yourself.
            # NOTE(review): unlike the branch below, no distance_strategy is
            # passed when loading; confirm whether that is intended.
            return self.vector_db.load_local(
                self.index_path,
                self.embedding,
                allow_dangerous_deserialization=True
            )
        return self.vector_db.from_documents(
            documents=documents,
            embedding=self.embedding,
            distance_strategy=DistanceStrategy.DOT_PRODUCT
        )

    def get_retriever(self, search_type: str = "similarity", search_kwargs: Optional[dict] = None):
        """Return a retriever over the store.

        Args:
            search_type: Forwarded to ``as_retriever``.
            search_kwargs: Forwarded to ``as_retriever``. Defaults to
                ``{"k": 10}``; a fresh dict is created on every call so the
                default is never shared between retrievers.
        """
        if search_kwargs is None:
            search_kwargs = {"k": 10}
        return self.db.as_retriever(search_type=search_type, search_kwargs=search_kwargs)

    def save_local(self, folder_path: str) -> None:
        """Persist the underlying store to ``folder_path``."""
        self.db.save_local(folder_path)
def process_sample(sample: dict, retriever) -> List[int]:
    """Return the retrieval rank of each relevant article of ``sample``.

    The sample's ``'question'`` is sent to ``retriever.invoke``. Every
    returned document is identified by ``"<law_id>#<article_id>"`` built
    from its metadata.

    Returns:
        One entry per item of ``sample['relevant_articles']``: the 1-based
        position of its first occurrence among the retrieved documents, or
        ``0`` when it was not retrieved.
    """
    hits = retriever.invoke(sample['question'])
    ranked_ids = [
        hit.metadata['law_id'] + "#" + hit.metadata['article_id'] for hit in hits
    ]
    ranks = []
    for relevant in sample['relevant_articles']:
        target_id = relevant['law_id'] + "#" + relevant['article_id']
        if target_id not in ranked_ids:
            ranks.append(0)
            continue
        ranks.append(ranked_ids.index(target_id) + 1)
    return ranks
def calculate_metrics(all_indexes: List[List[int]], num_samples: int, selected_keys: Set[str]) -> Dict[str, float]:
    """Compute Accuracy@k and MRR@k for k in 1, 3, 5, 10 and 100.

    Args:
        all_indexes: Per sample, the 1-based ranks of its relevant articles;
            ``0`` marks an article that was not retrieved.
        num_samples: Number of samples used as the denominator.
        selected_keys: Metric names (``"Accuracy@<k>"`` / ``"MRR@<k>"``) to
            include in the result.

    Returns:
        A dict holding only the metrics named in ``selected_keys``.

    Raises:
        ZeroDivisionError: If ``num_samples`` is 0.
    """
    result = {}
    for thres in [1, 3, 5, 10, 100]:
        # Keep only the ranks that are real hits within the top ``thres``.
        found = [[rank for rank in ranks if 0 < rank <= thres] for ranks in all_indexes]
        # A sample counts as correct when at least one relevant article hit.
        acc = sum(1 for i in range(num_samples) if found[i]) / num_samples
        # Reciprocal of the best rank per sample; samples with no hit add 0.
        mrr = sum(1 / min(ranks) if ranks else 0 for ranks in found) / num_samples
        # Recall/precision are intentionally not computed: they were never
        # reported, and recall divided by the number of relevant articles,
        # which fails for a sample that has none.
        if f"Accuracy@{thres}" in selected_keys:
            result[f"Accuracy@{thres}"] = acc
        if f"MRR@{thres}" in selected_keys:
            result[f"MRR@{thres}"] = mrr
    return result
def save_results(result: Dict[str, float], output_path: str) -> None:
    """Write ``result`` to ``output_path`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is (``ensure_ascii=False``). A
    confirmation line is printed after the file has been written.
    """
    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(json.dumps(result, indent=4, ensure_ascii=False))
    print(f"Results saved to {output_path}")
def main():
    """Run the retrieval evaluation end to end.

    Loads the model, the questions and the legal corpus, indexes the corpus
    in a FAISS-backed ``VectorDB``, retrieves the top 100 documents for each
    question, then prints the selected metrics and saves them as JSON.
    """
    setup_gpu_info()
    model = load_model('AITeamVN/Vietnamese_Embedding', use_fp16=False)

    samples = load_json_file('zalo_kaggle/train_question_answer.json')['items']
    documents = convert_corpus_to_documents(
        extract_corpus_from_legal_documents(
            load_json_file('zalo_kaggle/legal_corpus.json')
        )
    )

    # batch_size=1: each encode call embeds a single document.
    vectordb = VectorDB(
        documents=documents,
        embedding=CustomEmbedding(model, batch_size=1),
        vector_db=FAISS,
        index_path=None
    )
    retriever = vectordb.get_retriever(search_type="similarity", search_kwargs={"k": 100})

    all_indexes = [
        process_sample(sample, retriever)
        for sample in tqdm(samples, desc="Processing samples")
    ]

    selected_keys = {"Accuracy@1", "Accuracy@3", "Accuracy@5", "Accuracy@10", "MRR@10", "Accuracy@100"}
    result = calculate_metrics(all_indexes, len(samples), selected_keys)
    print(result)
    save_results(result, "zalo_kaggle/Vietnamese_Embedding.json")


if __name__ == "__main__":
    main()

3
model.safetensors Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1f2debafdf03659e8273022a3e902b94deec73cd20c2b7262ab7e21630163f6d
size 2271064456

20
modules.json Normal file
View File

@@ -0,0 +1,20 @@
[
{
"idx": 0,
"name": "0",
"path": "",
"type": "sentence_transformers.models.Transformer"
},
{
"idx": 1,
"name": "1",
"path": "1_Pooling",
"type": "sentence_transformers.models.Pooling"
},
{
"idx": 2,
"name": "2",
"path": "2_Normalize",
"type": "sentence_transformers.models.Normalize"
}
]

29
onnx/config.json Normal file
View File

@@ -0,0 +1,29 @@
{
"_attn_implementation_autoset": true,
"architectures": [
"XLMRobertaModel"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"classifier_dropout": null,
"eos_token_id": 2,
"export_model_type": "transformer",
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 8194,
"model_type": "xlm-roberta",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"output_past": true,
"pad_token_id": 1,
"position_embedding_type": "absolute",
"torch_dtype": "float32",
"transformers_version": "4.51.3",
"type_vocab_size": 1,
"use_cache": true,
"vocab_size": 250002
}

3
onnx/model.onnx Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8abc48c79bda16d715a7ea838f91b21a97bf22c8f80ad1d11d7701f78ecbad1d
size 745085

3
onnx/model.onnx_data Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f4de2a8bc780de9b4f6dda97e2f403ea93a40b29db05ebee065a92ccf69884bf
size 2266886160

View File

@@ -0,0 +1,51 @@
{
"bos_token": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"cls_token": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"mask_token": {
"content": "<mask>",
"lstrip": true,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<pad>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"sep_token": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"unk_token": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

3
onnx/tokenizer.json Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8bf8afbfd11306bd872018c53bfdf2e160a56f8edbcf49933324404791c148d3
size 17082900

View File

@@ -0,0 +1,56 @@
{
"added_tokens_decoder": {
"0": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"1": {
"content": "<pad>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"2": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"3": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"250001": {
"content": "<mask>",
"lstrip": true,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"bos_token": "<s>",
"clean_up_tokenization_spaces": true,
"cls_token": "<s>",
"eos_token": "</s>",
"extra_special_tokens": {},
"mask_token": "<mask>",
"model_max_length": 8192,
"pad_token": "<pad>",
"sep_token": "</s>",
"sp_model_kwargs": {},
"tokenizer_class": "XLMRobertaTokenizer",
"unk_token": "<unk>"
}

View File

@@ -0,0 +1,4 @@
{
"max_seq_length": 8192,
"do_lower_case": false
}

3
sentencepiece.bpe.model Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
size 5069051

51
special_tokens_map.json Normal file
View File

@@ -0,0 +1,51 @@
{
"bos_token": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"cls_token": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"mask_token": {
"content": "<mask>",
"lstrip": true,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<pad>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"sep_token": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"unk_token": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

3
tokenizer.json Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b74659c780d49afad7a7b9799868f75cbd3014fb6c34956e85a793028d38094a
size 17098251

56
tokenizer_config.json Normal file
View File

@@ -0,0 +1,56 @@
{
"added_tokens_decoder": {
"0": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"1": {
"content": "<pad>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"2": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"3": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"250001": {
"content": "<mask>",
"lstrip": true,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"bos_token": "<s>",
"clean_up_tokenization_spaces": true,
"cls_token": "<s>",
"eos_token": "</s>",
"extra_special_tokens": {},
"mask_token": "<mask>",
"model_max_length": 8192,
"pad_token": "<pad>",
"sep_token": "</s>",
"sp_model_kwargs": {},
"tokenizer_class": "XLMRobertaTokenizer",
"unk_token": "<unk>"
}