commit 7ba5cdc70e7ee8bbff93b14e033e6e3241480e9e Author: ModelHub XC Date: Thu May 28 02:20:16 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: bqbbao6/vietnamese-legal-embedding Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/1_Pooling/config.json 
b/1_Pooling/config.json new file mode 100644 index 0000000..70ac42b --- /dev/null +++ b/1_Pooling/config.json @@ -0,0 +1,10 @@ +{ + "word_embedding_dimension": 768, + "pooling_mode_cls_token": false, + "pooling_mode_mean_tokens": true, + "pooling_mode_max_tokens": false, + "pooling_mode_mean_sqrt_len_tokens": false, + "pooling_mode_weightedmean_tokens": false, + "pooling_mode_lasttoken": false, + "include_prompt": true +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..86df3c6 --- /dev/null +++ b/README.md @@ -0,0 +1,164 @@ +--- +language: +- vi +base_model: +- intfloat/multilingual-e5-base +pipeline_tag: sentence-similarity +--- +# Vietnamese Legal Embedding + +**Model:** [bqbbao6/vietnamese-legal-embedding](https://huggingface.co/bqbbao6/vietnamese-legal-embedding) +**Base model:** [intfloat/multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base) + +--- + +## Model Description + +`vietnamese-legal-embedding` is a text embedding model fine-tuned for Vietnamese legal document retrieval. Built on top of `multilingual-e5-base`, this model is optimized for semantic search and Retrieval-Augmented Generation (RAG) systems in the Vietnamese legal domain. + +The model learns to map legal queries to their relevant legal passages, making it suitable for retrieving precise legal articles and regulations in response to user questions. + +--- + +## Model Details + +| Property | Value | +|---|---| +| Base Model | intfloat/multilingual-e5-base | +| Language | Vietnamese | +| Max Sequence Length | 512 tokens | +| Embedding Dimension | 768 | +| Similarity Function | Cosine Similarity | + +--- + +## Usage + +First install the Sentence Transformers library: + +```bash +pip install -U sentence-transformers +``` + +Then you can load this model and run inference. 
+Ex1: +```python +from sentence_transformers import SentenceTransformer +import torch + +model = SentenceTransformer( + "bqbbao6/vietnamese-legal-embedding", + trust_remote_code=True +) +query = "query: " + "Người lao động được nghỉ bao nhiêu ngày phép mỗi năm?" +embedding = model.encode(query) + +print(f"Embedding shape: {embedding.shape}") # (768,) +``` + +Ex2: +```python +from sentence_transformers import SentenceTransformer, util +import torch + +model = SentenceTransformer( + "bqbbao6/vietnamese-legal-embedding", + trust_remote_code=True +) + +query = "query: " + "Người lao động được nghỉ bao nhiêu ngày phép năm?" + +corpus = [ + # Đúng + "Người lao động làm việc đủ 12 tháng được nghỉ 12 ngày phép năm có hưởng lương.", + + # Giống chủ ngữ nhưng khác nội dung + "Người lao động được hưởng chế độ bảo hiểm xã hội theo quy định của pháp luật.", + + # Giống vị ngữ nhưng khác chủ ngữ + "Cán bộ công chức được nghỉ 12 ngày phép năm theo quy định.", + + # Giống một phần nhưng nói về đối tượng khác + "Người lao động chưa đủ 12 tháng được nghỉ phép theo tỷ lệ tương ứng.", + + # Hoàn toàn không liên quan + "Doanh nghiệp phải đóng thuế thu nhập doanh nghiệp hàng năm.", +] + +corpus_prefixed = ["passage: " + p for p in corpus] + +q_emb = model.encode(query, normalize_embeddings=True) +c_emb = model.encode(corpus_prefixed, normalize_embeddings=True) + +scores = util.cos_sim(q_emb, c_emb)[0] +for i, (score, passage) in enumerate(zip(scores, corpus)): + print(f"[{i+1}] Score: {score:.4f} | {passage}") +``` + +## Training Data + +The model was fine-tuned on a dataset of **250,000 triplets** (query, positive passage, hard negative) in the Vietnamese legal domain, covering various legal fields including civil law, criminal law, labor law, and administrative law. + +- All texts were tokenized using [pyvi](https://github.com/trungtv/pyvi) for Vietnamese word segmentation. +- Hard negatives were mined using BM25 to ensure challenging training examples. 
+- Loss function: `CachedMultipleNegativesRankingLoss` + +--- + +## Evaluation + +The model was evaluated on [GreenNode/zalo-ai-legal-text-retrieval-vn](https://huggingface.co/datasets/GreenNode/zalo-ai-legal-text-retrieval-vn) and compared against the base model and a Vietnamese-specific embedding model. + +| Metric | **bqbbao6/vietnamese-legal-embedding** | intfloat/multilingual-e5-base | bkai-foundation-models/vietnamese-bi-encoder | +|---|---|---|---| +| NDCG@10 | **0.8059** | 0.6030 | 0.6160 | +| MRR@10 | **0.7543** | 0.5482 | 0.5579 | +| MAP@10 | **0.7546** | 0.5491 | 0.5588 | +| Recall@1 | **0.6269** | 0.4467 | 0.4442 | +| Recall@5 | **0.9124** | 0.6916 | 0.7170 | +| Recall@10 | **0.9613** | 0.7722 | 0.7951 | +| Precision@1 | **0.6282** | 0.4480 | 0.4454 | +| Hit Rate@10 | **0.9632** | 0.7728 | 0.7970 | + +The fine-tuned model significantly outperforms both the base model and the Vietnamese-specific bi-encoder across all metrics, achieving a **+20 point improvement in NDCG@10** over the base model, demonstrating the effectiveness of domain-specific fine-tuning for Vietnamese legal retrieval. 
+ +--- +## Citation + +```bibtex + +@inproceedings{10.1007/978-981-95-1746-6_17, + address = {Singapore}, + author = {Pham, Bao Loc +and Hoang, Quoc Viet +and Luu, Quy Tung +and Vo, Trong Thu}, + booktitle = {Proceedings of the Fifth International Conference on Intelligent Systems and Networks}, + isbn = {978-981-95-1746-6}, + pages = {153--163}, + publisher = {Springer Nature Singapore}, + title = {GN-TRVN: A Benchmark for Vietnamese Table Markdown Retrieval Task}, + year = {2026}, +} + + +@article{enevoldsen2025mmtebmassivemultilingualtext, + title={MMTEB: Massive Multilingual Text Embedding Benchmark}, + author={Kenneth Enevoldsen and Isaac Chung and Imene Kerboua and Márton Kardos and Ashwin Mathur and David Stap and Jay Gala and Wissam Siblini and Dominik Krzemiński and Genta Indra Winata and Saba Sturua and Saiteja Utpala and Mathieu Ciancone and Marion Schaeffer and Gabriel Sequeira and Diganta Misra and Shreeya Dhakal and Jonathan Rystrøm and Roman Solomatin and Ömer Çağatan and Akash Kundu and Martin Bernstorff and Shitao Xiao and Akshita Sukhlecha and Bhavish Pahwa and Rafał Poświata and Kranthi Kiran GV and Shawon Ashraf and Daniel Auras and Björn Plüster and Jan Philipp Harries and Loïc Magne and Isabelle Mohr and Mariya Hendriksen and Dawei Zhu and Hippolyte Gisserot-Boukhlef and Tom Aarsen and Jan Kostkan and Konrad Wojtasik and Taemin Lee and Marek Šuppa and Crystina Zhang and Roberta Rocca and Mohammed Hamdy and Andrianos Michail and John Yang and Manuel Faysse and Aleksei Vatolin and Nandan Thakur and Manan Dey and Dipam Vasani and Pranjal Chitale and Simone Tedeschi and Nguyen Tai and Artem Snegirev and Michael Günther and Mengzhou Xia and Weijia Shi and Xing Han Lù and Jordan Clive and Gayatri Krishnakumar and Anna Maksimova and Silvan Wehrli and Maria Tikhonova and Henil Panchal and Aleksandr Abramov and Malte Ostendorff and Zheng Liu and Simon Clematide and Lester James Miranda and Alena Fenogenova and Guangyu Song and Ruqiya Bin Safi and 
Wen-Ding Li and Alessia Borghini and Federico Cassano and Hongjin Su and Jimmy Lin and Howard Yen and Lasse Hansen and Sara Hooker and Chenghao Xiao and Vaibhav Adlakha and Orion Weller and Siva Reddy and Niklas Muennighoff}, + publisher = {arXiv}, + journal={arXiv preprint arXiv:2502.13595}, + year={2025}, + url={https://arxiv.org/abs/2502.13595}, + doi = {10.48550/arXiv.2502.13595}, +} + +@article{muennighoff2022mteb, + author = {Muennighoff, Niklas and Tazi, Nouamane and Magne, Loïc and Reimers, Nils}, + title = {MTEB: Massive Text Embedding Benchmark}, + publisher = {arXiv}, + journal={arXiv preprint arXiv:2210.07316}, + year = {2022}, + url = {https://arxiv.org/abs/2210.07316}, + doi = {10.48550/ARXIV.2210.07316}, +} +``` diff --git a/config.json b/config.json new file mode 100644 index 0000000..3ba0725 --- /dev/null +++ b/config.json @@ -0,0 +1,30 @@ +{ + "add_cross_attention": false, + "architectures": [ + "XLMRobertaModel" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "classifier_dropout": null, + "dtype": "float32", + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "is_decoder": false, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 514, + "model_type": "xlm-roberta", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "output_past": true, + "pad_token_id": 1, + "position_embedding_type": "absolute", + "tie_word_embeddings": true, + "transformers_version": "5.0.0", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 250002 +} diff --git a/config_sentence_transformers.json b/config_sentence_transformers.json new file mode 100644 index 0000000..33d69d3 --- /dev/null +++ b/config_sentence_transformers.json @@ -0,0 +1,14 @@ +{ + "model_type": "SentenceTransformer", + "__version__": { + "sentence_transformers": "5.2.3", + "transformers": "5.0.0", + "pytorch": "2.10.0+cu128" + }, + "prompts": { + "query": "", + 
"document": "" + }, + "default_prompt_name": null, + "similarity_fn_name": "cosine" +} \ No newline at end of file diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..1455be0 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1557519641af439de4f323a99f752151120a59839733e0d32ec8381a6961dcc +size 1112197064 diff --git a/modules.json b/modules.json new file mode 100644 index 0000000..952a9b8 --- /dev/null +++ b/modules.json @@ -0,0 +1,20 @@ +[ + { + "idx": 0, + "name": "0", + "path": "", + "type": "sentence_transformers.models.Transformer" + }, + { + "idx": 1, + "name": "1", + "path": "1_Pooling", + "type": "sentence_transformers.models.Pooling" + }, + { + "idx": 2, + "name": "2", + "path": "2_Normalize", + "type": "sentence_transformers.models.Normalize" + } +] \ No newline at end of file diff --git a/sentence_bert_config.json b/sentence_bert_config.json new file mode 100644 index 0000000..4eca68d --- /dev/null +++ b/sentence_bert_config.json @@ -0,0 +1,4 @@ +{ + "max_seq_length": 512, + "do_lower_case": false +} \ No newline at end of file diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..9a53718 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fe715a86a37cd2b20e5eaeee8b22815bce65de676d1e0cd856114b59dab67fc +size 16766387 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..b43c19d --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,15 @@ +{ + "add_prefix_space": true, + "backend": "tokenizers", + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "is_local": false, + "mask_token": "", + "model_max_length": 512, + "pad_token": "", + "sep_token": "", + "tokenizer_class": "XLMRobertaTokenizer", + "unk_token": "" +}