初始化项目,由ModelHub XC社区提供模型

Model: SamilPwC-AXNode-GenAI/PwC-Embedding_expr
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-05-14 18:02:42 +08:00
commit 6f9cde7915
11 changed files with 275 additions and 0 deletions

36
.gitattributes vendored Normal file
View File

@@ -0,0 +1,36 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
tokenizer.json filter=lfs diff=lfs merge=lfs -text

10
1_Pooling/config.json Executable file
View File

@@ -0,0 +1,10 @@
{
"word_embedding_dimension": 1024,
"pooling_mode_cls_token": false,
"pooling_mode_mean_tokens": true,
"pooling_mode_max_tokens": false,
"pooling_mode_mean_sqrt_len_tokens": false,
"pooling_mode_weightedmean_tokens": false,
"pooling_mode_lasttoken": false,
"include_prompt": true
}

51
README.md Normal file
View File

@@ -0,0 +1,51 @@
---
language:
- ko
license: apache-2.0
tags:
- sentence-transformers
- sentence-similarity
- transformers
---
## PwC-Embedding-expr
We trained the **PwC-Embedding-expr** model on top of the [multilingual-e5-large-instruct](https://huggingface.co/intfloat/multilingual-e5-large-instruct) embedding model.
To enhance performance in Korean, we applied our curated augmentation to STS datasets and fine-tuned the E5 model using a carefully balanced ratio across datasets.
> ⚠️ This is an experimental model and is under continuous development.
### To-do
- [x] MTEB Leaderboard
- [ ] Technical Report
## MTEB
PwC-Embedding_expr was evaluated on the Korean subset of MTEB.
A leaderboard link will be added once it is published.
| Task | PwC-Embedding_expr |
|------------------|--------------------|
| KLUE-STS | 0.88 |
| KLUE-TC | 0.73 |
| Ko-StrategyQA | 0.80 |
| KorSTS | 0.84 |
| MIRACL-Reranking | 0.72 |
| MIRACL-Retrieval | 0.65 |
| **Average** | **0.77** |
## Model
- Base Model: [intfloat/multilingual-e5-large-instruct](https://huggingface.co/intfloat/multilingual-e5-large-instruct)
- Model Size: 0.56B
- Embedding Dimension: 1024
- Max Input Tokens: 514
## Requirements
It works with the dependencies included in the latest version of MTEB.
## Citation
TBD (technical report expected September 2025)

27
config.json Executable file
View File

@@ -0,0 +1,27 @@
{
"architectures": [
"XLMRobertaModel"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"classifier_dropout": null,
"eos_token_id": 2,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 514,
"model_type": "xlm-roberta",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"output_past": true,
"pad_token_id": 1,
"position_embedding_type": "absolute",
"torch_dtype": "float32",
"transformers_version": "4.54.1",
"type_vocab_size": 1,
"use_cache": true,
"vocab_size": 250002
}

View File

@@ -0,0 +1,14 @@
{
"__version__": {
"sentence_transformers": "5.0.0",
"transformers": "4.54.1",
"pytorch": "2.7.1+cu126"
},
"model_type": "SentenceTransformer",
"prompts": {
"query": "",
"document": ""
},
"default_prompt_name": null,
"similarity_fn_name": "cosine"
}

3
model.safetensors Executable file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7885c86ec3e367d8050844502d36af2db5c4599f4b5a3a771a761177bc7c594d
size 2239607176

20
modules.json Executable file
View File

@@ -0,0 +1,20 @@
[
{
"idx": 0,
"name": "0",
"path": "",
"type": "sentence_transformers.models.Transformer"
},
{
"idx": 1,
"name": "1",
"path": "1_Pooling",
"type": "sentence_transformers.models.Pooling"
},
{
"idx": 2,
"name": "2",
"path": "2_Normalize",
"type": "sentence_transformers.models.Normalize"
}
]

4
sentence_bert_config.json Executable file
View File

@@ -0,0 +1,4 @@
{
"max_seq_length": 512,
"do_lower_case": false
}

51
special_tokens_map.json Executable file
View File

@@ -0,0 +1,51 @@
{
"bos_token": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"cls_token": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"mask_token": {
"content": "<mask>",
"lstrip": true,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<pad>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"sep_token": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"unk_token": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

3
tokenizer.json Executable file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:883b037111086fd4dfebbbc9b7cee11e1517b5e0c0514879478661440f137085
size 17082987

56
tokenizer_config.json Executable file
View File

@@ -0,0 +1,56 @@
{
"added_tokens_decoder": {
"0": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"1": {
"content": "<pad>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"2": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"3": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"250001": {
"content": "<mask>",
"lstrip": true,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"additional_special_tokens": [],
"bos_token": "<s>",
"clean_up_tokenization_spaces": true,
"cls_token": "<s>",
"eos_token": "</s>",
"extra_special_tokens": {},
"mask_token": "<mask>",
"model_max_length": 512,
"pad_token": "<pad>",
"sep_token": "</s>",
"tokenizer_class": "XLMRobertaTokenizer",
"unk_token": "<unk>"
}