初始化项目,由ModelHub XC社区提供模型
Model: telepix/PIXIE-Rune-v1.0 Source: Original Platform
This commit is contained in:
37
.gitattributes
vendored
Normal file
37
.gitattributes
vendored
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||||
|
onnx/model.onnx_data filter=lfs diff=lfs merge=lfs -text
|
||||||
10
1_Pooling/config.json
Normal file
10
1_Pooling/config.json
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
{
|
||||||
|
"word_embedding_dimension": 1024,
|
||||||
|
"pooling_mode_cls_token": true,
|
||||||
|
"pooling_mode_mean_tokens": false,
|
||||||
|
"pooling_mode_max_tokens": false,
|
||||||
|
"pooling_mode_mean_sqrt_len_tokens": false,
|
||||||
|
"pooling_mode_weightedmean_tokens": false,
|
||||||
|
"pooling_mode_lasttoken": false,
|
||||||
|
"include_prompt": true
|
||||||
|
}
|
||||||
212
LICENSE
Normal file
212
LICENSE
Normal file
@@ -0,0 +1,212 @@
|
|||||||
|
Copyright (c) 2026 TelePIX Co., Ltd. All rights reserved.
|
||||||
|
|
||||||
|
Built with snowflake-arctic-embed-l-v2.0 — original model by Snowflake, licensed under the Apache License 2.0.
|
||||||
|
|
||||||
|
Unless otherwise stated, all files in this repository (including modified model weights
|
||||||
|
and tokenizer files) are distributed under the terms of the Apache License, Version 2.0
|
||||||
|
(the "License"). You may obtain a copy of the License at:
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software distributed under
|
||||||
|
the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
|
||||||
|
ANY KIND, either express or implied. See the License for the specific language governing
|
||||||
|
permissions and limitations under the License.
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
NOTICE (Apache-2.0 §4 d)
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
This product is built with snowflake-arctic-embed-l-v2.0 developed by Snowflake
|
||||||
|
(https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0) and distributed under the Apache License 2.0.
|
||||||
|
|
||||||
|
See the upstream snowflake-arctic-embed-l-v2.0 for additional attribution details.
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
TRADEMARK
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
"TelePIX" and associated logos are trademarks of TelePIX Co., Ltd.
|
||||||
|
This License does not grant permission to use these trademarks without prior
|
||||||
|
written consent.
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
APACHE LICENSE 2.0
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
Apache License
|
||||||
|
Version 2.0, January 2004
|
||||||
|
http://www.apache.org/licenses/
|
||||||
|
|
||||||
|
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||||
|
|
||||||
|
1. Definitions.
|
||||||
|
|
||||||
|
"License" shall mean the terms and conditions for use, reproduction,
|
||||||
|
and distribution as defined by Sections 1 through 9 of this document.
|
||||||
|
|
||||||
|
"Licensor" shall mean the copyright owner or entity authorized by
|
||||||
|
the copyright owner that is granting the License.
|
||||||
|
|
||||||
|
"Legal Entity" shall mean the union of the acting entity and all
|
||||||
|
other entities that control, are controlled by, or are under common
|
||||||
|
control with that entity. For the purposes of this definition,
|
||||||
|
"control" means (i) the power, direct or indirect, to cause the
|
||||||
|
direction or management of such entity, whether by contract or
|
||||||
|
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||||
|
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||||
|
|
||||||
|
"You" (or "Your") shall mean an individual or Legal Entity
|
||||||
|
exercising permissions granted by this License.
|
||||||
|
|
||||||
|
"Source" form shall mean the preferred form for making modifications,
|
||||||
|
including but not limited to software source code, documentation
|
||||||
|
source, and configuration files.
|
||||||
|
|
||||||
|
"Object" form shall mean any form resulting from mechanical
|
||||||
|
transformation or translation of a Source form, including but
|
||||||
|
not limited to compiled object code, generated documentation,
|
||||||
|
and conversions to other media types.
|
||||||
|
|
||||||
|
"Work" shall mean the work of authorship, whether in Source or
|
||||||
|
Object form, made available under the License, as indicated by a
|
||||||
|
copyright notice that is included in or attached to the work
|
||||||
|
(an example is provided in the Appendix below).
|
||||||
|
|
||||||
|
"Derivative Works" shall mean any work, whether in Source or Object
|
||||||
|
form, that is based on (or derived from) the Work and for which the
|
||||||
|
editorial revisions, annotations, elaborations, or other modifications
|
||||||
|
represent, as a whole, an original work of authorship. For the purposes
|
||||||
|
of this License, Derivative Works shall not include works that remain
|
||||||
|
separable from, or merely link (or bind by name) to the interfaces of,
|
||||||
|
the Work and Derivative Works thereof.
|
||||||
|
|
||||||
|
"Contribution" shall mean any work of authorship, including
|
||||||
|
the original version of the Work and any modifications or additions
|
||||||
|
to that Work or Derivative Works thereof, that is intentionally
|
||||||
|
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||||
|
or by an individual or Legal Entity authorized to submit on behalf of
|
||||||
|
the copyright owner. For the purposes of this definition, "submitted"
|
||||||
|
means any form of electronic, verbal, or written communication sent
|
||||||
|
to the Licensor or its representatives, including but not limited to
|
||||||
|
communication on electronic mailing lists, source code control systems,
|
||||||
|
and issue tracking systems that are managed by, or on behalf of, the
|
||||||
|
Licensor for the purpose of discussing and improving the Work, but
|
||||||
|
excluding communication that is conspicuously marked or otherwise
|
||||||
|
designated in writing by the copyright owner as "Not a Contribution."
|
||||||
|
|
||||||
|
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||||
|
on behalf of whom a Contribution has been received by Licensor and
|
||||||
|
subsequently incorporated within the Work.
|
||||||
|
|
||||||
|
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
copyright license to reproduce, prepare Derivative Works of,
|
||||||
|
publicly display, publicly perform, sublicense, and distribute the
|
||||||
|
Work and such Derivative Works in Source or Object form.
|
||||||
|
|
||||||
|
3. Grant of Patent License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
(except as stated in this section) patent license to make, have made,
|
||||||
|
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||||
|
where such license applies only to those patent claims licensable
|
||||||
|
by such Contributor that are necessarily infringed by their
|
||||||
|
Contribution(s) alone or by combination of their Contribution(s)
|
||||||
|
with the Work to which such Contribution(s) was submitted. If You
|
||||||
|
institute patent litigation against any entity (including a
|
||||||
|
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||||
|
or a Contribution incorporated within the Work constitutes direct
|
||||||
|
or contributory patent infringement, then any patent licenses
|
||||||
|
granted to You under this License for that Work shall terminate
|
||||||
|
as of the date such litigation is filed.
|
||||||
|
|
||||||
|
4. Redistribution. You may reproduce and distribute copies of the
|
||||||
|
Work or Derivative Works thereof in any medium, with or without
|
||||||
|
modifications, and in Source or Object form, provided that You
|
||||||
|
meet the following conditions:
|
||||||
|
|
||||||
|
(a) You must give any other recipients of the Work or
|
||||||
|
Derivative Works a copy of this License; and
|
||||||
|
|
||||||
|
(b) You must cause any modified files to carry prominent notices
|
||||||
|
stating that You changed the files; and
|
||||||
|
|
||||||
|
(c) You must retain, in the Source form of any Derivative Works
|
||||||
|
that You distribute, all copyright, patent, trademark, and
|
||||||
|
attribution notices from the Source form of the Work,
|
||||||
|
excluding those notices that do not pertain to any part of
|
||||||
|
the Derivative Works; and
|
||||||
|
|
||||||
|
(d) If the Work includes a "NOTICE" text file as part of its
|
||||||
|
distribution, then any Derivative Works that You distribute must
|
||||||
|
include a readable copy of the attribution notices contained
|
||||||
|
within such NOTICE file, excluding those notices that do not
|
||||||
|
pertain to any part of the Derivative Works, in at least one
|
||||||
|
of the following places: within a NOTICE text file distributed
|
||||||
|
as part of the Derivative Works; within the Source form or
|
||||||
|
documentation, if provided along with the Derivative Works; or,
|
||||||
|
within a display generated by the Derivative Works, if and
|
||||||
|
wherever such third-party notices normally appear. The contents
|
||||||
|
of the NOTICE file are for informational purposes only and
|
||||||
|
do not modify the License. You may add Your own attribution
|
||||||
|
notices within Derivative Works that You distribute, alongside
|
||||||
|
or as an addendum to the NOTICE text from the Work, provided
|
||||||
|
that such additional attribution notices cannot be construed
|
||||||
|
as modifying the License.
|
||||||
|
|
||||||
|
You may add Your own copyright statement to Your modifications and
|
||||||
|
may provide additional or different license terms and conditions
|
||||||
|
for use, reproduction, or distribution of Your modifications, or
|
||||||
|
for any such Derivative Works as a whole, provided Your use,
|
||||||
|
reproduction, and distribution of the Work otherwise complies with
|
||||||
|
the conditions stated in this License.
|
||||||
|
|
||||||
|
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||||
|
any Contribution intentionally submitted for inclusion in the Work
|
||||||
|
by You to the Licensor shall be under the terms and conditions of
|
||||||
|
this License, without any additional terms or conditions.
|
||||||
|
Notwithstanding the above, nothing herein shall supersede or modify
|
||||||
|
the terms of any separate license agreement you may have executed
|
||||||
|
with Licensor regarding such Contributions.
|
||||||
|
|
||||||
|
6. Trademarks. This License does not grant permission to use the trade
|
||||||
|
names, trademarks, service marks, or product names of the Licensor,
|
||||||
|
except as required for reasonable and customary use in describing the
|
||||||
|
origin of the Work and reproducing the content of the NOTICE file.
|
||||||
|
|
||||||
|
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||||
|
agreed to in writing, Licensor provides the Work (and each
|
||||||
|
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||||
|
implied, including, without limitation, any warranties or conditions
|
||||||
|
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||||
|
appropriateness of using or redistributing the Work and assume any
|
||||||
|
risks associated with Your exercise of permissions under this License.
|
||||||
|
|
||||||
|
8. Limitation of Liability. In no event and under no legal theory,
|
||||||
|
whether in tort (including negligence), contract, or otherwise,
|
||||||
|
unless required by applicable law (such as deliberate and grossly
|
||||||
|
negligent acts) or agreed to in writing, shall any Contributor be
|
||||||
|
liable to You for damages, including any direct, indirect, special,
|
||||||
|
incidental, or consequential damages of any character arising as a
|
||||||
|
result of this License or out of the use or inability to use the
|
||||||
|
Work (including but not limited to damages for loss of goodwill,
|
||||||
|
work stoppage, computer failure or malfunction, or any and all
|
||||||
|
other commercial damages or losses), even if such Contributor
|
||||||
|
has been advised of the possibility of such damages.
|
||||||
|
|
||||||
|
9. Accepting Warranty or Additional Liability. While redistributing
|
||||||
|
the Work or Derivative Works thereof, You may choose to offer,
|
||||||
|
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||||
|
or other liability obligations and/or rights consistent with this
|
||||||
|
License. However, in accepting such obligations, You may act only
|
||||||
|
on Your own behalf and on Your sole responsibility, not on behalf
|
||||||
|
of any other Contributor, and only if You agree to indemnify,
|
||||||
|
defend, and hold each Contributor harmless for any liability
|
||||||
|
incurred by, or claims asserted against, such Contributor by reason
|
||||||
|
of your accepting any such warranty or additional liability.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
249
README.md
Normal file
249
README.md
Normal file
@@ -0,0 +1,249 @@
|
|||||||
|
---
|
||||||
|
tags:
|
||||||
|
- sentence-transformers
|
||||||
|
- sentence-similarity
|
||||||
|
- dense-encoder
|
||||||
|
- dense
|
||||||
|
- feature-extraction
|
||||||
|
- retrieval
|
||||||
|
- multimodal
|
||||||
|
- multi-modal
|
||||||
|
- crossmodal
|
||||||
|
- cross-modal
|
||||||
|
- aerospace
|
||||||
|
- telepix
|
||||||
|
language:
|
||||||
|
- af
|
||||||
|
- ar
|
||||||
|
- az
|
||||||
|
- be
|
||||||
|
- bg
|
||||||
|
- bn
|
||||||
|
- ca
|
||||||
|
- ceb
|
||||||
|
- cs
|
||||||
|
- cy
|
||||||
|
- da
|
||||||
|
- de
|
||||||
|
- el
|
||||||
|
- en
|
||||||
|
- es
|
||||||
|
- et
|
||||||
|
- eu
|
||||||
|
- fa
|
||||||
|
- fi
|
||||||
|
- fr
|
||||||
|
- gl
|
||||||
|
- gu
|
||||||
|
- he
|
||||||
|
- hi
|
||||||
|
- hr
|
||||||
|
- ht
|
||||||
|
- hu
|
||||||
|
- hy
|
||||||
|
- id
|
||||||
|
- is
|
||||||
|
- it
|
||||||
|
- ja
|
||||||
|
- jv
|
||||||
|
- ka
|
||||||
|
- kk
|
||||||
|
- km
|
||||||
|
- kn
|
||||||
|
- ko
|
||||||
|
- ky
|
||||||
|
- lo
|
||||||
|
- lt
|
||||||
|
- lv
|
||||||
|
- mk
|
||||||
|
- ml
|
||||||
|
- mn
|
||||||
|
- mr
|
||||||
|
- ms
|
||||||
|
- my
|
||||||
|
- ne
|
||||||
|
- nl
|
||||||
|
- pa
|
||||||
|
- pl
|
||||||
|
- pt
|
||||||
|
- qu
|
||||||
|
- ro
|
||||||
|
- ru
|
||||||
|
- si
|
||||||
|
- sk
|
||||||
|
- sl
|
||||||
|
- so
|
||||||
|
- sq
|
||||||
|
- sr
|
||||||
|
- sv
|
||||||
|
- sw
|
||||||
|
- ta
|
||||||
|
- te
|
||||||
|
- th
|
||||||
|
- tl
|
||||||
|
- tr
|
||||||
|
- uk
|
||||||
|
- ur
|
||||||
|
- vi
|
||||||
|
- yo
|
||||||
|
- zh
|
||||||
|
pipeline_tag: feature-extraction
|
||||||
|
library_name: sentence-transformers
|
||||||
|
license: apache-2.0
|
||||||
|
---
|
||||||
|
<p align="center">
|
||||||
|
<img src="https://cdn-uploads.huggingface.co/production/uploads/61d6f4a4d49065ee28a1ee7e/V8n2En7BlMNHoi1YXVv8Q.png" width="400"/>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
# PIXIE-Rune-v1.0
|
||||||
|
**PIXIE-Rune-v1.0** is an encoder-based embedding model trained on Korean and English information retrieval datasets,
|
||||||
|
developed by [TelePIX Co., Ltd](https://telepix.net/).
|
||||||
|
**PIXIE** stands for Tele**PIX** **I**ntelligent **E**mbedding, representing TelePIX’s high-performance embedding technology.
|
||||||
|
This model is specifically optimized for semantic retrieval tasks in Korean and English, and demonstrates strong performance in the aerospace domain. Through extensive fine-tuning and domain-specific evaluation, PIXIE shows robust retrieval quality for real-world use cases such as document understanding, technical QA, and semantic search in aerospace and related high-precision fields.
|
||||||
|
It also performs competitively across a wide range of open-domain Korean and English retrieval benchmarks, making it a versatile foundation for multilingual semantic search systems.
|
||||||
|
|
||||||
|
|
||||||
|
## Model Description
|
||||||
|
- **Model Type:** Sentence Transformer
|
||||||
|
- **Base model:** [Snowflake/snowflake-arctic-embed-l-v2.0](https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0)
|
||||||
|
- **Maximum Sequence Length:** 6144 tokens
|
||||||
|
- **Output Dimensionality:** 1024 dimensions
|
||||||
|
- **Similarity Function:** Cosine Similarity
|
||||||
|
- **Language:** Multilingual — optimized for high performance in Korean and English
|
||||||
|
- **Domain Specialization:** Aerospace Information Retrieval
|
||||||
|
- **License:** apache-2.0
|
||||||
|
|
||||||
|
### Full Model Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
SentenceTransformer(
|
||||||
|
(0): Transformer({'max_seq_length': 6144, 'do_lower_case': False}) with Transformer model: XLMRobertaModel
|
||||||
|
(1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
|
||||||
|
(2): Normalize()
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Quality Benchmarks
|
||||||
|
**PIXIE-Rune-v1.0** is a multilingual embedding model specialized for Korean and English retrieval tasks.
|
||||||
|
It delivers consistently strong performance across a diverse set of domain-specific and open-domain benchmarks in both languages, demonstrating its effectiveness in real-world semantic search applications.
|
||||||
|
The table below presents the retrieval performance of several embedding models evaluated on a variety of Korean and English benchmarks.
|
||||||
|
We report **Normalized Discounted Cumulative Gain (nDCG@10)** scores, which measure how well a ranked list of documents aligns with ground truth relevance. Higher values indicate better retrieval quality.
|
||||||
|
|
||||||
|
All evaluations were conducted using the open-source **[Korean-MTEB-Retrieval-Evaluators](https://github.com/BM-K/Korean-MTEB-Retrieval-Evaluators)** codebase to ensure consistent dataset handling, indexing, retrieval, and nDCG@10 computation across models.
|
||||||
|
|
||||||
|
### Benchmark Overview and Dataset Descriptions
|
||||||
|
| Model Name | # params | STELLA (XL) | MTEB (ko) | BEIR (en) |
|
||||||
|
|------|:---:|:---:|:---:|:---:|
|
||||||
|
| **telepix/PIXIE-Rune-v1.0** | **0.5B** | **0.6345** | **0.7603** | **0.5872** |
|
||||||
|
| | | | | |
|
||||||
|
| nvidia/llama-embed-nemotron-8b | 8B | 0.7181 | 0.7813 | 0.6935 |
|
||||||
|
| Qwen/Qwen3-Embedding-8B | 8B | 0.6154 | 0.7839 | 0.6701 |
|
||||||
|
| Snowflake/snowflake-arctic-embed-l-v2.0 | 0.5B | 0.5448 | 0.7390 | 0.6006 |
|
||||||
|
| BAAI/bge-m3 | 0.5B | 0.5056 | 0.7483 | 0.5573 |
|
||||||
|
| Qwen/Qwen3-Embedding-0.6B | 0.6B | 0.4707 | 0.7017 | 0.5839 |
|
||||||
|
| Octen/Octen-Embedding-0.6B | 0.6B | 0.4683 | 0.7057 | 0.5769 |
|
||||||
|
| Salesforce/SFR-Embedding-Mistral | 7B | 0.4579 | N/A | N/A |
|
||||||
|
| Alibaba-NLP/gte-multilingual-base | 0.3B | 0.4097 | 0.7084 | 0.5746 |
|
||||||
|
| intfloat/multilingual-e5-large-instruct | 0.6B | 0.2384 | 0.7050 | N/A |
|
||||||
|
| jinaai/jina-embeddings-v3 | 0.5B | N/A | 0.7088 | 0.4861 |
|
||||||
|
| openai/text-embedding-3-large | N/A | N/A | 0.6646 | N/A |
|
||||||
|
|
||||||
|
To better interpret the evaluation results above, we briefly describe the characteristics and evaluation intent of each benchmark suite used in this comparison.
|
||||||
|
Each benchmark is designed to assess different aspects of retrieval capability, ranging from domain-specific technical understanding to open-domain and multilingual generalization.
|
||||||
|
|
||||||
|
#### STELLA
|
||||||
|
[STELLA](https://arxiv.org/abs/2601.03496) is an aerospace-domain Information Retrieval (IR) benchmark constructed from NASA Technical Reports Server (NTRS) documents. It is designed to evaluate both:
|
||||||
|
|
||||||
|
- **Lexical matching** ability (does the retriever benefit from exact technical terms? | TCQ)
|
||||||
|
- **Semantic matching** ability (can the retriever match concepts even when technical terms are not explicitly used? | TAQ).
|
||||||
|
|
||||||
|
STELLA provides **dual-type synthetic queries** and a **cross-lingual extension** for multilingual evaluation while keeping the corpus in English.
|
||||||
|
|
||||||
|
#### 6 Datasets of MTEB (Korean)
|
||||||
|
Descriptions of the benchmark datasets used for evaluation are as follows:
|
||||||
|
- **Ko-StrategyQA**
|
||||||
|
A Korean multi-hop open-domain question answering dataset designed for complex reasoning over multiple documents.
|
||||||
|
- **AutoRAGRetrieval**
|
||||||
|
A domain-diverse retrieval dataset covering finance, government, healthcare, legal, and e-commerce sectors.
|
||||||
|
- **MIRACLRetrieval**
|
||||||
|
A document retrieval benchmark built on Korean Wikipedia articles.
|
||||||
|
- **PublicHealthQA**
|
||||||
|
A retrieval dataset focused on medical and public health topics.
|
||||||
|
- **BelebeleRetrieval**
|
||||||
|
A dataset for retrieving relevant content from web and news articles in Korean.
|
||||||
|
- **MultiLongDocRetrieval**
|
||||||
|
A long-document retrieval benchmark based on Korean Wikipedia and the mC4 corpus.
|
||||||
|
|
||||||
|
#### 7 Datasets of BEIR (English)
|
||||||
|
Descriptions of the benchmark datasets used for evaluation are as follows:
|
||||||
|
- **ArguAna**
|
||||||
|
A dataset for argument retrieval based on claim-counterclaim pairs from online debate forums.
|
||||||
|
- **FEVER**
|
||||||
|
A fact verification dataset using Wikipedia for evidence-based claim validation.
|
||||||
|
- **FiQA-2018**
|
||||||
|
A retrieval benchmark tailored to the finance domain with real-world questions and answers.
|
||||||
|
- **HotpotQA**
|
||||||
|
A multi-hop open-domain QA dataset requiring reasoning across multiple documents.
|
||||||
|
- **MSMARCO**
|
||||||
|
A large-scale benchmark using real Bing search queries and corresponding web documents.
|
||||||
|
- **NQ**
|
||||||
|
A Google QA dataset where user questions are answered using Wikipedia articles.
|
||||||
|
- **SCIDOCS**
|
||||||
|
A citation-based document retrieval dataset focused on scientific papers.
|
||||||
|
|
||||||
|
## Direct Use (Semantic Search)
|
||||||
|
|
||||||
|
```python
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
|
# Load the model
|
||||||
|
model_name = 'telepix/PIXIE-Rune-v1.0'
|
||||||
|
model = SentenceTransformer(model_name)
|
||||||
|
|
||||||
|
# Define the queries and documents
|
||||||
|
queries = [
|
||||||
|
"텔레픽스는 어떤 산업 분야에서 위성 데이터를 활용하나요?",
|
||||||
|
"국방 분야에 어떤 위성 서비스가 제공되나요?",
|
||||||
|
"텔레픽스의 기술 수준은 어느 정도인가요?",
|
||||||
|
]
|
||||||
|
documents = [
|
||||||
|
"텔레픽스는 해양, 자원, 농업 등 다양한 분야에서 위성 데이터를 분석하여 서비스를 제공합니다.",
|
||||||
|
"정찰 및 감시 목적의 위성 영상을 통해 국방 관련 정밀 분석 서비스를 제공합니다.",
|
||||||
|
"TelePIX의 광학 탑재체 및 AI 분석 기술은 Global standard를 상회하는 수준으로 평가받고 있습니다.",
|
||||||
|
"텔레픽스는 우주에서 수집한 정보를 분석하여 '우주 경제(Space Economy)'라는 새로운 가치를 창출하고 있습니다.",
|
||||||
|
"텔레픽스는 위성 영상 획득부터 분석, 서비스 제공까지 전 주기를 아우르는 솔루션을 제공합니다.",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Compute embeddings: use `prompt_name="query"` to encode queries!
|
||||||
|
query_embeddings = model.encode(queries, prompt_name="query")
|
||||||
|
document_embeddings = model.encode(documents)
|
||||||
|
|
||||||
|
# Compute cosine similarity scores
|
||||||
|
scores = model.similarity(query_embeddings, document_embeddings)
|
||||||
|
|
||||||
|
# Output the results
|
||||||
|
for query, query_scores in zip(queries, scores):
|
||||||
|
doc_score_pairs = list(zip(documents, query_scores))
|
||||||
|
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
|
||||||
|
print("Query:", query)
|
||||||
|
for document, score in doc_score_pairs:
|
||||||
|
print(score, document)
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
## License
|
||||||
|
The PIXIE-Rune-v1.0 model is licensed under Apache License 2.0.
|
||||||
|
|
||||||
|
## Citation
|
||||||
|
```
|
||||||
|
@misc{TelePIX-PIXIE-Rune-v1.0,
|
||||||
|
title={PIXIE-Rune-v1.0},
|
||||||
|
author={TelePIX AI Research Team and Bongmin Kim},
|
||||||
|
year={2026},
|
||||||
|
url={https://huggingface.co/telepix/PIXIE-Rune-v1.0}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Contact
|
||||||
|
|
||||||
|
If you have any suggestions or questions about PIXIE, please reach out to the authors at bmkim@telepix.net.
|
||||||
30
config.json
Normal file
30
config.json
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
{
|
||||||
|
"architectures": [
|
||||||
|
"XLMRobertaModel"
|
||||||
|
],
|
||||||
|
"attention_probs_dropout_prob": 0.1,
|
||||||
|
"bos_token_id": 0,
|
||||||
|
"classifier_dropout": null,
|
||||||
|
"dtype": "float32",
|
||||||
|
"eos_token_id": 2,
|
||||||
|
"hidden_act": "gelu",
|
||||||
|
"hidden_dropout_prob": 0.1,
|
||||||
|
"hidden_size": 1024,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 4096,
|
||||||
|
"layer_norm_eps": 1e-05,
|
||||||
|
"matryoshka_dimensions": [
|
||||||
|
256
|
||||||
|
],
|
||||||
|
"max_position_embeddings": 8194,
|
||||||
|
"model_type": "xlm-roberta",
|
||||||
|
"num_attention_heads": 16,
|
||||||
|
"num_hidden_layers": 24,
|
||||||
|
"output_past": true,
|
||||||
|
"pad_token_id": 1,
|
||||||
|
"position_embedding_type": "absolute",
|
||||||
|
"transformers_version": "4.56.2",
|
||||||
|
"type_vocab_size": 1,
|
||||||
|
"use_cache": true,
|
||||||
|
"vocab_size": 250002
|
||||||
|
}
|
||||||
14
config_sentence_transformers.json
Normal file
14
config_sentence_transformers.json
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
{
|
||||||
|
"__version__": {
|
||||||
|
"sentence_transformers": "5.1.2",
|
||||||
|
"transformers": "4.56.2",
|
||||||
|
"pytorch": "2.8.0+cu128"
|
||||||
|
},
|
||||||
|
"prompts": {
|
||||||
|
"query": "query: ",
|
||||||
|
"document": ""
|
||||||
|
},
|
||||||
|
"default_prompt_name": null,
|
||||||
|
"model_type": "SentenceTransformer",
|
||||||
|
"similarity_fn_name": "cosine"
|
||||||
|
}
|
||||||
3
model.safetensors
Normal file
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:eef21979fce2049a10048bddc7ac464b1d4617b04f82359cea2632350566d5ea
|
||||||
|
size 2271064456
|
||||||
20
modules.json
Normal file
20
modules.json
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"idx": 0,
|
||||||
|
"name": "0",
|
||||||
|
"path": "",
|
||||||
|
"type": "sentence_transformers.models.Transformer"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"idx": 1,
|
||||||
|
"name": "1",
|
||||||
|
"path": "1_Pooling",
|
||||||
|
"type": "sentence_transformers.models.Pooling"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"idx": 2,
|
||||||
|
"name": "2",
|
||||||
|
"path": "2_Normalize",
|
||||||
|
"type": "sentence_transformers.models.Normalize"
|
||||||
|
}
|
||||||
|
]
|
||||||
3
onnx/model.onnx
Normal file
3
onnx/model.onnx
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:fa56838400193ffc08cda0a7a130abefde5035eb652d1a63a63bb9d7d39321b3
|
||||||
|
size 614872
|
||||||
3
onnx/model.onnx_data
Normal file
3
onnx/model.onnx_data
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:84d64606ff97e4abfbac9b03dc713c1fe3de955a66ea9f46c20c6a12d5f0512b
|
||||||
|
size 2266886160
|
||||||
4
sentence_bert_config.json
Normal file
4
sentence_bert_config.json
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
{
|
||||||
|
"max_seq_length": 6144,
|
||||||
|
"do_lower_case": false
|
||||||
|
}
|
||||||
51
special_tokens_map.json
Normal file
51
special_tokens_map.json
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
{
|
||||||
|
"bos_token": {
|
||||||
|
"content": "<s>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"cls_token": {
|
||||||
|
"content": "<s>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"eos_token": {
|
||||||
|
"content": "</s>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"mask_token": {
|
||||||
|
"content": "<mask>",
|
||||||
|
"lstrip": true,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"pad_token": {
|
||||||
|
"content": "<pad>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"sep_token": {
|
||||||
|
"content": "</s>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"unk_token": {
|
||||||
|
"content": "<unk>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
}
|
||||||
|
}
|
||||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:33b81d158d38590c83f2e3b4c3644f5eb63136c14a77f6bdb147913ad33a9bbc
|
||||||
|
size 17083054
|
||||||
62
tokenizer_config.json
Normal file
62
tokenizer_config.json
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
{
|
||||||
|
"added_tokens_decoder": {
|
||||||
|
"0": {
|
||||||
|
"content": "<s>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"1": {
|
||||||
|
"content": "<pad>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"content": "</s>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"3": {
|
||||||
|
"content": "<unk>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"250001": {
|
||||||
|
"content": "<mask>",
|
||||||
|
"lstrip": true,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"bos_token": "<s>",
|
||||||
|
"clean_up_tokenization_spaces": true,
|
||||||
|
"cls_token": "<s>",
|
||||||
|
"eos_token": "</s>",
|
||||||
|
"extra_special_tokens": {},
|
||||||
|
"mask_token": "<mask>",
|
||||||
|
"max_length": 512,
|
||||||
|
"model_max_length": 6144,
|
||||||
|
"pad_to_multiple_of": null,
|
||||||
|
"pad_token": "<pad>",
|
||||||
|
"pad_token_type_id": 0,
|
||||||
|
"padding_side": "right",
|
||||||
|
"sep_token": "</s>",
|
||||||
|
"stride": 0,
|
||||||
|
"tokenizer_class": "XLMRobertaTokenizer",
|
||||||
|
"truncation_side": "right",
|
||||||
|
"truncation_strategy": "longest_first",
|
||||||
|
"unk_token": "<unk>"
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user