初始化项目,由ModelHub XC社区提供模型
Model: telepix/PIXIE-Rune-v1.0 Source: Original Platform
This commit is contained in:
37
.gitattributes
vendored
Normal file
37
.gitattributes
vendored
Normal file
@@ -0,0 +1,37 @@
|
||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||
*.model filter=lfs diff=lfs merge=lfs -text
|
||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||
onnx/model.onnx_data filter=lfs diff=lfs merge=lfs -text
|
||||
10
1_Pooling/config.json
Normal file
10
1_Pooling/config.json
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"word_embedding_dimension": 1024,
|
||||
"pooling_mode_cls_token": true,
|
||||
"pooling_mode_mean_tokens": false,
|
||||
"pooling_mode_max_tokens": false,
|
||||
"pooling_mode_mean_sqrt_len_tokens": false,
|
||||
"pooling_mode_weightedmean_tokens": false,
|
||||
"pooling_mode_lasttoken": false,
|
||||
"include_prompt": true
|
||||
}
|
||||
212
LICENSE
Normal file
212
LICENSE
Normal file
@@ -0,0 +1,212 @@
|
||||
Copyright (c) 2026 TelePIX Co., Ltd. All rights reserved.
|
||||
|
||||
Built with snowflake-arctic-embed-l-v2.0 — original model by Snowflake, licensed under the Apache License 2.0.
|
||||
|
||||
Unless otherwise stated, all files in this repository (including modified model weights
|
||||
and tokenizer files) are distributed under the terms of the Apache License, Version 2.0
|
||||
(the "License"). You may obtain a copy of the License at:
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under
|
||||
the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
|
||||
ANY KIND, either express or implied. See the License for the specific language governing
|
||||
permissions and limitations under the License.
|
||||
|
||||
================================================================================
|
||||
NOTICE (Apache-2.0 §4 d)
|
||||
================================================================================
|
||||
|
||||
This product is built with snowflake-arctic-embed-l-v2.0 developed by Snowflake.
|
||||
(https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0) and distributed under the Apache License 2.0.
|
||||
|
||||
See the upstream snowflake-arctic-embed-l-v2.0 for additional attribution details.
|
||||
|
||||
================================================================================
|
||||
TRADEMARK
|
||||
================================================================================
|
||||
|
||||
"TelePIX" and associated logos are trademarks of TelePIX Co., Ltd.
|
||||
This License does not grant permission to use these trademarks without prior
|
||||
written consent.
|
||||
|
||||
================================================================================
|
||||
APACHE LICENSE 2.0
|
||||
================================================================================
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
249
README.md
Normal file
249
README.md
Normal file
@@ -0,0 +1,249 @@
|
||||
---
|
||||
tags:
|
||||
- sentence-transformers
|
||||
- sentence-similarity
|
||||
- dense-encoder
|
||||
- dense
|
||||
- feature-extraction
|
||||
- retrieval
|
||||
- multimodal
|
||||
- multi-modal
|
||||
- crossmodal
|
||||
- cross-modal
|
||||
- aerospace
|
||||
- telepix
|
||||
language:
|
||||
- af
|
||||
- ar
|
||||
- az
|
||||
- be
|
||||
- bg
|
||||
- bn
|
||||
- ca
|
||||
- ceb
|
||||
- cs
|
||||
- cy
|
||||
- da
|
||||
- de
|
||||
- el
|
||||
- en
|
||||
- es
|
||||
- et
|
||||
- eu
|
||||
- fa
|
||||
- fi
|
||||
- fr
|
||||
- gl
|
||||
- gu
|
||||
- he
|
||||
- hi
|
||||
- hr
|
||||
- ht
|
||||
- hu
|
||||
- hy
|
||||
- id
|
||||
- is
|
||||
- it
|
||||
- ja
|
||||
- jv
|
||||
- ka
|
||||
- kk
|
||||
- km
|
||||
- kn
|
||||
- ko
|
||||
- ky
|
||||
- lo
|
||||
- lt
|
||||
- lv
|
||||
- mk
|
||||
- ml
|
||||
- mn
|
||||
- mr
|
||||
- ms
|
||||
- my
|
||||
- ne
|
||||
- nl
|
||||
- pa
|
||||
- pl
|
||||
- pt
|
||||
- qu
|
||||
- ro
|
||||
- ru
|
||||
- si
|
||||
- sk
|
||||
- sl
|
||||
- so
|
||||
- sq
|
||||
- sr
|
||||
- sv
|
||||
- sw
|
||||
- ta
|
||||
- te
|
||||
- th
|
||||
- tl
|
||||
- tr
|
||||
- uk
|
||||
- ur
|
||||
- vi
|
||||
- yo
|
||||
- zh
|
||||
pipeline_tag: feature-extraction
|
||||
library_name: sentence-transformers
|
||||
license: apache-2.0
|
||||
---
|
||||
<p align="center">
|
||||
<img src="https://cdn-uploads.huggingface.co/production/uploads/61d6f4a4d49065ee28a1ee7e/V8n2En7BlMNHoi1YXVv8Q.png" width="400"/>
|
||||
</p>
|
||||
|
||||
# PIXIE-Rune-v1.0
|
||||
**PIXIE-Rune-v1.0** is an encoder-based embedding model trained on Korean and English information retrieval datasets,
|
||||
developed by [TelePIX Co., Ltd](https://telepix.net/).
|
||||
**PIXIE** stands for Tele**PIX** **I**ntelligent **E**mbedding, representing TelePIX’s high-performance embedding technology.
|
||||
This model is specifically optimized for semantic retrieval tasks in Korean and English, and demonstrates strong performance in the aerospace domain. Through extensive fine-tuning and domain-specific evaluation, PIXIE shows robust retrieval quality for real-world use cases such as document understanding, technical QA, and semantic search in aerospace and related high-precision fields.
|
||||
It also performs competitively across a wide range of open-domain Korean and English retrieval benchmarks, making it a versatile foundation for multilingual semantic search systems.
|
||||
|
||||
|
||||
## Model Description
|
||||
- **Model Type:** Sentence Transformer
|
||||
- **Base model:** [Snowflake/snowflake-arctic-embed-l-v2.0](https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0)
|
||||
- **Maximum Sequence Length:** 6144 tokens
|
||||
- **Output Dimensionality:** 1024 dimensions
|
||||
- **Similarity Function:** Cosine Similarity
|
||||
- **Language:** Multilingual — optimized for high performance in Korean and English
|
||||
- **Domain Specialization:** Aerospace Information Retrieval
|
||||
- **License:** apache-2.0
|
||||
|
||||
### Full Model Architecture
|
||||
|
||||
```
|
||||
SentenceTransformer(
|
||||
(0): Transformer({'max_seq_length': 6144, 'do_lower_case': False}) with Transformer model: XLMRobertaModel
|
||||
(1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
|
||||
(2): Normalize()
|
||||
)
|
||||
```
|
||||
|
||||
## Quality Benchmarks
|
||||
**PIXIE-Rune-v1.0** is a multilingual embedding model specialized for Korean and English retrieval tasks.
|
||||
It delivers consistently strong performance across a diverse set of domain-specific and open-domain benchmarks in both languages, demonstrating its effectiveness in real-world semantic search applications.
|
||||
The table below presents the retrieval performance of several embedding models evaluated on a variety of Korean and English benchmarks.
|
||||
We report **Normalized Discounted Cumulative Gain (nDCG@10)** scores, which measure how well a ranked list of documents aligns with ground truth relevance. Higher values indicate better retrieval quality.
|
||||
|
||||
All evaluations were conducted using the open-source **[Korean-MTEB-Retrieval-Evaluators](https://github.com/BM-K/Korean-MTEB-Retrieval-Evaluators)** codebase to ensure consistent dataset handling, indexing, retrieval, and nDCG@10 computation across models.
|
||||
|
||||
### Benchmark Overview and Dataset Descriptions
|
||||
| Model Name | # params | STELLA (XL) | MTEB (ko) | BEIR (en) |
|
||||
|------|:---:|:---:|:---:|:---:|
|
||||
| **telepix/PIXIE-Rune-v1.0** | **0.5B** | **0.6345** | **0.7603** | **0.5872** |
|
||||
| | | | | |
|
||||
| nvidia/llama-embed-nemotron-8b | 8B | 0.7181 | 0.7813 | 0.6935 |
|
||||
| Qwen/Qwen3-Embedding-8B | 8B | 0.6154 | 0.7839 | 0.6701 |
|
||||
| Snowflake/snowflake-arctic-embed-l-v2.0 | 0.5B | 0.5448 | 0.7390 | 0.6006 |
|
||||
| BAAI/bge-m3 | 0.5B | 0.5056 | 0.7483 | 0.5573 |
|
||||
| Qwen/Qwen3-Embedding-0.6B | 0.6B | 0.4707 | 0.7017 | 0.5839 |
|
||||
| Octen/Octen-Embedding-0.6B | 0.6B | 0.4683 | 0.7057 | 0.5769 |
|
||||
| Salesforce/SFR-Embedding-Mistral | 7B | 0.4579 | N/A | N/A |
|
||||
| Alibaba-NLP/gte-multilingual-base | 0.3B | 0.4097 | 0.7084 | 0.5746 |
|
||||
| intfloat/multilingual-e5-large-instruct | 0.6B | 0.2384 | 0.7050 | N/A |
|
||||
| jinaai/jina-embeddings-v3 | 0.5B | N/A | 0.7088 | 0.4861 |
|
||||
| openai/text-embedding-3-large | N/A | N/A | 0.6646 | N/A |
|
||||
|
||||
To better interpret the evaluation results above, we briefly describe the characteristics and evaluation intent of each benchmark suite used in this comparison.
|
||||
Each benchmark is designed to assess different aspects of retrieval capability, ranging from domain-specific technical understanding to open-domain and multilingual generalization.
|
||||
|
||||
#### STELLA
|
||||
[STELLA](https://arxiv.org/abs/2601.03496) is an aerospace-domain Information Retrieval (IR) benchmark constructed from NASA Technical Reports Server (NTRS) documents. It is designed to evaluate both:
|
||||
|
||||
- **Lexical matching** ability (does the retriever benefit from exact technical terms? | TCQ)
|
||||
- **Semantic matching** ability (can the retriever match concepts even when technical terms are not explicitly used? | TAQ).
|
||||
|
||||
STELLA provides **dual-type synthetic queries** and a **cross-lingual extension** for multilingual evaluation while keeping the corpus in English.
|
||||
|
||||
#### 6 Datasets of MTEB (Korean)
|
||||
Descriptions of the benchmark datasets used for evaluation are as follows:
|
||||
- **Ko-StrategyQA**
|
||||
A Korean multi-hop open-domain question answering dataset designed for complex reasoning over multiple documents.
|
||||
- **AutoRAGRetrieval**
|
||||
A domain-diverse retrieval dataset covering finance, government, healthcare, legal, and e-commerce sectors.
|
||||
- **MIRACLRetrieval**
|
||||
A document retrieval benchmark built on Korean Wikipedia articles.
|
||||
- **PublicHealthQA**
|
||||
A retrieval dataset focused on medical and public health topics.
|
||||
- **BelebeleRetrieval**
|
||||
A dataset for retrieving relevant content from web and news articles in Korean.
|
||||
- **MultiLongDocRetrieval**
|
||||
A long-document retrieval benchmark based on Korean Wikipedia and the mC4 corpus.
|
||||
|
||||
#### 7 Datasets of BEIR (English)
|
||||
Descriptions of the benchmark datasets used for evaluation are as follows:
|
||||
- **ArguAna**
|
||||
A dataset for argument retrieval based on claim-counterclaim pairs from online debate forums.
|
||||
- **FEVER**
|
||||
A fact verification dataset using Wikipedia for evidence-based claim validation.
|
||||
- **FiQA-2018**
|
||||
A retrieval benchmark tailored to the finance domain with real-world questions and answers.
|
||||
- **HotpotQA**
|
||||
A multi-hop open-domain QA dataset requiring reasoning across multiple documents.
|
||||
- **MSMARCO**
|
||||
A large-scale benchmark using real Bing search queries and corresponding web documents.
|
||||
- **NQ**
|
||||
A Google QA dataset where user questions are answered using Wikipedia articles.
|
||||
- **SCIDOCS**
|
||||
A citation-based document retrieval dataset focused on scientific papers.
|
||||
|
||||
## Direct Use (Semantic Search)
|
||||
|
||||
```python
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
# Load the model
|
||||
model_name = 'telepix/PIXIE-Rune-v1.0'
|
||||
model = SentenceTransformer(model_name)
|
||||
|
||||
# Define the queries and documents
|
||||
queries = [
|
||||
"텔레픽스는 어떤 산업 분야에서 위성 데이터를 활용하나요?",
|
||||
"국방 분야에 어떤 위성 서비스가 제공되나요?",
|
||||
"텔레픽스의 기술 수준은 어느 정도인가요?",
|
||||
]
|
||||
documents = [
|
||||
"텔레픽스는 해양, 자원, 농업 등 다양한 분야에서 위성 데이터를 분석하여 서비스를 제공합니다.",
|
||||
"정찰 및 감시 목적의 위성 영상을 통해 국방 관련 정밀 분석 서비스를 제공합니다.",
|
||||
"TelePIX의 광학 탑재체 및 AI 분석 기술은 Global standard를 상회하는 수준으로 평가받고 있습니다.",
|
||||
"텔레픽스는 우주에서 수집한 정보를 분석하여 '우주 경제(Space Economy)'라는 새로운 가치를 창출하고 있습니다.",
|
||||
"텔레픽스는 위성 영상 획득부터 분석, 서비스 제공까지 전 주기를 아우르는 솔루션을 제공합니다.",
|
||||
]
|
||||
|
||||
# Compute embeddings: use `prompt_name="query"` to encode queries!
|
||||
query_embeddings = model.encode(queries, prompt_name="query")
|
||||
document_embeddings = model.encode(documents)
|
||||
|
||||
# Compute cosine similarity scores
|
||||
scores = model.similarity(query_embeddings, document_embeddings)
|
||||
|
||||
# Output the results
|
||||
for query, query_scores in zip(queries, scores):
|
||||
doc_score_pairs = list(zip(documents, query_scores))
|
||||
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
|
||||
print("Query:", query)
|
||||
for document, score in doc_score_pairs:
|
||||
print(score, document)
|
||||
|
||||
```
|
||||
|
||||
## License
|
||||
The PIXIE-Rune-v1.0 model is licensed under Apache License 2.0.
|
||||
|
||||
## Citation
|
||||
```
|
||||
@misc{TelePIX-PIXIE-Rune-v1.0,
|
||||
title={PIXIE-Rune-v1.0},
|
||||
author={TelePIX AI Research Team and Bongmin Kim},
|
||||
year={2026},
|
||||
url={https://huggingface.co/telepix/PIXIE-Rune-v1.0}
|
||||
}
|
||||
```
|
||||
|
||||
## Contact
|
||||
|
||||
If you have any suggestions or questions about PIXIE, please reach out to the authors at bmkim@telepix.net.
|
||||
30
config.json
Normal file
30
config.json
Normal file
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"architectures": [
|
||||
"XLMRobertaModel"
|
||||
],
|
||||
"attention_probs_dropout_prob": 0.1,
|
||||
"bos_token_id": 0,
|
||||
"classifier_dropout": null,
|
||||
"dtype": "float32",
|
||||
"eos_token_id": 2,
|
||||
"hidden_act": "gelu",
|
||||
"hidden_dropout_prob": 0.1,
|
||||
"hidden_size": 1024,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 4096,
|
||||
"layer_norm_eps": 1e-05,
|
||||
"matryoshka_dimensions": [
|
||||
256
|
||||
],
|
||||
"max_position_embeddings": 8194,
|
||||
"model_type": "xlm-roberta",
|
||||
"num_attention_heads": 16,
|
||||
"num_hidden_layers": 24,
|
||||
"output_past": true,
|
||||
"pad_token_id": 1,
|
||||
"position_embedding_type": "absolute",
|
||||
"transformers_version": "4.56.2",
|
||||
"type_vocab_size": 1,
|
||||
"use_cache": true,
|
||||
"vocab_size": 250002
|
||||
}
|
||||
14
config_sentence_transformers.json
Normal file
14
config_sentence_transformers.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"__version__": {
|
||||
"sentence_transformers": "5.1.2",
|
||||
"transformers": "4.56.2",
|
||||
"pytorch": "2.8.0+cu128"
|
||||
},
|
||||
"prompts": {
|
||||
"query": "query: ",
|
||||
"document": ""
|
||||
},
|
||||
"default_prompt_name": null,
|
||||
"model_type": "SentenceTransformer",
|
||||
"similarity_fn_name": "cosine"
|
||||
}
|
||||
3
model.safetensors
Normal file
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:eef21979fce2049a10048bddc7ac464b1d4617b04f82359cea2632350566d5ea
|
||||
size 2271064456
|
||||
20
modules.json
Normal file
20
modules.json
Normal file
@@ -0,0 +1,20 @@
|
||||
[
|
||||
{
|
||||
"idx": 0,
|
||||
"name": "0",
|
||||
"path": "",
|
||||
"type": "sentence_transformers.models.Transformer"
|
||||
},
|
||||
{
|
||||
"idx": 1,
|
||||
"name": "1",
|
||||
"path": "1_Pooling",
|
||||
"type": "sentence_transformers.models.Pooling"
|
||||
},
|
||||
{
|
||||
"idx": 2,
|
||||
"name": "2",
|
||||
"path": "2_Normalize",
|
||||
"type": "sentence_transformers.models.Normalize"
|
||||
}
|
||||
]
|
||||
3
onnx/model.onnx
Normal file
3
onnx/model.onnx
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:fa56838400193ffc08cda0a7a130abefde5035eb652d1a63a63bb9d7d39321b3
|
||||
size 614872
|
||||
3
onnx/model.onnx_data
Normal file
3
onnx/model.onnx_data
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:84d64606ff97e4abfbac9b03dc713c1fe3de955a66ea9f46c20c6a12d5f0512b
|
||||
size 2266886160
|
||||
4
sentence_bert_config.json
Normal file
4
sentence_bert_config.json
Normal file
@@ -0,0 +1,4 @@
|
||||
{
|
||||
"max_seq_length": 6144,
|
||||
"do_lower_case": false
|
||||
}
|
||||
51
special_tokens_map.json
Normal file
51
special_tokens_map.json
Normal file
@@ -0,0 +1,51 @@
|
||||
{
|
||||
"bos_token": {
|
||||
"content": "<s>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"cls_token": {
|
||||
"content": "<s>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"eos_token": {
|
||||
"content": "</s>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"mask_token": {
|
||||
"content": "<mask>",
|
||||
"lstrip": true,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"pad_token": {
|
||||
"content": "<pad>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"sep_token": {
|
||||
"content": "</s>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"unk_token": {
|
||||
"content": "<unk>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
}
|
||||
}
|
||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:33b81d158d38590c83f2e3b4c3644f5eb63136c14a77f6bdb147913ad33a9bbc
|
||||
size 17083054
|
||||
62
tokenizer_config.json
Normal file
62
tokenizer_config.json
Normal file
@@ -0,0 +1,62 @@
|
||||
{
|
||||
"added_tokens_decoder": {
|
||||
"0": {
|
||||
"content": "<s>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"1": {
|
||||
"content": "<pad>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"2": {
|
||||
"content": "</s>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"3": {
|
||||
"content": "<unk>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"250001": {
|
||||
"content": "<mask>",
|
||||
"lstrip": true,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
}
|
||||
},
|
||||
"bos_token": "<s>",
|
||||
"clean_up_tokenization_spaces": true,
|
||||
"cls_token": "<s>",
|
||||
"eos_token": "</s>",
|
||||
"extra_special_tokens": {},
|
||||
"mask_token": "<mask>",
|
||||
"max_length": 512,
|
||||
"model_max_length": 6144,
|
||||
"pad_to_multiple_of": null,
|
||||
"pad_token": "<pad>",
|
||||
"pad_token_type_id": 0,
|
||||
"padding_side": "right",
|
||||
"sep_token": "</s>",
|
||||
"stride": 0,
|
||||
"tokenizer_class": "XLMRobertaTokenizer",
|
||||
"truncation_side": "right",
|
||||
"truncation_strategy": "longest_first",
|
||||
"unk_token": "<unk>"
|
||||
}
|
||||
Reference in New Issue
Block a user