From 7c5a9bd74b4be5e595dae8bdc6dee8d20956d9fd Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Thu, 14 May 2026 17:04:15 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: telepix/PIXIE-Rune-v1.0 Source: Original Platform --- .gitattributes | 37 +++++ 1_Pooling/config.json | 10 ++ LICENSE | 212 +++++++++++++++++++++++++ README.md | 249 ++++++++++++++++++++++++++++++ config.json | 30 ++++ config_sentence_transformers.json | 14 ++ model.safetensors | 3 + modules.json | 20 +++ onnx/model.onnx | 3 + onnx/model.onnx_data | 3 + sentence_bert_config.json | 4 + special_tokens_map.json | 51 ++++++ tokenizer.json | 3 + tokenizer_config.json | 62 ++++++++ 14 files changed, 701 insertions(+) create mode 100644 .gitattributes create mode 100644 1_Pooling/config.json create mode 100644 LICENSE create mode 100644 README.md create mode 100644 config.json create mode 100644 config_sentence_transformers.json create mode 100644 model.safetensors create mode 100644 modules.json create mode 100644 onnx/model.onnx create mode 100644 onnx/model.onnx_data create mode 100644 sentence_bert_config.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..3fc40c7 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,37 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs 
diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +onnx/model.onnx_data filter=lfs diff=lfs merge=lfs -text diff --git a/1_Pooling/config.json b/1_Pooling/config.json new file mode 100644 index 0000000..0dfd14b --- /dev/null +++ b/1_Pooling/config.json @@ -0,0 +1,10 @@ +{ + "word_embedding_dimension": 1024, + "pooling_mode_cls_token": true, + "pooling_mode_mean_tokens": false, + "pooling_mode_max_tokens": false, + "pooling_mode_mean_sqrt_len_tokens": false, + "pooling_mode_weightedmean_tokens": false, + "pooling_mode_lasttoken": false, + "include_prompt": true +} \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..17e583d --- /dev/null +++ b/LICENSE @@ -0,0 +1,212 @@ +Copyright (c) 2026 TelePIX Co., Ltd. All rights reserved. 
+ +Built with snowflake-arctic-embed-l-v2.0 — original model by Snowflake, licensed under the Apache License 2.0. + +Unless otherwise stated, all files in this repository (including modified model weights +and tokenizer files) are distributed under the terms of the Apache License, Version 2.0 +(the "License"). You may obtain a copy of the License at: + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under +the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +ANY KIND, either express or implied. See the License for the specific language governing +permissions and limitations under the License. + +================================================================================ +NOTICE (Apache-2.0 §4 d) +================================================================================ + +This product is built with snowflake-arctic-embed-l-v2.0 developed by Snowflake. +(https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0) and distributed under the Apache License 2.0. + +See the upstream snowflake-arctic-embed-l-v2.0 for additional attribution details. + +================================================================================ +TRADEMARK +================================================================================ + +"TelePIX" and associated logos are trademarks of TelePIX Co., Ltd. +This License does not grant permission to use these trademarks without prior +written consent. + +================================================================================ +APACHE LICENSE 2.0 +================================================================================ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..c6b9605 --- /dev/null +++ b/README.md @@ -0,0 +1,249 @@ +--- +tags: +- sentence-transformers +- sentence-similarity +- dense-encoder +- dense +- feature-extraction +- retrieval +- multimodal +- multi-modal +- crossmodal +- cross-modal +- aerospace +- telepix +language: +- af +- ar +- az +- be +- bg +- bn +- ca +- ceb +- cs +- cy +- da +- de +- el +- en +- es +- et +- eu +- fa +- fi +- fr +- gl +- gu +- he +- hi +- hr +- ht +- hu +- hy +- id +- is +- it +- ja +- jv +- ka +- kk +- km +- kn +- ko +- ky +- lo +- lt +- lv +- mk +- ml +- mn +- mr +- ms +- my +- ne +- nl +- pa +- pl +- pt +- qu +- ro +- ru +- si +- sk +- sl +- so +- sq +- sr +- sv +- sw +- ta +- te +- th +- tl +- tr +- uk +- ur +- vi +- yo +- zh +pipeline_tag: feature-extraction +library_name: sentence-transformers +license: apache-2.0 +--- +

+ +

+ +# PIXIE-Rune-v1.0 +**PIXIE-Rune-v1.0** is an encoder-based embedding model trained on Korean and English information retrieval datasets, +developed by [TelePIX Co., Ltd](https://telepix.net/). +**PIXIE** stands for Tele**PIX** **I**ntelligent **E**mbedding, representing TelePIX’s high-performance embedding technology. +This model is specifically optimized for semantic retrieval tasks in Korean and English, and demonstrates strong performance in the aerospace domain. Through extensive fine-tuning and domain-specific evaluation, PIXIE shows robust retrieval quality for real-world use cases such as document understanding, technical QA, and semantic search in aerospace and related high-precision fields. +It also performs competitively across a wide range of open-domain Korean and English retrieval benchmarks, making it a versatile foundation for multilingual semantic search systems. + + +## Model Description +- **Model Type:** Sentence Transformer + +- **Maximum Sequence Length:** 6144 tokens +- **Output Dimensionality:** 1024 dimensions +- **Similarity Function:** Cosine Similarity +- **Language:** Multilingual — optimized for high performance in Korean and English +- **Domain Specialization:** Aerospace Information Retrieval +- **License:** apache-2.0 + +### Full Model Architecture + +``` +SentenceTransformer( + (0): Transformer({'max_seq_length': 6144, 'do_lower_case': False}) with Transformer model: XLMRobertaModel + (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True}) + (2): Normalize() +) +``` + +## Quality Benchmarks +**PIXIE-Rune-v1.0** is a multilingual embedding model specialized for Korean and English retrieval tasks. 
+It delivers consistently strong performance across a diverse set of domain-specific and open-domain benchmarks in both languages, demonstrating its effectiveness in real-world semantic search applications. +The table below presents the retrieval performance of several embedding models evaluated on a variety of Korean and English benchmarks. +We report **Normalized Discounted Cumulative Gain (nDCG@10)** scores, which measure how well a ranked list of documents aligns with ground truth relevance. Higher values indicate better retrieval quality. + +All evaluations were conducted using the open-source **[Korean-MTEB-Retrieval-Evaluators](https://github.com/BM-K/Korean-MTEB-Retrieval-Evaluators)** codebase to ensure consistent dataset handling, indexing, retrieval, and nDCG@10 computation across models. + +### Benchmark Overview and Dataset Descriptions +| Model Name | # params | STELLA (XL) | MTEB (ko) | BEIR (en) | +|------|:---:|:---:|:---:|:---:| +| **telepix/PIXIE-Rune-v1.0** | **0.5B** | **0.6345** | **0.7603** | **0.5872** | +| | | | | | +| nvidia/llama-embed-nemotron-8b | 8B | 0.7181 | 0.7813 | 0.6935 | +| Qwen/Qwen3-Embedding-8B | 8B | 0.6154 | 0.7839 | 0.6701 | +| Snowflake/snowflake-arctic-embed-l-v2.0 | 0.5B | 0.5448 | 0.7390 | 0.6006 | +| BAAI/bge-m3 | 0.5B | 0.5056 | 0.7483 | 0.5573 | +| Qwen/Qwen3-Embedding-0.6B | 0.6B | 0.4707 | 0.7017 | 0.5839 | +| Octen/Octen-Embedding-0.6B | 0.6B | 0.4683 | 0.7057 | 0.5769 | +| Salesforce/SFR-Embedding-Mistral | 7B | 0.4579 | N/A | N/A | +| Alibaba-NLP/gte-multilingual-base | 0.3B | 0.4097 | 0.7084 | 0.5746 | +| intfloat/multilingual-e5-large-instruct | 0.6B | 0.2384 | 0.7050 | N/A | +| jinaai/jina-embeddings-v3 | 0.5B | N/A | 0.7088 | 0.4861 | +| openai/text-embedding-3-large | N/A | N/A | 0.6646 | N/A | + +To better interpret the evaluation results above, we briefly describe the characteristics and evaluation intent of each benchmark suite used in this comparison. 
+Each benchmark is designed to assess different aspects of retrieval capability, ranging from domain-specific technical understanding to open-domain and multilingual generalization. + +#### STELLA +[STELLA](https://arxiv.org/abs/2601.03496) is an aerospace-domain Information Retrieval (IR) benchmark constructed from NASA Technical Reports Server (NTRS) documents. It is designed to evaluate both: + +- **Lexical matching** ability (does the retriever benefit from exact technical terms? | TCQ) +- **Semantic matching** ability (can the retriever match concepts even when technical terms are not explicitly used? | TAQ) + +STELLA provides **dual-type synthetic queries** and a **cross-lingual extension** for multilingual evaluation while keeping the corpus in English. + +#### 6 Datasets of MTEB (Korean) +Descriptions of the benchmark datasets used for evaluation are as follows: +- **Ko-StrategyQA** + A Korean multi-hop open-domain question answering dataset designed for complex reasoning over multiple documents. +- **AutoRAGRetrieval** + A domain-diverse retrieval dataset covering finance, government, healthcare, legal, and e-commerce sectors. +- **MIRACLRetrieval** + A document retrieval benchmark built on Korean Wikipedia articles. +- **PublicHealthQA** + A retrieval dataset focused on medical and public health topics. +- **BelebeleRetrieval** + A dataset for retrieving relevant content from web and news articles in Korean. +- **MultiLongDocRetrieval** + A long-document retrieval benchmark based on Korean Wikipedia and the mC4 corpus. + +#### 7 Datasets of BEIR (English) +Descriptions of the benchmark datasets used for evaluation are as follows: +- **ArguAna** + A dataset for argument retrieval based on claim-counterclaim pairs from online debate forums. +- **FEVER** + A fact verification dataset using Wikipedia for evidence-based claim validation. +- **FiQA-2018** + A retrieval benchmark tailored to the finance domain with real-world questions and answers. 
+- **HotpotQA** + A multi-hop open-domain QA dataset requiring reasoning across multiple documents. +- **MSMARCO** + A large-scale benchmark using real Bing search queries and corresponding web documents. +- **NQ** + A Google QA dataset where user questions are answered using Wikipedia articles. +- **SCIDOCS** + A citation-based document retrieval dataset focused on scientific papers. + +## Direct Use (Semantic Search) + +```python +from sentence_transformers import SentenceTransformer + +# Load the model +model_name = 'telepix/PIXIE-Rune-v1.0' +model = SentenceTransformer(model_name) + +# Define the queries and documents +queries = [ + "텔레픽스는 어떤 산업 분야에서 위성 데이터를 활용하나요?", + "국방 분야에 어떤 위성 서비스가 제공되나요?", + "텔레픽스의 기술 수준은 어느 정도인가요?", +] +documents = [ + "텔레픽스는 해양, 자원, 농업 등 다양한 분야에서 위성 데이터를 분석하여 서비스를 제공합니다.", + "정찰 및 감시 목적의 위성 영상을 통해 국방 관련 정밀 분석 서비스를 제공합니다.", + "TelePIX의 광학 탑재체 및 AI 분석 기술은 Global standard를 상회하는 수준으로 평가받고 있습니다.", + "텔레픽스는 우주에서 수집한 정보를 분석하여 '우주 경제(Space Economy)'라는 새로운 가치를 창출하고 있습니다.", + "텔레픽스는 위성 영상 획득부터 분석, 서비스 제공까지 전 주기를 아우르는 솔루션을 제공합니다.", +] + +# Compute embeddings: use `prompt_name="query"` to encode queries! +query_embeddings = model.encode(queries, prompt_name="query") +document_embeddings = model.encode(documents) + +# Compute cosine similarity scores +scores = model.similarity(query_embeddings, document_embeddings) + +# Output the results +for query, query_scores in zip(queries, scores): + doc_score_pairs = list(zip(documents, query_scores)) + doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True) + print("Query:", query) + for document, score in doc_score_pairs: + print(score, document) + +``` + +## License +The PIXIE-Rune-v1.0 model is licensed under Apache License 2.0. 
+ +## Citation +``` +@misc{TelePIX-PIXIE-Rune-v1.0, + title={PIXIE-Rune-v1.0}, + author={TelePIX AI Research Team and Bongmin Kim}, + year={2026}, + url={https://huggingface.co/telepix/PIXIE-Rune-v1.0} +} +``` + +## Contact + +If you have any suggestions or questions about PIXIE, please reach out to the authors at bmkim@telepix.net. \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..c0ac75d --- /dev/null +++ b/config.json @@ -0,0 +1,30 @@ +{ + "architectures": [ + "XLMRobertaModel" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "classifier_dropout": null, + "dtype": "float32", + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "matryoshka_dimensions": [ + 256 + ], + "max_position_embeddings": 8194, + "model_type": "xlm-roberta", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "output_past": true, + "pad_token_id": 1, + "position_embedding_type": "absolute", + "transformers_version": "4.56.2", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 250002 +} diff --git a/config_sentence_transformers.json b/config_sentence_transformers.json new file mode 100644 index 0000000..d397d30 --- /dev/null +++ b/config_sentence_transformers.json @@ -0,0 +1,14 @@ +{ + "__version__": { + "sentence_transformers": "5.1.2", + "transformers": "4.56.2", + "pytorch": "2.8.0+cu128" + }, + "prompts": { + "query": "query: ", + "document": "" + }, + "default_prompt_name": null, + "model_type": "SentenceTransformer", + "similarity_fn_name": "cosine" +} \ No newline at end of file diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..8f82646 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eef21979fce2049a10048bddc7ac464b1d4617b04f82359cea2632350566d5ea +size 2271064456 diff 
--git a/modules.json b/modules.json new file mode 100644 index 0000000..952a9b8 --- /dev/null +++ b/modules.json @@ -0,0 +1,20 @@ +[ + { + "idx": 0, + "name": "0", + "path": "", + "type": "sentence_transformers.models.Transformer" + }, + { + "idx": 1, + "name": "1", + "path": "1_Pooling", + "type": "sentence_transformers.models.Pooling" + }, + { + "idx": 2, + "name": "2", + "path": "2_Normalize", + "type": "sentence_transformers.models.Normalize" + } +] \ No newline at end of file diff --git a/onnx/model.onnx b/onnx/model.onnx new file mode 100644 index 0000000..1490b0e --- /dev/null +++ b/onnx/model.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa56838400193ffc08cda0a7a130abefde5035eb652d1a63a63bb9d7d39321b3 +size 614872 diff --git a/onnx/model.onnx_data b/onnx/model.onnx_data new file mode 100644 index 0000000..0cc30d2 --- /dev/null +++ b/onnx/model.onnx_data @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84d64606ff97e4abfbac9b03dc713c1fe3de955a66ea9f46c20c6a12d5f0512b +size 2266886160 diff --git a/sentence_bert_config.json b/sentence_bert_config.json new file mode 100644 index 0000000..736faeb --- /dev/null +++ b/sentence_bert_config.json @@ -0,0 +1,4 @@ +{ + "max_seq_length": 6144, + "do_lower_case": false +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..b1879d7 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,51 @@ +{ + "bos_token": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "mask_token": { + "content": "<mask>", + "lstrip": true, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<pad>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..33d69e5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33b81d158d38590c83f2e3b4c3644f5eb63136c14a77f6bdb147913ad33a9bbc +size 17083054 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..e2a2c7c --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,62 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<pad>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250001": { + "content": "<mask>", + "lstrip": true, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<s>", + "clean_up_tokenization_spaces": true, + "cls_token": "<s>", + "eos_token": "</s>", + "extra_special_tokens": {}, + "mask_token": "<mask>", + "max_length": 512, + "model_max_length": 6144, + "pad_to_multiple_of": null, + "pad_token": "<pad>", + "pad_token_type_id": 0, + "padding_side": "right", + "sep_token": "</s>", + "stride": 0, + "tokenizer_class": "XLMRobertaTokenizer", + "truncation_side": "right", + "truncation_strategy": "longest_first", + "unk_token": "<unk>" +}