初始化项目,由ModelHub XC社区提供模型
Model: llm-semantic-router/aegis-embed Source: Original Platform
This commit is contained in:
40
.gitattributes
vendored
Normal file
40
.gitattributes
vendored
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||||
|
onnx/layer-11/model.onnx.data filter=lfs diff=lfs merge=lfs -text
|
||||||
|
onnx/layer-16/model.onnx.data filter=lfs diff=lfs merge=lfs -text
|
||||||
|
onnx/layer-22/model.onnx.data filter=lfs diff=lfs merge=lfs -text
|
||||||
|
onnx/layer-6/model.onnx.data filter=lfs diff=lfs merge=lfs -text
|
||||||
10
1_Pooling/config.json
Normal file
10
1_Pooling/config.json
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
{
|
||||||
|
"word_embedding_dimension": 768,
|
||||||
|
"pooling_mode_cls_token": false,
|
||||||
|
"pooling_mode_mean_tokens": true,
|
||||||
|
"pooling_mode_max_tokens": false,
|
||||||
|
"pooling_mode_mean_sqrt_len_tokens": false,
|
||||||
|
"pooling_mode_weightedmean_tokens": false,
|
||||||
|
"pooling_mode_lasttoken": false,
|
||||||
|
"include_prompt": true
|
||||||
|
}
|
||||||
308
README.md
Normal file
308
README.md
Normal file
@@ -0,0 +1,308 @@
|
|||||||
|
---
|
||||||
|
library_name: sentence-transformers
|
||||||
|
tags:
|
||||||
|
- sentence-transformers
|
||||||
|
- sentence-similarity
|
||||||
|
- feature-extraction
|
||||||
|
- embeddings
|
||||||
|
- multilingual
|
||||||
|
- matryoshka
|
||||||
|
- 2d-matryoshka
|
||||||
|
- long-context
|
||||||
|
- modernbert
|
||||||
|
- retrieval
|
||||||
|
- rag
|
||||||
|
- agents
|
||||||
|
- routing
|
||||||
|
- memory
|
||||||
|
base_model: llm-semantic-router/mmbert-32k-yarn
|
||||||
|
datasets:
|
||||||
|
- BAAI/bge-m3-data
|
||||||
|
language:
|
||||||
|
- multilingual
|
||||||
|
license: apache-2.0
|
||||||
|
pipeline_tag: sentence-similarity
|
||||||
|
model-index:
|
||||||
|
- name: aegis-embed
|
||||||
|
results:
|
||||||
|
- task:
|
||||||
|
type: STS
|
||||||
|
dataset:
|
||||||
|
name: STS Benchmark
|
||||||
|
type: mteb/stsbenchmark-sts
|
||||||
|
metrics:
|
||||||
|
- type: spearman
|
||||||
|
value: 80.5
|
||||||
|
---
|
||||||
|
|
||||||
|
# aegis-embed
|
||||||
|
|
||||||
|
`aegis-embed` is a **multilingual long-context embedding model purpose-built for agent-native retrieval, memory, and decision workflows**.
|
||||||
|
|
||||||
|
It is designed for systems where embeddings sit on the semantic hot path rather than at the edge of the stack: **memory lookup, knowledge retrieval, tool matching, task routing, long-horizon recall, clustering, and multilingual indexing**. Its value is not just a benchmark score, but a practical operating profile that fits real agent runtimes: **32K context**, **2D Matryoshka adaptability across dimensions and layers**, **307M-class deployability**, and **strong latency-quality efficiency under repeated inference**.
|
||||||
|
|
||||||
|
In short, `aegis-embed` is built for teams that want one embedding space to support **fast routing, scalable retrieval, and high-confidence semantic matching** without paying the operational cost of a much larger model.
|
||||||
|
|
||||||
|
## Why it fits agentic workloads
|
||||||
|
|
||||||
|
Agentic systems do not call embeddings once. They call them **everywhere**: before retrieval, during routing, when matching tools, when searching memory, and while compressing or re-ranking state. That means a useful agent embedding model must be more than accurate — it must also be flexible under tight runtime budgets.
|
||||||
|
|
||||||
|
`aegis-embed` is designed around that reality.
|
||||||
|
|
||||||
|
### 1. One model, many budget tiers
|
||||||
|
|
||||||
|
This model supports **Matryoshka embeddings**, which means you can encode once at full size, truncate to a smaller dimension, and re-normalize the truncated vectors, with limited quality loss.
|
||||||
|
|
||||||
|
That is especially useful for agent systems because different stages of the stack often need different budgets:
|
||||||
|
|
||||||
|
- **64d** for very cheap candidate generation, broad routing, or huge memory banks
|
||||||
|
- **256d** for balanced retrieval over large corpora
|
||||||
|
- **768d** for highest-quality retrieval, offline indexing, or final-stage matching
|
||||||
|
|
||||||
|
Instead of managing separate embedding models for each tier, you can keep **one semantic space** and choose the dimensional budget that matches the task.
|
||||||
|
|
||||||
|
### 2. 2D Matryoshka gives runtime flexibility, not just storage savings
|
||||||
|
|
||||||
|
The model is trained with **2D Matryoshka** behavior:
|
||||||
|
|
||||||
|
- **dimension reduction** for smaller vectors and lower storage / bandwidth cost
|
||||||
|
- **layer reduction** for lower-latency inference paths in custom runtimes
|
||||||
|
|
||||||
|
This matters for agents because the same system often mixes:
|
||||||
|
|
||||||
|
- latency-sensitive routing decisions
|
||||||
|
- high-volume memory scans
|
||||||
|
- higher-quality retrieval for final evidence gathering
|
||||||
|
|
||||||
|
A single model that can serve multiple latency / quality profiles is much easier to operate than a stack of unrelated specialized encoders.
|
||||||
|
|
||||||
|
### 3. Long context helps when agent state is not naturally short
|
||||||
|
|
||||||
|
Many agent workloads are not short isolated queries. They involve:
|
||||||
|
|
||||||
|
- tool descriptions
|
||||||
|
- execution traces
|
||||||
|
- long notes
|
||||||
|
- merged memory summaries
|
||||||
|
- multi-hop research snippets
|
||||||
|
- large document chunks
|
||||||
|
|
||||||
|
With **32,768 tokens** of context length, `aegis-embed` can represent larger semantic units before you are forced into aggressive chunking. That helps preserve cross-section meaning in long documents and richer memory entries.
|
||||||
|
|
||||||
|
### 4. Small enough to be operationally practical
|
||||||
|
|
||||||
|
At roughly **307M parameters**, this model sits in a useful middle ground:
|
||||||
|
|
||||||
|
- substantially lighter than large embedding models in the 600M+ or multi-billion range
|
||||||
|
- still expressive enough for multilingual retrieval and similarity work
|
||||||
|
- easier to host in systems where embedding is part of a hot path rather than an occasional offline batch
|
||||||
|
|
||||||
|
For agentic platforms, that usually means better economics and simpler scaling.
|
||||||
|
|
||||||
|
### 5. One embedding space across the stack
|
||||||
|
|
||||||
|
Agent systems are easier to operate when **routing, retrieval, memory search, and semantic matching** all live in the same vector space.
|
||||||
|
|
||||||
|
`aegis-embed` is well suited to that pattern:
|
||||||
|
|
||||||
|
- **64d** can serve broad routing and large-memory scanning
|
||||||
|
- **256d** can cover the main retrieval tier
|
||||||
|
- **768d** can stay reserved for the highest-fidelity matching paths
|
||||||
|
|
||||||
|
That means one model can cover multiple semantic stages without forcing the system to juggle incompatible encoders, duplicated indexes, or divergent retrieval behavior.
|
||||||
|
|
||||||
|
## Model at a glance
|
||||||
|
|
||||||
|
| Feature | Value |
|
||||||
|
|---------|-------|
|
||||||
|
| **Parameters** | 307M |
|
||||||
|
| **Architecture** | ModernBERT encoder with YaRN scaling |
|
||||||
|
| **Hidden Size** | 768 |
|
||||||
|
| **Layers** | 22 |
|
||||||
|
| **Context Length** | 32,768 tokens |
|
||||||
|
| **Pooling** | Mean pooling |
|
||||||
|
| **Similarity** | Cosine |
|
||||||
|
| **Languages** | Multilingual |
|
||||||
|
| **Matryoshka Dimensions** | 768, 512, 256, 128, 64 |
|
||||||
|
|
||||||
|
## Headline results
|
||||||
|
|
||||||
|
| Metric | Score |
|
||||||
|
|--------|-------|
|
||||||
|
| **MTEB Mean (24 tasks)** | **61.4** |
|
||||||
|
| **STS Benchmark** | **80.5** |
|
||||||
|
| **Dimension Retention** (STS score relative to 768d) | **99% @ 256d**, **98% @ 64d** |
|
||||||
|
| **Layer Speedup** (throughput relative to full 22L) | **3.3× @ 6L**, **5.8× @ 3L** |
|
||||||
|
| **Latency vs BGE-M3** | **1.6-3.1× faster** on longer sequences / larger batches |
|
||||||
|
|
||||||
|
These numbers make the model particularly attractive for systems that must balance **quality, latency, vector size, and deployment simplicity** instead of optimizing only for leaderboard peak score.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Basic usage with Sentence Transformers
|
||||||
|
|
||||||
|
```python
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
|
model = SentenceTransformer("/path/to/aegis-embed")
|
||||||
|
|
||||||
|
texts = [
|
||||||
|
"Find tool descriptions related to browser automation.",
|
||||||
|
"检索和用户历史偏好相关的记忆。",
|
||||||
|
"Retrieve notes about deployment failures in staging.",
|
||||||
|
]
|
||||||
|
|
||||||
|
embeddings = model.encode(texts)
|
||||||
|
print(embeddings.shape) # (3, 768)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Matryoshka truncation for smaller vectors
|
||||||
|
|
||||||
|
```python
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
|
model = SentenceTransformer("/path/to/aegis-embed")
|
||||||
|
embeddings = model.encode(texts, convert_to_tensor=True)  # `texts`: the list of strings from the basic usage example
|
||||||
|
|
||||||
|
# Balanced retrieval tier
|
||||||
|
embeddings_256d = F.normalize(embeddings[:, :256], p=2, dim=1)
|
||||||
|
|
||||||
|
# Ultra-cheap routing / large memory-bank tier
|
||||||
|
embeddings_64d = F.normalize(embeddings[:, :64], p=2, dim=1)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Long-context encoding
|
||||||
|
|
||||||
|
```python
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
|
model = SentenceTransformer("/path/to/aegis-embed")
|
||||||
|
model.max_seq_length = 8192 # can be increased up to 32768
|
||||||
|
|
||||||
|
long_note = "..."
|
||||||
|
embedding = model.encode(long_note)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Why Matryoshka matters for agents
|
||||||
|
|
||||||
|
A common agent stack has several retrieval-like stages:
|
||||||
|
|
||||||
|
1. **broad candidate fetch** over a very large store
|
||||||
|
2. **narrower semantic lookup** over a smaller candidate set
|
||||||
|
3. **high-confidence final matching** before action or answer synthesis
|
||||||
|
|
||||||
|
Matryoshka lets one model support all three stages:
|
||||||
|
|
||||||
|
| Stage | Suggested Dim | Why |
|
||||||
|
|------|---------------|-----|
|
||||||
|
| Broad routing / candidate generation | 64d | Maximize speed and minimize storage |
|
||||||
|
| Main retrieval | 256d | Strong balance of quality and cost |
|
||||||
|
| Final matching / offline indexing | 768d | Best semantic fidelity |
|
||||||
|
|
||||||
|
That is often a better operational story than mixing several incompatible embedding models across the same pipeline.
|
||||||
|
|
||||||
|
## Evaluation details
|
||||||
|
|
||||||
|
### MTEB benchmark (24 tasks)
|
||||||
|
|
||||||
|
| Category | Score |
|
||||||
|
|----------|-------|
|
||||||
|
| STS (7 tasks) | **79.3** |
|
||||||
|
| Classification (6) | 62.4 |
|
||||||
|
| Pair Classification (2) | 76.2 |
|
||||||
|
| Reranking (2) | 64.4 |
|
||||||
|
| Clustering (4) | 36.9 |
|
||||||
|
| Retrieval (3) | 38.2 |
|
||||||
|
| **Overall Mean** | **61.4** |
|
||||||
|
|
||||||
|
### STS benchmark comparison
|
||||||
|
|
||||||
|
| Model | Parameters | STS Score |
|
||||||
|
|-------|------------|-----------|
|
||||||
|
| Qwen3-Embedding-0.6B | 600M | 76.17 |
|
||||||
|
| **aegis-embed** | **307M** | **80.5** |
|
||||||
|
| Qwen3-Embedding-8B | 8B | 81.08 |
|
||||||
|
|
||||||
|
### 2D Matryoshka quality matrix (STS)
|
||||||
|
|
||||||
|
| Layers | 768d | 256d | 64d |
|
||||||
|
|--------|------|------|-----|
|
||||||
|
| 22L | **80.5** | 79.9 | 78.5 |
|
||||||
|
| 11L | 53.7 | 48.0 | 44.4 |
|
||||||
|
| 6L | 45.2 | 45.2 | 43.5 |
|
||||||
|
| 3L | 44.0 | 44.1 | 41.8 |
|
||||||
|
|
||||||
|
### Long-context retrieval (4K tokens)
|
||||||
|
|
||||||
|
| Metric | Score |
|
||||||
|
|--------|-------|
|
||||||
|
| R@1 | 68.8% |
|
||||||
|
| R@10 | 81.2% |
|
||||||
|
| MRR | 71.9% |
|
||||||
|
|
||||||
|
### Throughput (AMD MI300X)
|
||||||
|
|
||||||
|
| Layers | Throughput | Speedup |
|
||||||
|
|--------|------------|---------|
|
||||||
|
| 22L | 477/s | 1.0× |
|
||||||
|
| 11L | 916/s | 1.9× |
|
||||||
|
| 6L | 1573/s | 3.3× |
|
||||||
|
| 3L | 2761/s | 5.8× |
|
||||||
|
|
||||||
|
## Training
|
||||||
|
|
||||||
|
### Data
|
||||||
|
|
||||||
|
Trained on [BAAI/bge-m3-data](https://huggingface.co/datasets/BAAI/bge-m3-data) with multilingual triplets across diverse domains.
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
|
||||||
|
- **Base model**: [llm-semantic-router/mmbert-32k-yarn](https://huggingface.co/llm-semantic-router/mmbert-32k-yarn)
|
||||||
|
- **Loss**: `Matryoshka2dLoss` (combines adaptive layer loss and Matryoshka loss)
|
||||||
|
- **Matryoshka dimensions**: `[768, 512, 256, 128, 64]`
|
||||||
|
- **Max sequence length**: `32768`
|
||||||
|
- **Batch size**: `16` (effective `32` with gradient accumulation)
|
||||||
|
- **Learning rate**: `2e-5`
|
||||||
|
- **Hardware**: AMD Instinct MI300X
|
||||||
|
|
||||||
|
## Recommended use cases
|
||||||
|
|
||||||
|
`aegis-embed` is especially well suited for:
|
||||||
|
|
||||||
|
- **Agent memory retrieval** across long, mixed-format notes or histories
|
||||||
|
- **Tool and skill selection** where descriptions need semantic matching
|
||||||
|
- **Knowledge-base retrieval** for assistants and RAG systems
|
||||||
|
- **Multilingual search** across mixed-language corpora
|
||||||
|
- **Large memory banks** that benefit from 64d / 256d vector tiers
|
||||||
|
- **Long-document semantic indexing** where short-context encoders lose structure
|
||||||
|
|
||||||
|
## Model lineage and packaging
|
||||||
|
|
||||||
|
`aegis-embed` is derived from `llm-semantic-router/mmbert-embed-32k-2d-matryoshka` and distributed here as a lean Sentence Transformers / PyTorch package.
|
||||||
|
|
||||||
|
This build intentionally omits bundled ONNX artifacts so the model remains smaller and easier to move, mirror, cache, and deploy in environments that primarily rely on native Transformers runtimes.
|
||||||
|
|
||||||
|
## Limitations
|
||||||
|
|
||||||
|
- Full-quality mode is still the best default for important retrieval decisions; aggressive layer reduction trades away quality.
|
||||||
|
- Although the model supports up to 32K tokens, very long inputs still increase compute and memory cost.
|
||||||
|
- The model is optimized for retrieval and semantic similarity; some downstream tasks may benefit from task-specific fine-tuning.
|
||||||
|
- If your deployment stack requires ONNX out of the box, you will need to export that separately.
|
||||||
|
|
||||||
|
## Citation
|
||||||
|
|
||||||
|
If you use this model, please cite the upstream work it is derived from:
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
@misc{mmbert-embed-2d-matryoshka,
|
||||||
|
title={mmBERT-Embed: Multilingual Embedding Model with 2D Matryoshka Training},
|
||||||
|
author={vLLM Semantic Router Team},
|
||||||
|
year={2025},
|
||||||
|
url={https://huggingface.co/llm-semantic-router/mmbert-embed-32k-2d-matryoshka}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
Apache 2.0
|
||||||
46
config.json
Normal file
46
config.json
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
{
|
||||||
|
"architectures": [
|
||||||
|
"ModernBertModel"
|
||||||
|
],
|
||||||
|
"attention_bias": false,
|
||||||
|
"attention_dropout": 0.0,
|
||||||
|
"bos_token_id": 2,
|
||||||
|
"classifier_activation": "gelu",
|
||||||
|
"classifier_bias": false,
|
||||||
|
"classifier_dropout": 0.0,
|
||||||
|
"classifier_pooling": "mean",
|
||||||
|
"cls_token_id": 1,
|
||||||
|
"decoder_bias": true,
|
||||||
|
"deterministic_flash_attn": false,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"embedding_dropout": 0.0,
|
||||||
|
"eos_token_id": 1,
|
||||||
|
"global_attn_every_n_layers": 3,
|
||||||
|
"global_rope_theta": 160000,
|
||||||
|
"gradient_checkpointing": false,
|
||||||
|
"hidden_activation": "gelu",
|
||||||
|
"hidden_size": 768,
|
||||||
|
"initializer_cutoff_factor": 2.0,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 1152,
|
||||||
|
"layer_norm_eps": 1e-05,
|
||||||
|
"local_attention": 128,
|
||||||
|
"local_rope_theta": 160000,
|
||||||
|
"mask_token_id": 4,
|
||||||
|
"max_position_embeddings": 32768,
|
||||||
|
"mlp_bias": false,
|
||||||
|
"mlp_dropout": 0.0,
|
||||||
|
"model_type": "modernbert",
|
||||||
|
"norm_bias": false,
|
||||||
|
"norm_eps": 1e-05,
|
||||||
|
"num_attention_heads": 12,
|
||||||
|
"num_hidden_layers": 22,
|
||||||
|
"pad_token_id": 0,
|
||||||
|
"position_embedding_type": "sans_pos",
|
||||||
|
"repad_logits_with_grad": false,
|
||||||
|
"sep_token_id": 1,
|
||||||
|
"sparse_pred_ignore_index": -100,
|
||||||
|
"sparse_prediction": false,
|
||||||
|
"transformers_version": "4.57.6",
|
||||||
|
"vocab_size": 256000
|
||||||
|
}
|
||||||
14
config_sentence_transformers.json
Normal file
14
config_sentence_transformers.json
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
{
|
||||||
|
"model_type": "SentenceTransformer",
|
||||||
|
"__version__": {
|
||||||
|
"sentence_transformers": "5.3.0.dev0",
|
||||||
|
"transformers": "4.57.6",
|
||||||
|
"pytorch": "2.9.1+git8907517"
|
||||||
|
},
|
||||||
|
"prompts": {
|
||||||
|
"query": "",
|
||||||
|
"document": ""
|
||||||
|
},
|
||||||
|
"default_prompt_name": null,
|
||||||
|
"similarity_fn_name": "cosine"
|
||||||
|
}
|
||||||
3
model.safetensors
Normal file
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:173eb71beab911ddccf5c23d46129890894f1b741bedbf15e6d0f46084da3391
|
||||||
|
size 613892480
|
||||||
14
modules.json
Normal file
14
modules.json
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"idx": 0,
|
||||||
|
"name": "0",
|
||||||
|
"path": "",
|
||||||
|
"type": "sentence_transformers.models.Transformer"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"idx": 1,
|
||||||
|
"name": "1",
|
||||||
|
"path": "1_Pooling",
|
||||||
|
"type": "sentence_transformers.models.Pooling"
|
||||||
|
}
|
||||||
|
]
|
||||||
4
sentence_bert_config.json
Normal file
4
sentence_bert_config.json
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
{
|
||||||
|
"max_seq_length": 32768,
|
||||||
|
"do_lower_case": false
|
||||||
|
}
|
||||||
55
special_tokens_map.json
Normal file
55
special_tokens_map.json
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
{
|
||||||
|
"additional_special_tokens": [
|
||||||
|
"<start_of_turn>",
|
||||||
|
"<end_of_turn>"
|
||||||
|
],
|
||||||
|
"bos_token": {
|
||||||
|
"content": "<bos>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"cls_token": {
|
||||||
|
"content": "<bos>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"eos_token": {
|
||||||
|
"content": "<eos>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"mask_token": {
|
||||||
|
"content": "<mask>",
|
||||||
|
"lstrip": true,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"pad_token": {
|
||||||
|
"content": "<pad>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"sep_token": {
|
||||||
|
"content": "<eos>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"unk_token": {
|
||||||
|
"content": "<unk>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
}
|
||||||
|
}
|
||||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:5b14c7584d507951e1723f53f4e82cc76db81b7c0df3dc3c48bed45954b0277c
|
||||||
|
size 34363443
|
||||||
2018
tokenizer_config.json
Normal file
2018
tokenizer_config.json
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user