初始化项目,由ModelHub XC社区提供模型

Model: BAAI/bge-code-v1
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-05-14 14:33:23 +08:00
commit b631246acd
17 changed files with 152581 additions and 0 deletions

36
.gitattributes vendored Normal file
View File

@@ -0,0 +1,36 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
tokenizer.json filter=lfs diff=lfs merge=lfs -text

10
1_Pooling/config.json Normal file
View File

@@ -0,0 +1,10 @@
{
"word_embedding_dimension": 1536,
"pooling_mode_cls_token": false,
"pooling_mode_mean_tokens": false,
"pooling_mode_max_tokens": false,
"pooling_mode_mean_sqrt_len_tokens": false,
"pooling_mode_weightedmean_tokens": false,
"pooling_mode_lasttoken": true,
"include_prompt": true
}

213
README.md Normal file
View File

@@ -0,0 +1,213 @@
---
language:
- zh
- en
tags:
- sentence-transformers
- sentence-similarity
- feature-extraction
- transformers
pipeline_tag: sentence-similarity
library_name: sentence-transformers
license: apache-2.0
---
<h1 align="center">FlagEmbedding</h1>
For more details, please refer to our GitHub repository: [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding).
**BGE-Code-v1** is an LLM-based code embedding model that supports code retrieval, text retrieval, and multilingual retrieval. It primarily demonstrates the following capabilities:
- Superior Code Retrieval Performance: The model demonstrates exceptional code retrieval capabilities, supporting natural language queries in both English and Chinese, as well as 20 programming languages.
- Robust Text Retrieval Capabilities: The model maintains strong text retrieval capabilities comparable to text embedding models of similar scale.
- Extensive Multilingual Support: BGE-Code-v1 offers comprehensive multilingual retrieval capabilities, excelling in languages such as English, Chinese, Japanese, French, and more.
## Usage
### Using FlagEmbedding
```
git clone https://github.com/FlagOpen/FlagEmbedding.git
cd FlagEmbedding
pip install -e .
```
```python
from FlagEmbedding import FlagLLMModel
queries = [
"Delete the record with ID 4 from the 'Staff' table.",
'Delete all records in the "Livestock" table where age is greater than 5'
]
documents = [
"DELETE FROM Staff WHERE StaffID = 4;",
"DELETE FROM Livestock WHERE age > 5;"
]
model = FlagLLMModel('BAAI/bge-code-v1',
query_instruction_format="<instruct>{}\n<query>{}",
query_instruction_for_retrieval="Given a question in text, retrieve SQL queries that are appropriate responses to the question.",
trust_remote_code=True,
use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation
embeddings_1 = model.encode_queries(queries)
embeddings_2 = model.encode_corpus(documents)
similarity = embeddings_1 @ embeddings_2.T
print(similarity)
```
By default, FlagLLMModel will use all available GPUs when encoding. Set `os.environ["CUDA_VISIBLE_DEVICES"]` to select specific GPUs. You can also set `os.environ["CUDA_VISIBLE_DEVICES"]=""` to make all GPUs unavailable.
### Using Sentence Transformers
```python
from sentence_transformers import SentenceTransformer
import torch
# Load the model, optionally in float16 precision for faster inference
model = SentenceTransformer(
"BAAI/bge-code-v1",
trust_remote_code=True,
model_kwargs={"torch_dtype": torch.float16},
)
# Prepare a prompt given an instruction
instruction = 'Given a question in text, retrieve SQL queries that are appropriate responses to the question.'
prompt = f'<instruct>{instruction}\n<query>'
# Prepare queries and documents
queries = [
"Delete the record with ID 4 from the 'Staff' table.",
'Delete all records in the "Livestock" table where age is greater than 5'
]
documents = [
"DELETE FROM Staff WHERE StaffID = 4;",
"DELETE FROM Livestock WHERE age > 5;"
]
# Compute the query and document embeddings
query_embeddings = model.encode(queries, prompt=prompt)
document_embeddings = model.encode(documents)
# Compute the cosine similarity between the query and document embeddings
similarities = model.similarity(query_embeddings, document_embeddings)
print(similarities)
```
### Using HuggingFace Transformers
```python
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
def last_token_pool(last_hidden_states: Tensor,
attention_mask: Tensor) -> Tensor:
left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
if left_padding:
return last_hidden_states[:, -1]
else:
sequence_lengths = attention_mask.sum(dim=1) - 1
batch_size = last_hidden_states.shape[0]
return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
def get_detailed_instruct(task_description: str, query: str) -> str:
return f'<instruct>{task_description}\n<query>{query}'
instruction = 'Given a question in text, retrieve SQL queries that are appropriate responses to the question.'
queries = [
"Delete the record with ID 4 from the 'Staff' table.",
'Delete all records in the "Livestock" table where age is greater than 5'
]
documents = [
"DELETE FROM Staff WHERE StaffID = 4;",
"DELETE FROM Livestock WHERE age > 5;"
]
input_texts = queries + documents
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-code-v1', trust_remote_code=True)
model = AutoModel.from_pretrained('BAAI/bge-code-v1', trust_remote_code=True)
model.eval()
max_length = 4096
# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors='pt', pad_to_multiple_of=8)
with torch.no_grad():
outputs = model(**batch_dict)
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
# normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
scores = (embeddings[:2] @ embeddings[2:].T) * 100
print(scores.tolist())
```
## Evaluation
**BGE-Code-v1** achieves state-of-the-art performance on both the CoIR and CodeRAG benchmarks.
- CoIR
| | CodeXEmbed-2B | CodeXEmbed-7B | Voyage-Code-002 | Voyage-Code-003 | BGE-Code-v1 |
|---------------------------------------|---------------|---------------|-----------------|-----------------|-----------|
| **Apps** | 76.86 | 85.38 | 26.52 | 93.62 | 98.08 |
| **CosQA** | 40.47 | 42.47 | 29.79 | 34.45 | 46.72 |
| **Text2SQL** | 78.42 | 78.94 | 69.26 | 62.87 | 64.35 |
| **CSN** | 87.87 | 89.67 | 81.79 | 89.35 | 89.53 |
| **CSN-CCR** | 97.66 | 97.95 | 73.45 | 90.05 | 98.30 |
| **CodeTrans-Contest** | 90.30 | 94.45 | 72.77 | 94.96 | 94.38 |
| **CodeTrans-DL** | 38.57 | 40.46 | 27.48 | 38.57 | 46.13 |
| **StackOverFlow-QA** | 94.47 | 96.33 | 67.68 | 97.17 | 95.35 |
| **CodeFeedBack-ST** | 86.36 | 87.53 | 65.35 | 90.67 | 90.56 |
| **CodeFeedBack-MT** | 65.51 | 68.83 | 28.74 | 93.58 | 94.38 |
| **AVG** | **75.65** | **78.20** | **56.26** | **78.53** | **81.77** |
- CodeRAG
| | HumanEval | MBPP | DS-1000 | ODEX | RepoEval | SWE-bench-Lite | AVG |
| --------------- | ---------- | ---- | ------- | ---- | -------- | -------------- | ---- |
| SFR | 100.0 | 99.0 | 19.3 | 37.1 | 83.8 | 62.7 | **67.0** |
| Jina-v2-code | 100.0 | 97.7 | 26.2 | 19.9 | 90.5 | 58.3 | **65.4** |
| CodeXEmbed-2B | 100.0 | 97.4 | 25.4 | 23.9 | 88.7 | 52.4 | **64.6** |
| Voyage-Code-002 | 100.0 | 99.0 | 33.1 | 26.6 | 94.3 | 29.1 | **63.7** |
| BGE-Code-v1 | 100.0 | 99.2 | 40.9 | 36.1 | 93.1 | 67.4 | **72.8** |
### Instructions for Evaluation
```python
{
"Apps": "Given a code contest problem description, retrieve relevant code that can help solve the problem.",
"CosQA": "Given a web search query, retrieve relevant code that can help answer the query.",
"Text2SQL": "Given a question in text, retrieve SQL queries that are appropriate responses to the question.",
"CSN": "Given a piece of code, retrieve the document string that summarizes the code.",
"CSN-CCR": "Given a piece of code segment, retrieve the code segment that is the latter part of the code.",
"CodeTrans-DL": "Given a piece of code, retrieve code that is semantically equivalent to the input code.",
"CodeTrans-Contest": "Given a piece of Python code, retrieve C++ code that is semantically equivalent to the input code.",
"StackOverFlow-QA": "Given a question that consists of a mix of text and code snippets, retrieve relevant answers that also consist of a mix of text and code snippets, and can help answer the question.",
"CodeFeedBack-ST": "Given a question that consists of a mix of text and code snippets, retrieve relevant answers that also consist of a mix of text and code snippets, and can help answer the question.",
"CodeFeedBack-MT": "Given a multi-turn conversation history that consists of a mix of text and code snippets, retrieve relevant answers that also consist of a mix of text and code snippets, and can help answer the question.",
"HummanEval": "Given a question that consists of a mix of text and code snippets, retrieve relevant answers that also consist of a mix of text and code snippets, and can help answer the question.",
"MBPP": "Given a textual explanation of code functionality, retrieve the corresponding code implementation.",
"DS-1000": "Given a question that consists of a mix of text and code snippets, retrieve relevant answers that also consist of a mix of text and code snippets, and can help answer the question.",
"ODEX": "Given a question, retrieve relevant answers that also consist of a mix of text and code snippets, and can help answer the question.",
"RepoEval": "Given a piece of code segment, retrieve the code segment that is the latter part of the code.",
"SWE-bench-Lite": "Given a code snippet containing a bug and a natural language description of the bug or error, retrieve code snippets that demonstrate solutions or fixes for similar bugs or errors (the desired documents)."
}
```
## Citation
If you find this repository useful, please consider giving it a star :star: and citing our work:
```
@misc{bge_code,
title={Towards A Generalist Code Embedding Model Based On Massive Data Synthesis},
author={Chaofan Li and Jianlyu Chen and Yingxia Shao and Defu Lian and Zheng Liu},
year={2025},
eprint={2505.12697},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2505.12697},
}
```

26
added_tokens.json Normal file
View File

@@ -0,0 +1,26 @@
{
"</tool_call>": 151658,
"<instruct>": 151665,
"<query>": 151666,
"<tool_call>": 151657,
"<|box_end|>": 151649,
"<|box_start|>": 151648,
"<|endoftext|>": 151643,
"<|file_sep|>": 151664,
"<|fim_middle|>": 151660,
"<|fim_pad|>": 151662,
"<|fim_prefix|>": 151659,
"<|fim_suffix|>": 151661,
"<|im_end|>": 151645,
"<|im_start|>": 151644,
"<|image_pad|>": 151655,
"<|object_ref_end|>": 151647,
"<|object_ref_start|>": 151646,
"<|quad_end|>": 151651,
"<|quad_start|>": 151650,
"<|repo_name|>": 151663,
"<|video_pad|>": 151656,
"<|vision_end|>": 151653,
"<|vision_pad|>": 151654,
"<|vision_start|>": 151652
}

29
config.json Normal file
View File

@@ -0,0 +1,29 @@
{
"_name_or_path": "bge-code-v1",
"architectures": [
"Qwen2Model"
],
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151643,
"hidden_act": "silu",
"hidden_size": 1536,
"initializer_range": 0.02,
"intermediate_size": 8960,
"max_position_embeddings": 32768,
"max_window_layers": 28,
"model_type": "qwen2",
"num_attention_heads": 12,
"num_hidden_layers": 28,
"num_key_value_heads": 2,
"rms_norm_eps": 1e-06,
"rope_scaling": null,
"rope_theta": 1000000.0,
"sliding_window": null,
"tie_word_embeddings": true,
"torch_dtype": "float32",
"transformers_version": "4.49.0",
"use_cache": false,
"use_sliding_window": false,
"vocab_size": 151667
}

View File

@@ -0,0 +1,10 @@
{
"__version__": {
"sentence_transformers": "3.4.1",
"transformers": "4.49.0",
"pytorch": "2.5.1+cu124"
},
"prompts": {},
"default_prompt_name": null,
"similarity_fn_name": "cosine"
}

151388
merges.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3f0012a815833b137559c60485e3d087fb029e034b7b5e5c18fa9cabcc3faafc
size 4995016160

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:93d0d51780cad746744bed246b4c13e520e458ba36509c33aff1ed0c9ac16d71
size 1178224504

View File

@@ -0,0 +1,345 @@
{
"metadata": {
"total_size": 6173204480
},
"weight_map": {
"embed_tokens.weight": "model-00001-of-00002.safetensors",
"layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
"layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
"layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
"layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
"layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
"layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
"layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
"layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
"layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
"layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
"layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
"layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
"layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
"layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
"layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
"layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
"layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
"layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
"layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
"layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
"layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
"layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
"layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
"layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
"layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
"layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"norm.weight": "model-00002-of-00002.safetensors"
}
}

20
modules.json Normal file
View File

@@ -0,0 +1,20 @@
[
{
"idx": 0,
"name": "0",
"path": "",
"type": "sentence_transformers.models.Transformer"
},
{
"idx": 1,
"name": "1",
"path": "1_Pooling",
"type": "sentence_transformers.models.Pooling"
},
{
"idx": 2,
"name": "2",
"path": "2_Normalize",
"type": "sentence_transformers.models.Normalize"
}
]

View File

@@ -0,0 +1,4 @@
{
"max_seq_length": 32768,
"do_lower_case": false
}

20
special_tokens_map.json Normal file
View File

@@ -0,0 +1,20 @@
{
"additional_special_tokens": [
"<instruct>",
"<query>"
],
"eos_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

250
tokenization_qwen.py Normal file
View File

@@ -0,0 +1,250 @@
"""
Copied from https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct/blob/main/tokenization_qwen.py
"""
from typing import List, Optional
from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer as OriginalQwen2Tokenizer
from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast as OriginalQwen2TokenizerFast
from tokenizers import processors
# Default on-disk file names for the tokenizer assets, keyed by the name of the
# constructor argument that receives each path.
# NOTE(review): not referenced by the classes defined in this module; presumably
# kept for parity with the upstream file this module was copied from -- confirm.
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
"tokenizer_file": "tokenizer.json",
}
class Qwen2Tokenizer(OriginalQwen2Tokenizer):
    """
    Slow Qwen2 tokenizer (byte-level Byte-Pair-Encoding) that can optionally append EOS.

    Like GPT2Tokenizer, this tokenizer treats spaces as part of the tokens, so a word is
    encoded differently depending on whether it is preceded by a space:

    ```python
    >>> from transformers import Qwen2Tokenizer

    >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
    >>> tokenizer("Hello world")["input_ids"]
    [9707, 1879]

    >>> tokenizer(" Hello world")["input_ids"]
    [21927, 1879]
    ```

    This is expected. Do not use GPT2Tokenizer as a substitute: its pretokenization rules
    are different.

    This tokenizer inherits from [`PreTrainedTokenizer`], which provides most of the main
    methods; refer to that superclass for details about them.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode).
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to
            an ID and is replaced by this token.
        bos_token (`str`, *optional*):
            The beginning-of-sequence token. Not applicable for this tokenizer.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end-of-sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, e.g. when batching sequences of different lengths.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether to clean up spaces added while splitting the input text during
            tokenization. Not applicable to this tokenizer, since tokenization adds no spaces.
        split_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether special tokens should be split during tokenization. By default they are
            kept whole: if `<|endoftext|>` is the `eos_token`, then
            `tokenizer.tokenize("<|endoftext|>")` gives `['<|endoftext|>']`. With
            `split_special_tokens=True` it gives
            `['<', '|', 'endo', 'ft', 'ext', '|', '>']`. Only supported for `slow`
            tokenizers for the moment.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether to append an `eos_token` at the end of each sequence.
    """

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token=None,
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        clean_up_tokenization_spaces=False,
        split_special_tokens=False,
        add_eos_token=False,
        **kwargs,
    ):
        # The add_eos_token handling follows the LlamaTokenizer approach; the flag is
        # stored on the instance before the parent constructor is invoked.
        self.add_eos_token = add_eos_token

        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            split_special_tokens=split_special_tokens,
            add_eos_token=add_eos_token,
            **kwargs,
        )

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Concatenate one or two ID lists, appending the EOS id after each when enabled.

        Args:
            token_ids_0 (`List[int]`):
                IDs of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                IDs of the second sequence, for sequence pairs.

        Returns:
            `List[int]`: `token_ids_0 [+ eos]`, followed by `token_ids_1 [+ eos]` when a
            second sequence is given. The EOS id is only added if `self.add_eos_token` is set.
        """
        eos_suffix = [self.eos_token_id] if self.add_eos_token else []

        if token_ids_1 is None:
            return token_ids_0 + eos_suffix
        return token_ids_0 + eos_suffix + token_ids_1 + eos_suffix

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Build a mask flagging the positions occupied by special tokens.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether the token lists already contain the model's special tokens. When
                `True`, the computation is delegated to the parent class.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0
            for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # One flag per appended EOS; empty when EOS appending is disabled.
        eos_flag = [1] if self.add_eos_token else []

        mask = [0] * len(token_ids_0) + eos_flag
        if token_ids_1 is not None:
            mask = mask + [0] * len(token_ids_1) + eos_flag
        return mask

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create token type IDs for one sequence or a pair of sequences.

        Positions of the first sequence (including its EOS, when `self.add_eos_token` is
        set) get type 0; positions of the second sequence (again including its EOS) get
        type 1:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, only the first portion of the mask (0s) is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to
            the given sequence(s).
        """
        eos_suffix = [self.eos_token_id] if self.add_eos_token else []

        type_ids = [0] * len(token_ids_0 + eos_suffix)
        if token_ids_1 is None:
            return type_ids
        return type_ids + [1] * len(token_ids_1 + eos_suffix)
class Qwen2TokenizerFast(OriginalQwen2TokenizerFast):
    """
    Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    byte-level Byte-Pair-Encoding, with optional appending of the EOS token.

    Same as GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the
    tokens, so a word will be encoded differently whether it is at the beginning of the sentence
    (without space) or not:

    ```python
    >>> from transformers import Qwen2TokenizerFast

    >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
    >>> tokenizer("Hello world")["input_ids"]
    [9707, 1879]

    >>> tokenizer(" Hello world")["input_ids"]
    [21927, 1879]
    ```

    This is expected.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            Path to a [tokenizers](https://github.com/huggingface/tokenizers) file (generally
            has a .json extension) that contains everything needed to load the tokenizer.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an
            ID and is set to be this token instead. Not applicable to this tokenizer.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not applicable for this tokenizer.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences. Can also be changed
            after construction by assigning to the `add_eos_token` property.
    """

    slow_tokenizer_class = Qwen2Tokenizer
    padding_side = "left"

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token=None,
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        add_eos_token=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            **kwargs,
        )

        # Written to the backing field directly; the post-processor is rebuilt
        # explicitly on the next line, once the parent constructor has finished.
        self._add_eos_token = add_eos_token
        self.update_post_processor()

    def update_post_processor(self):
        """
        Updates the underlying post processor with the current `eos_token`.

        Installs a `TemplateProcessing` post-processor whose single/pair templates append
        the EOS token after each sequence when `add_eos_token` is enabled, and leave the
        sequences untouched otherwise.

        Raises:
            ValueError: If `add_eos_token` is enabled but no `eos_token` is set.
        """
        eos = self.eos_token
        eos_token_id = self.eos_token_id

        if eos is None and self.add_eos_token:
            raise ValueError("add_eos_token = True but eos_token = None")

        if self.add_eos_token:
            single = f"$A:0 {eos}:0"
            pair = f"{single} $B:1 {eos}:1"
            special_tokens = [(eos, eos_token_id)]
        else:
            single = "$A:0"
            pair = f"{single} $B:1"
            special_tokens = []

        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=single, pair=pair, special_tokens=special_tokens
        )

    @property
    def add_eos_token(self):
        """Whether an `eos_token` is appended at the end of each sequence."""
        return self._add_eos_token

    @add_eos_token.setter
    def add_eos_token(self, value):
        # Without a setter the property is read-only and assignment raises
        # AttributeError, unlike the slow tokenizer where this is a plain attribute.
        # The post-processor must be rebuilt for the new value to take effect.
        self._add_eos_token = value
        self.update_post_processor()

3
tokenizer.json Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a56524092f5d0676e63537511b535e73e7580a7efe440247ef3fa43d019a0af0
size 11422261

220
tokenizer_config.json Normal file
View File

@@ -0,0 +1,220 @@
{
"add_bos_token": false,
"add_eos_token": true,
"add_prefix_space": false,
"added_tokens_decoder": {
"151643": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151644": {
"content": "<|im_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151645": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151646": {
"content": "<|object_ref_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151647": {
"content": "<|object_ref_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151648": {
"content": "<|box_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151649": {
"content": "<|box_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151650": {
"content": "<|quad_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151651": {
"content": "<|quad_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151652": {
"content": "<|vision_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151653": {
"content": "<|vision_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151654": {
"content": "<|vision_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151655": {
"content": "<|image_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151656": {
"content": "<|video_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151657": {
"content": "<tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151658": {
"content": "</tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151659": {
"content": "<|fim_prefix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151660": {
"content": "<|fim_middle|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151661": {
"content": "<|fim_suffix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151662": {
"content": "<|fim_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151663": {
"content": "<|repo_name|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151664": {
"content": "<|file_sep|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151665": {
"content": "<instruct>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151666": {
"content": "<query>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"additional_special_tokens": [
"<instruct>",
"<query>"
],
"auto_map": {
"AutoTokenizer": [
"tokenization_qwen.Qwen2Tokenizer",
null
]
},
"bos_token": null,
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
"clean_up_tokenization_spaces": false,
"eos_token": "<|endoftext|>",
"errors": "replace",
"extra_special_tokens": {},
"model_max_length": 256,
"pad_token": "<|endoftext|>",
"split_special_tokens": false,
"tokenizer_class": "Qwen2Tokenizer",
"unk_token": null
}

1
vocab.json Normal file

File diff suppressed because one or more lines are too long