Compare commits

..

10 Commits

Author         SHA1        Message                                                         Date
x54-729        571400500e  replace get_max_length                                          2025-03-13 15:04:55 +08:00
ai-modelscope  b7229bee87  update modeling file to newest                                  2025-02-26 21:05:04 +08:00
ai-modelscope  b19e3ae77b  update modeling file to newest                                  2025-01-15 22:25:04 +08:00
x54-729        ece3aef931  update modeling file to newest                                  2024-08-20 06:49:50 +00:00
x54-729        5729cef948  fix no white space when using stream_chat with fast tokenizer  2024-02-28 13:33:29 +08:00
x54-729        ad112bb573  Add special tokens for chat                                     2024-02-26 12:22:08 +08:00
x54-729        2d5cd71608  update readme                                                   2024-01-29 16:38:06 +08:00
x54-729        2b57cbed95  fix img url                                                     2024-01-29 16:33:00 +08:00
x54-729        5fed2070aa  metadata                                                        2024-01-29 16:26:54 +08:00
x54-729        e36f66b425  update readme                                                   2024-01-29 16:16:55 +08:00
7 changed files with 911 additions and 425 deletions

.gitattributes vendored (42 lines changed)

@@ -1,55 +1,35 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
-*.bin.* filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
-*.zstandard filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
-*.tfevents* filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.db* filter=lfs diff=lfs merge=lfs -text
+*.ark* filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
-model-00001-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00003-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00005-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00006-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00014-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00004-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00011-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00012-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00017-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00020-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00002-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00009-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00015-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00021-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00007-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00008-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00010-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00013-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00016-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00018-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text
-model-00019-of-00021.safetensors filter=lfs diff=lfs merge=lfs -text

README.md

@@ -24,22 +24,33 @@ tags:
 <div> </div>
 </div>
 State-of-the-art bilingual open-sourced Math reasoning LLMs.
+A **solver**, **prover**, **verifier**, **augmentor**.
+[💻 Github](https://github.com/InternLM/InternLM-Math) [🤗 Demo](https://huggingface.co/spaces/internlm/internlm2-math-7b) [🤗 Checkpoints](https://huggingface.co/internlm/internlm2-math-7b) [![OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM2-Math-7B) [<img src="https://raw.githubusercontent.com/InternLM/InternLM-Math/main/assets/modelscope_logo.png" width="20px" /> ModelScope](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-7b/summary)
 </div>
+# News
+- [2024.01.29] We add checkpoints from ModelScope. Tech report is on the way!
+- [2024.01.26] We add checkpoints from OpenXLab, which ease Chinese users to download!
 # Introduction
 - **7B and 20B Chinese and English Math LMs with better than ChatGPT performances.** InternLM2-Math are continued pretrained from InternLM2-Base with ~100B high quality math-related tokens and SFT with ~2M bilingual math supervised data. We apply minhash and exact number match to decontaminate possible test set leakage.
 - **Add Lean as a support language for math problem solving and math theorem proving.** We are exploring combining Lean 3 with InternLM-Math for verifiable math reasoning. InternLM-Math can generate Lean codes for simple math reasoning tasks like GSM8K or provide possible proof tactics based on Lean states.
 - **Also can be viewed as a reward model, which supports the Outcome/Process/Lean Reward Model.** We supervise InternLM2-Math with various types of reward modeling data, to make InternLM2-Math can also verify chain-of-thought processes. We also add the ability to convert a chain-of-thought process into Lean 3 code.
-- **A Math LM Augment Helper** and **Code Intepreter**. InternLM2-Math can help augment math reasoning problems and solve them using the code interpreter which makes you generate synthesis data quicker!
+- **A Math LM Augment Helper** and **Code Interpreter**. InternLM2-Math can help augment math reasoning problems and solve them using the code interpreter which makes you generate synthesis data quicker!
+![hungarian](https://raw.githubusercontent.com/InternLM/InternLM-Math/main/assets/hungary.jpeg)
 # Models
-| Model | Transformers(HF) |Release Date |
-|---|---|---|
-| **InternLM2-Math-Base-7B** | [🤗internlm/internlm2-math-base-7b](https://huggingface.co/internlm/internlm2-math-base-7b) | 2024-01-23|
-| **InternLM2-Math-Base-20B** | [🤗internlm/internlm2-math-base-20b](https://huggingface.co/internlm/internlm2-math-base-20b) | 2024-01-23|
-| **InternLM2-Math-7B** | [🤗internlm/internlm2-math-7b](https://huggingface.co/internlm/internlm2-math-7b) | 2024-01-23|
-| **InternLM2-Math-20B** | [🤗internlm/internlm2-math-20b](https://huggingface.co/internlm/internlm2-math-20b) | 2024-01-23|
+**InternLM2-Math-Base-7B** and **InternLM2-Math-Base-20B** are pretrained checkpoints. **InternLM2-Math-7B** and **InternLM2-Math-20B** are SFT checkpoints.
+| Model |Model Type | Transformers(HF) |OpenXLab| ModelScope | Release Date |
+|---|---|---|---|---|---|
+| **InternLM2-Math-Base-7B** | Base| [🤗internlm/internlm2-math-base-7b](https://huggingface.co/internlm/internlm2-math-base-7b) |[![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM2-Math-Base-7B)| [<img src="https://raw.githubusercontent.com/InternLM/InternLM-Math/main/assets/modelscope_logo.png" width="20px" /> internlm2-math-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-7b/summary)| 2024-01-23|
+| **InternLM2-Math-Base-20B** | Base| [🤗internlm/internlm2-math-base-20b](https://huggingface.co/internlm/internlm2-math-base-20b) |[![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM2-Math-Base-20B)|[<img src="https://raw.githubusercontent.com/InternLM/InternLM-Math/main/assets/modelscope_logo.png" width="20px" /> internlm2-math-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-20b/summary)| 2024-01-23|
+| **InternLM2-Math-7B** | Chat| [🤗internlm/internlm2-math-7b](https://huggingface.co/internlm/internlm2-math-7b) |[![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM2-Math-7B)|[<img src="https://raw.githubusercontent.com/InternLM/InternLM-Math/main/assets/modelscope_logo.png" width="20px" /> internlm2-math-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-7b/summary)| 2024-01-23|
+| **InternLM2-Math-20B** | Chat| [🤗internlm/internlm2-math-20b](https://huggingface.co/internlm/internlm2-math-20b) |[![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM2-Math-20B)|[<img src="https://raw.githubusercontent.com/InternLM/InternLM-Math/main/assets/modelscope_logo.png" width="20px" /> internlm2-math-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-20b/summary)| 2024-01-23|
 # Performance
@@ -81,14 +92,26 @@ All performance is based on greedy decoding with COT. We notice that the perform
 # Inference
+## LMDeploy
+We suggest using [LMDeploy](https://github.com/InternLM/LMDeploy)(>=0.2.1) for inference.
 ```python
-from modelscope import snapshot_download, AutoTokenizer, AutoModelForCausalLM
-import torch
-model_dir = snapshot_download("Shanghai_AI_Laboratory/internlm2-math-20b")
-tokenizer = AutoTokenizer.from_pretrained(model_dir, device_map="auto", trust_remote_code=True)
+from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig
+backend_config = TurbomindEngineConfig(model_name='internlm2-chat-7b', tp=1, cache_max_entry_count=0.3)
+chat_template = ChatTemplateConfig(model_name='internlm2-chat-7b', system='', eosys='', meta_instruction='')
+pipe = pipeline(model_path='internlm/internlm2-math-7b', chat_template_config=chat_template, backend_config=backend_config)
+problem = '1+1='
+result = pipe([problem], request_output_len=1024, top_k=1)
+```
+## Huggingface
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-math-20b", trust_remote_code=True)
 # Set `torch_dtype=torch.float16` to load model in float16, otherwise it will be loaded as float32 and might cause OOM Error.
-model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, torch_dtype=torch.float16)
+model = AutoModelForCausalLM.from_pretrained("internlm/internlm2-math-20b", trust_remote_code=True, torch_dtype=torch.float16).cuda()
 model = model.eval()
 response, history = model.chat(tokenizer, "1+1=", history=[], meta_instruction="")
 print(response)
@@ -97,6 +120,26 @@ print(response)
 # Special usages
 We list some instructions used in our SFT. You can use them to help you. You can use the other ways to prompt the model, but the following are recommended. InternLM2-Math may combine the following abilities but it is not guaranteed.
+Translate proof problem to Lean:
+![nl2lean3](https://raw.githubusercontent.com/InternLM/InternLM-Math/main/assets/nl2lean.jpeg)
+Using Lean 3 to solve GSM8K problem:
+![gsm8k_lean](https://raw.githubusercontent.com/InternLM/InternLM-Math/main/assets/gsm8k_lean.jpeg)
+Generate problem based on Lean 3 code:
+![lean_problem](https://raw.githubusercontent.com/InternLM/InternLM-Math/main/assets/lean_problem.jpeg)
+Play 24 point game:
+![24](https://raw.githubusercontent.com/InternLM/InternLM-Math/main/assets/24.jpeg)
+Augment a harder math problem:
+![augment_hard](https://raw.githubusercontent.com/InternLM/InternLM-Math/main/assets/augment_hard.jpeg)
 | Description | Query |
 | --- | --- |
 | Solving question via chain-of-thought | {Question} |
@@ -105,26 +148,27 @@ We list some instructions used in our SFT. You can use them to help you. You can
 | Process reward model | Given a question and an answer, check correctness of each step.\nQuestion:{Question}\nAnswer:{COT} |
 | Reward model | Given a question and two answers, which one is better? \nQuestion:{Question}\nAnswer 1:{COT}\nAnswer 2:{COT} |
 | Convert chain-of-thought to Lean 3 | Convert this answer into Lean3. Question:{Question}\nAnswer:{COT} |
-| Convert Lean 3 to chain-of-thought | Convert this lean 3 code into a natural language problem with answers:\n{LEAN} |
+| Convert Lean 3 to chain-of-thought | Convert this lean 3 code into a natural language problem with answers:\n{LEAN Code} |
 | Translate question and chain-of-thought answer to a proof statement | Convert this question and answer into a proof format.\nQuestion:{Question}\nAnswer:{COT} |
 | Translate proof problem to Lean 3 | Convert this natural langauge statement into a Lean 3 theorem statement:{Theorem} |
 | Translate Lean 3 to proof problem | Convert this Lean 3 theorem statement into natural language:{STATEMENT} |
-| Suggest a tactic based on Lean state | Given the Lean 3 tactic state, suggest a next tactic:\n{State} |
+| Suggest a tactic based on Lean state | Given the Lean 3 tactic state, suggest a next tactic:\n{LEAN State} |
-| Rephrase Problem | Describe this problem in another way. {STATEMENT} |
+| Rephrase Problem | Describe this problem in another way. {Question} |
 | Augment Problem | Please augment a new problem based on: {Question} |
 | Augment a harder Problem | Increase the complexity of the problem: {Question} |
 | Change specific numbers | Change specific numbers: {Question}|
 | Introduce fractions or percentages | Introduce fractions or percentages: {Question}|
-| Code Intepreter | [lagent](https://github.com/InternLM/InternLM/blob/main/agent/lagent.md) |
+| Code Interpreter | [lagent](https://github.com/InternLM/InternLM/blob/main/agent/lagent.md) |
 | In-context Learning | Question:{Question}\nAnswer:{COT}\n...Question:{Question}\nAnswer:{COT}|
 # Fine-tune and others
 Please refer to [InternLM](https://github.com/InternLM/InternLM/tree/main).
 # Known issues
-Our model is still under development and will be upgraded. There are some possible issues of InternLM-Math.
+Our model is still under development and will be upgraded. There are some possible issues of InternLM-Math. If you find performances of some abilities are not great, welcome to open an issue.
 - Jump the calculating step.
 - Perform badly at Chinese fill-in-the-bank problems and English choice problems due to SFT data composition.
+- Tend to generate Code Interpreter when facing Chinese problems due to SFT data composition.
 - The reward model mode can be better leveraged with assigned token probabilities.
 - Code switch due to SFT data composition.
 - Some abilities of Lean can only be adapted to GSM8K-like problems (e.g. Convert chain-of-thought to Lean 3), and performance related to Lean is not guaranteed.
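Note: the whitespace fix in commit 5729cef948 shows up through the model's streaming interface rather than the `chat` call above. A minimal sketch, assuming this repo's remote code exposes `stream_chat` with the usual InternLM2 signature (a generator of `(response, history)` tuples); not verified against this exact revision:

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Assumption: `stream_chat` exists in this repo's remote modeling code, as in
# other InternLM2 releases; treat this as an illustrative sketch.
tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-math-7b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "internlm/internlm2-math-7b", trust_remote_code=True, torch_dtype=torch.float16
).cuda().eval()

# Print only the newly generated suffix of each partial response. Before the
# fix, the fast tokenizer could drop the space between successive chunks, so
# the printed deltas ran together.
printed = ""
for response, _history in model.stream_chat(tokenizer, "1+1=", history=[]):
    print(response[len(printed):], end="", flush=True)
    printed = response
```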

configuration_internlm2.py

@@ -44,9 +44,9 @@ class InternLM2Config(PretrainedConfig):
         intermediate_size (`int`, *optional*, defaults to 11008):
             Dimension of the MLP representations.
         num_hidden_layers (`int`, *optional*, defaults to 32):
-            Number of hidden layers in the Transformer encoder.
+            Number of hidden layers in the Transformer decoder.
         num_attention_heads (`int`, *optional*, defaults to 32):
-            Number of attention heads for each attention layer in the Transformer encoder.
+            Number of attention heads for each attention layer in the Transformer decoder.
         num_key_value_heads (`int`, *optional*):
             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
@@ -58,22 +58,42 @@ class InternLM2Config(PretrainedConfig):
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
         max_position_embeddings (`int`, *optional*, defaults to 2048):
-            The maximum sequence length that this model might ever be used with. Typically set this to something large
-            just in case (e.g., 512 or 1024 or 2048).
+            The maximum sequence length that this model might ever be used with. InternLM2 supports up to 32768 tokens.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        rms_norm_eps (`float`, *optional*, defaults to 1e-12):
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
             The epsilon used by the rms normalization layers.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
-        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        pretraining_tp (`int`, *optional*, defaults to 1):
+            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+            document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism)
+            to understand more about it. This value is necessary to ensure exact reproducibility
+            of the pretraining results. Please refer to [this
+            issue](https://github.com/pytorch/pytorch/issues/76232).
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie weight embeddings
-    Example:
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
     """
-    model_type = "internlm2"
     _auto_class = "AutoConfig"
+    model_type = "internlm2"
+    keys_to_ignore_at_inference = ["past_key_values"]

     def __init__(  # pylint: disable=W0102
         self,
@@ -91,11 +111,12 @@ class InternLM2Config(PretrainedConfig):
         pad_token_id=0,
         bos_token_id=1,
         eos_token_id=2,
+        pretraining_tp=1,
         tie_word_embeddings=False,
         bias=True,
         rope_theta=10000,
         rope_scaling=None,
-        attn_implementation="eager",
+        attn_implementation=None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -113,14 +134,15 @@ class InternLM2Config(PretrainedConfig):
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
         self.use_cache = use_cache
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
         self._rope_scaling_validation()
         self.attn_implementation = attn_implementation
         if self.attn_implementation is None:
             self.attn_implementation = "eager"
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
@@ -147,5 +169,12 @@ class InternLM2Config(PretrainedConfig):
             raise ValueError(
                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
             )
-        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
-            raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")
+        if (
+            rope_scaling_factor is None
+            or not isinstance(rope_scaling_factor, (float, int))
+            or rope_scaling_factor < 1.0
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s factor field must be a number >= 1, got {rope_scaling_factor} "
+                f"of type {type(rope_scaling_factor)}"
+            )
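The final hunk widens the `rope_scaling` factor check from `float` only to `(float, int)`. A self-contained sketch of the new validation logic, re-implemented here for illustration rather than imported from the repo:

```python
def validate_rope_scaling(rope_scaling):
    """Mirror of the widened validator in the diff above, for illustration only."""
    if rope_scaling is None:
        return
    rope_scaling_type = rope_scaling.get("type")
    rope_scaling_factor = rope_scaling.get("factor")
    if rope_scaling_type not in ("linear", "dynamic"):
        raise ValueError(
            f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
        )
    if (
        rope_scaling_factor is None
        or not isinstance(rope_scaling_factor, (float, int))
        or rope_scaling_factor < 1.0
    ):
        raise ValueError(f"`rope_scaling`'s factor field must be a number >= 1, got {rope_scaling_factor}")

validate_rope_scaling({"type": "dynamic", "factor": 2})  # now accepted; the old float-only check raised here
try:
    validate_rope_scaling({"type": "linear", "factor": 0.5})
except ValueError as err:
    print(err)  # factors below 1 are still rejected
```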

modeling_internlm2.py (file diff suppressed because it is too large)

special_tokens_map.json

@@ -1,4 +1,12 @@
 {
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|action_start|>",
+    "<|action_end|>",
+    "<|interpreter|>",
+    "<|plugin|>"
+  ],
   "bos_token": "<s>",
   "eos_token": "</s>",
   "pad_token": "</s>",

tokenization_internlm2_fast.py

@@ -56,14 +56,14 @@ class InternLM2Converter(SpmConverter):
         return unk_id

     def decoder(self, replacement, add_prefix_space):
-        return decoders.Sequence(
-            [
-                decoders.Replace("▁", " "),
-                decoders.ByteFallback(),
-                decoders.Fuse(),
-                decoders.Strip(content=" ", left=1),
-            ]
-        )
+        decoders_sequence = [
+            decoders.Replace("▁", " "),
+            decoders.ByteFallback(),
+            decoders.Fuse(),
+        ]
+        if self.proto.normalizer_spec.add_dummy_prefix:
+            decoders_sequence.append(decoders.Strip(content=" ", left=1))
+        return decoders.Sequence(decoders_sequence)

     def tokenizer(self, proto):
         model_type = proto.trainer_spec.model_type
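This decoder change is the substance of the stream_chat whitespace fix: the old pipeline always stripped one leading space, which also swallowed the space at the start of a continuation chunk; the new pipeline strips only when the sentencepiece model adds a dummy prefix. A small sketch with the tokenizers library, assuming a recent release where `Decoder.decode` is exposed (the token strings are invented for illustration):

```python
from tokenizers import decoders

# Old behaviour: unconditionally strip one leading space after decoding.
old = decoders.Sequence([
    decoders.Replace("▁", " "),
    decoders.ByteFallback(),
    decoders.Fuse(),
    decoders.Strip(content=" ", left=1),
])
# New behaviour when the model has no dummy prefix: keep the leading space.
new = decoders.Sequence([
    decoders.Replace("▁", " "),
    decoders.ByteFallback(),
    decoders.Fuse(),
])

chunk = ["▁with", "▁fast"]  # a continuation chunk produced mid-stream
print(repr(old.decode(chunk)))  # 'with fast'  (leading space lost)
print(repr(new.decode(chunk)))  # ' with fast' (leading space kept)
```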

tokenizer_config.json

@@ -86,5 +86,13 @@
     "special": true
     }
   },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|action_start|>",
+    "<|action_end|>",
+    "<|interpreter|>",
+    "<|plugin|>"
+  ],
   "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
 }
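Since `chat_template` is plain Jinja, it can be previewed without downloading the tokenizer. A sketch using the jinja2 package directly; `<s>` is a stand-in for the real `bos_token` value:

```python
from jinja2 import Template

# Template string copied from the diff above; '\n' escapes become real newlines,
# exactly as they do when transformers loads tokenizer_config.json.
chat_template = (
    "{{ bos_token }}{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    "{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
)

print(Template(chat_template).render(
    bos_token="<s>",  # stand-in value for illustration
    messages=[{"role": "user", "content": "1+1="}],
    add_generation_prompt=True,
))
# Expected output:
# <s><|im_start|>user
# 1+1=<|im_end|>
# <|im_start|>assistant
```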