初始化项目，由ModelHub XC社区提供模型

Model: AI-ModelScope/internlm3-8b-instruct Source: Original Platform
2026-05-22 10:08:12 +08:00
commit a8edc4b024
15 changed files with 3608 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,35 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
 *.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pickle filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/
   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
   1. Definitions.
      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.
      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.
      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.
      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.
      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.
      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.
      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).
      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.
      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."
      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.
   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.
   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.
   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:
      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and
      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and
      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and
      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.
      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.
   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.
   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.
   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.
   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.
   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.
   END OF TERMS AND CONDITIONS
   APPENDIX: How to apply the Apache License to your work.
      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.
   Copyright 2023-2024 Shanghai AI Laboratory
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,889 @@
 ---
 license: apache-2.0
 pipeline_tag: text-generation
 ---
 # InternLM 
 <div align="center">
 <img src="https://github.com/InternLM/InternLM/assets/22529082/b9788105-8892-4398-8b47-b513a292378e" width="200"/>
  <div>&nbsp;</div>
  <div align="center">
    <b><font size="5">InternLM</font></b>
    <sup>
      <a href="https://internlm.intern-ai.org.cn/">
        <i><font size="4">HOT</font></i>
      </a>
    </sup>
    <div>&nbsp;</div>
  </div>
 [![evaluation](https://github.com/InternLM/InternLM/assets/22529082/f80a2a58-5ddf-471a-8da4-32ab65c8fd3b)](https://github.com/internLM/OpenCompass/)
 [💻Github Repo](https://github.com/InternLM/InternLM) • [🤗Demo](https://huggingface.co/spaces/internlm/internlm3-8b-instruct) • [🤔Reporting Issues](https://github.com/InternLM/InternLM/issues/new) • [📜Technical Report](https://arxiv.org/abs/2403.17297)
 </div>
 <p align="center">
    👋 join us on <a href="https://discord.gg/xa29JuW87d" target="_blank">Discord</a> and <a href="https://github.com/InternLM/InternLM/assets/25839884/a6aad896-7232-4220-ac84-9e070c2633ce" target="_blank">WeChat</a>
 </p>
 ## Introduction 
 InternLM3 has open-sourced an 8-billion parameter instruction model, InternLM3-8B-Instruct, designed for general-purpose usage and advanced reasoning. This model has the following characteristics:
 - **Enhanced performance at reduced cost**: 
 State-of-the-art performance on reasoning and knowledge-intensive tasks, surpassing models like Llama3.1-8B and Qwen2.5-7B. Remarkably, InternLM3 is trained on only 4 trillion high-quality tokens, saving more than 75% of the training cost compared to other LLMs of similar scale. 
 - **Deep thinking capability**:
 InternLM3 supports both the deep thinking mode for solving complicated reasoning tasks via the long chain-of-thought and the normal response mode for fluent user interactions. 
 ## InternLM3-8B-Instruct
 ### Performance Evaluation
 We conducted a comprehensive evaluation of InternLM using the open-source evaluation tool [OpenCompass](https://github.com/internLM/OpenCompass/). The evaluation covered five dimensions of capabilities: disciplinary competence, language competence, knowledge competence, inference competence, and comprehension competence. Here are some of the evaluation results, and you can visit the [OpenCompass leaderboard](https://rank.opencompass.org.cn) for more evaluation results.
 |              | Benchmark                       | InternLM3-8B-Instruct | Qwen2.5-7B-Instruct | Llama3.1-8B-Instruct | GPT-4o-mini(closed source) |
 | ------------ | ------------------------------- | --------------------- | ------------------- | -------------------- | -------------------------- |
 | General      | CMMLU(0-shot)                   | **83.1**              | 75.8                | 53.9                 | 66.0                       |
 |              | MMLU(0-shot)                    | 76.6                  | **76.8**            | 71.8                 | 82.7                       |
 |              | MMLU-Pro(0-shot)                | **57.6**              | 56.2                | 48.1                 | 64.1                       |
 | Reasoning    | GPQA-Diamond(0-shot)            | **37.4**              | 33.3                | 24.2                 | 42.9                       |
 |              | DROP(0-shot)                    | **83.1**              | 80.4                | 81.6                 | 85.2                       |
 |              | HellaSwag(10-shot)              | **91.2**              | 85.3                | 76.7                 | 89.5                       |
 |              | KOR-Bench(0-shot)               | **56.4**              | 44.6                | 47.7                 | 58.2                       |
 | MATH         | MATH-500(0-shot)                | **83.0***             | 72.4                | 48.4                 | 74.0                       |
 |              | AIME2024(0-shot)                | **20.0***             | 16.7                | 6.7                  | 13.3                       |
 | Coding       | LiveCodeBench(2407-2409 Pass@1) | **17.8**              | 16.8                | 12.9                 | 21.8                       |
 |              | HumanEval(Pass@1)               | 82.3                  | **85.4**            | 72.0                 | 86.6                       |
 | Instruction  | IFEval(Prompt-Strict)           | **79.3**              | 71.7                | 75.2                 | 79.7                       |
 | Long Context | RULER(4-128K Average)           | 87.9                  | 81.4                | **88.5**             | 90.7                       |
 | Chat         | AlpacaEval 2.0(LC WinRate)      | **51.1**              | 30.3                | 25.0                 | 50.7                       |
 |              | WildBench(Raw Score)            | **33.1**              | 23.3                | 1.5                  | 40.3                       |
 |              | MT-Bench-101(Score 1-10)        | **8.59**              | 8.49                | 8.37                 | 8.87                       |
 - Values marked in bold indicate the **highest** score among the open-source models.
 - The evaluation results were obtained from [OpenCompass](https://github.com/internLM/OpenCompass/) (results marked with * were evaluated in Thinking Mode), and the evaluation configuration can be found in the configuration files provided by [OpenCompass](https://github.com/internLM/OpenCompass/). 
 - The evaluation data may have numerical differences due to the version iteration of [OpenCompass](https://github.com/internLM/OpenCompass/), so please refer to the latest evaluation results of [OpenCompass](https://github.com/internLM/OpenCompass/).
 **Limitations:** Although we have made efforts to ensure the safety of the model during the training process and to encourage the model to generate text that complies with ethical and legal requirements, the model may still produce unexpected outputs due to its size and probabilistic generation paradigm. For example, the generated responses may contain biases, discrimination, or other harmful content. Please do not propagate such content. We are not responsible for any consequences resulting from the dissemination of harmful information.
 ### Requirements
 ```text
 transformers >= 4.48
 ```
 ### Conversation Mode
 #### Transformers inference
 To load the InternLM3 8B Instruct model using Transformers, use the following code:
 ```python
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 model_dir = "internlm/internlm3-8b-instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
 # Set `torch_dtype=torch.bfloat16` to load the model in bfloat16; otherwise it will be loaded as float32 and might cause an OOM error.
 model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True, torch_dtype=torch.bfloat16).cuda()
 # (Optional) If on low resource devices, you can load model in 4-bit or 8-bit to further save GPU memory via bitsandbytes.
  # InternLM3 8B in 4bit will cost nearly 8GB GPU memory.
  # pip install -U bitsandbytes
  # 8-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_8bit=True)
  # 4-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_4bit=True)
 model = model.eval()
 system_prompt = """You are an AI assistant whose name is InternLM (书生·浦语).
 - InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
 - InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文."""
 messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": "Please tell me five scenic spots in Shanghai"},
 ]
 tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
 generated_ids = model.generate(tokenized_chat, max_new_tokens=1024, temperature=1, repetition_penalty=1.005, top_k=40, top_p=0.8)
 generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(tokenized_chat, generated_ids)
 ]
 prompt = tokenizer.batch_decode(tokenized_chat)[0]
 print(prompt)
 response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 print(response)
 ```
 #### LMDeploy inference
 LMDeploy is a toolkit for compressing, deploying, and serving LLMs, developed by the MMRazor and MMDeploy teams.
 ```bash
 pip install lmdeploy
 ```
 You can run batch inference locally with the following python code:
 ```python
 import lmdeploy
 model_dir = "internlm/internlm3-8b-instruct"
 pipe = lmdeploy.pipeline(model_dir)
 response = pipe("Please tell me five scenic spots in Shanghai")
 print(response)
 ```
 Or you can launch an OpenAI compatible server with the following command:
 ```bash
 lmdeploy serve api_server internlm/internlm3-8b-instruct --model-name internlm3-8b-instruct --server-port 23333 
 ```
 Then you can send a chat request to the server:
 ```bash
 curl http://localhost:23333/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
    "model": "internlm3-8b-instruct",
    "messages": [
    {"role": "user", "content": "Please tell me five scenic spots in Shanghai"}
    ]
    }'
 ```
 Find more details in the [LMDeploy documentation](https://lmdeploy.readthedocs.io/en/latest/)
 #### Ollama inference
 First, install Ollama, fetch the model, and install the Python client:
 ```bash
 # install ollama
 curl -fsSL https://ollama.com/install.sh | sh
 # fetch model
 ollama pull internlm/internlm3-8b-instruct
 # install 
 pip install ollama
 ```
 Inference code:
 ```python
 import ollama
 system_prompt = """You are an AI assistant whose name is InternLM (书生·浦语).
 - InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
 - InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文."""
 messages = [
    {
        "role": "system",
        "content": system_prompt,
    },
    {
        "role": "user",
        "content": "Please tell me five scenic spots in Shanghai"
    },
 ]
 stream = ollama.chat(
    model='internlm/internlm3-8b-instruct',
    messages=messages,
    stream=True,
 )
 for chunk in stream:
  print(chunk['message']['content'], end='', flush=True)
 ```
 #### vLLM inference
 Refer to [installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) to install the latest code of vllm
 ```bash
 pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
 ```
 inference code:
 ```python
 from vllm import LLM, SamplingParams
 llm = LLM(model="internlm/internlm3-8b-instruct")
 sampling_params = SamplingParams(temperature=1, repetition_penalty=1.005, top_k=40, top_p=0.8)
 system_prompt = """You are an AI assistant whose name is InternLM (书生·浦语).
 - InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
 - InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文."""
 prompts = [
    {
        "role": "system",
        "content": system_prompt,
    },
    {
        "role": "user",
        "content": "Please tell me five scenic spots in Shanghai"
    },
 ]
 outputs = llm.chat(prompts,
                   sampling_params=sampling_params,
                   use_tqdm=False)
 print(outputs)
 ```
 ### Thinking Mode
 #### Thinking Demo
 <img src="https://github.com/InternLM/InternLM/blob/017ba7446d20ecc3b9ab8e7b66cc034500868ab4/assets/solve_puzzle.png?raw=true" width="400"/>
 #### Thinking system prompt
 ```python
 thinking_system_prompt = """You are an expert mathematician with extensive experience in mathematical competitions. You approach problems through systematic thinking and rigorous reasoning. When solving problems, follow these thought processes:
 ## Deep Understanding
 Take time to fully comprehend the problem before attempting a solution. Consider:
 - What is the real question being asked?
 - What are the given conditions and what do they tell us?
 - Are there any special restrictions or assumptions?
 - Which information is crucial and which is supplementary?
 ## Multi-angle Analysis
 Before solving, conduct thorough analysis:
 - What mathematical concepts and properties are involved?
 - Can you recall similar classic problems or solution methods?
 - Would diagrams or tables help visualize the problem?
 - Are there special cases that need separate consideration?
 ## Systematic Thinking
 Plan your solution path:
 - Propose multiple possible approaches
 - Analyze the feasibility and merits of each method
 - Choose the most appropriate method and explain why
 - Break complex problems into smaller, manageable steps
 ## Rigorous Proof
 During the solution process:
 - Provide solid justification for each step
 - Include detailed proofs for key conclusions
 - Pay attention to logical connections
 - Be vigilant about potential oversights
 ## Repeated Verification
 After completing your solution:
 - Verify your results satisfy all conditions
 - Check for overlooked special cases
 - Consider if the solution can be optimized or simplified
 - Review your reasoning process
 Remember:
 1. Take time to think thoroughly rather than rushing to an answer
 2. Rigorously prove each key conclusion
 3. Keep an open mind and try different approaches
 4. Summarize valuable problem-solving methods
 5. Maintain healthy skepticism and verify multiple times
 Your response should reflect deep mathematical understanding and precise logical thinking, making your solution path and reasoning clear to others.
 When you're ready, present your complete solution with:
 - Clear problem understanding
 - Detailed solution process
 - Key insights
 - Thorough verification
 Focus on clear, logical progression of ideas and thorough explanation of your mathematical reasoning. Provide answers in the same language as the user asking the question, repeat the final answer using a '\\boxed{}' without any units, you have [[8192]] tokens to complete the answer.
 """
 ```
 #### Transformers inference
 ```python
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 model_dir = "internlm/internlm3-8b-instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
 # Set `torch_dtype=torch.bfloat16` to load the model in bfloat16; otherwise it will be loaded as float32 and might cause an OOM error.
 model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True, torch_dtype=torch.bfloat16).cuda()
 # (Optional) If on low resource devices, you can load model in 4-bit or 8-bit to further save GPU memory via bitsandbytes.
  # InternLM3 8B in 4bit will cost nearly 8GB GPU memory.
  # pip install -U bitsandbytes
  # 8-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_8bit=True)
  # 4-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_4bit=True)
 model = model.eval()
 messages = [
    {"role": "system", "content": thinking_system_prompt},
    {"role": "user", "content": "Given the function\(f(x)=\mathrm{e}^{x}-ax - a^{3}\),\n(1) When \(a = 1\), find the equation of the tangent line to the curve \(y = f(x)\) at the point \((1,f(1))\).\n(2) If \(f(x)\) has a local minimum and the minimum value is less than \(0\), determine the range of values for \(a\)."},
 ]
 tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
 generated_ids = model.generate(tokenized_chat, max_new_tokens=8192)
 generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(tokenized_chat, generated_ids)
 ]
 prompt = tokenizer.batch_decode(tokenized_chat)[0]
 print(prompt)
 response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 print(response)
 ```
 #### LMDeploy inference
 LMDeploy is a toolkit for compressing, deploying, and serving LLMs.
 ```bash
 pip install lmdeploy
 ```
 You can run batch inference locally with the following python code:
 ```python
 from lmdeploy import pipeline, GenerationConfig, ChatTemplateConfig
 model_dir = "internlm/internlm3-8b-instruct"
 chat_template_config = ChatTemplateConfig(model_name='internlm3')
 pipe = pipeline(model_dir, chat_template_config=chat_template_config)
 messages = [
        {"role": "system", "content": thinking_system_prompt},
        {"role": "user", "content": "Given the function\(f(x)=\mathrm{e}^{x}-ax - a^{3}\),\n(1) When \(a = 1\), find the equation of the tangent line to the curve \(y = f(x)\) at the point \((1,f(1))\).\n(2) If \(f(x)\) has a local minimum and the minimum value is less than \(0\), determine the range of values for \(a\)."},
 ]
 response = pipe(messages, gen_config=GenerationConfig(max_new_tokens=2048))
 print(response)
 ```
 #### Ollama inference
 First, install Ollama, fetch the model, and install the Python client:
 ```bash
 # install ollama
 curl -fsSL https://ollama.com/install.sh | sh
 # fetch model
 ollama pull internlm/internlm3-8b-instruct
 # install
 pip install ollama
 ```
 Inference code:
 ```python
 import ollama
 messages = [
    {
        "role": "system",
        "content": thinking_system_prompt,
    },
    {
        "role": "user",
        "content": "Given the function\(f(x)=\mathrm{e}^{x}-ax - a^{3}\),\n(1) When \(a = 1\), find the equation of the tangent line to the curve \(y = f(x)\) at the point \((1,f(1))\).\n(2) If \(f(x)\) has a local minimum and the minimum value is less than \(0\), determine the range of values for \(a\)."
    },
 ]
 stream = ollama.chat(
    model='internlm/internlm3-8b-instruct',
    messages=messages,
    stream=True,
 )
 for chunk in stream:
  print(chunk['message']['content'], end='', flush=True)
 ```
 #### vLLM inference
 Refer to [installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) to install the latest code of vllm
 ```bash
 pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
 ```
 Inference code:
 ```python
 from vllm import LLM, SamplingParams
 llm = LLM(model="internlm/internlm3-8b-instruct")
 sampling_params = SamplingParams(temperature=1, repetition_penalty=1.005, top_k=40, top_p=0.8, max_tokens=8192)
 prompts = [
    {
        "role": "system",
        "content": thinking_system_prompt,
    },
    {
        "role": "user",
        "content": "Given the function\(f(x)=\mathrm{e}^{x}-ax - a^{3}\),\n(1) When \(a = 1\), find the equation of the tangent line to the curve \(y = f(x)\) at the point \((1,f(1))\).\n(2) If \(f(x)\) has a local minimum and the minimum value is less than \(0\), determine the range of values for \(a\)."
    },
 ]
 outputs = llm.chat(prompts,
                   sampling_params=sampling_params,
                   use_tqdm=False)
 print(outputs)
 ```
 ## Open Source License
 Code and model weights are licensed under Apache-2.0. 
 ## Citation
 ```
@misc{cai2024internlm2,
      title={InternLM2 Technical Report},
      author={Zheng Cai and Maosong Cao and Haojiong Chen and Kai Chen and Keyu Chen and Xin Chen and Xun Chen and Zehui Chen and Zhi Chen and Pei Chu and Xiaoyi Dong and Haodong Duan and Qi Fan and Zhaoye Fei and Yang Gao and Jiaye Ge and Chenya Gu and Yuzhe Gu and Tao Gui and Aijia Guo and Qipeng Guo and Conghui He and Yingfan Hu and Ting Huang and Tao Jiang and Penglong Jiao and Zhenjiang Jin and Zhikai Lei and Jiaxing Li and Jingwen Li and Linyang Li and Shuaibin Li and Wei Li and Yining Li and Hongwei Liu and Jiangning Liu and Jiawei Hong and Kaiwen Liu and Kuikun Liu and Xiaoran Liu and Chengqi Lv and Haijun Lv and Kai Lv and Li Ma and Runyuan Ma and Zerun Ma and Wenchang Ning and Linke Ouyang and Jiantao Qiu and Yuan Qu and Fukai Shang and Yunfan Shao and Demin Song and Zifan Song and Zhihao Sui and Peng Sun and Yu Sun and Huanze Tang and Bin Wang and Guoteng Wang and Jiaqi Wang and Jiayu Wang and Rui Wang and Yudong Wang and Ziyi Wang and Xingjian Wei and Qizhen Weng and Fan Wu and Yingtong Xiong and Chao Xu and Ruiliang Xu and Hang Yan and Yirong Yan and Xiaogui Yang and Haochen Ye and Huaiyuan Ying and Jia Yu and Jing Yu and Yuhang Zang and Chuyu Zhang and Li Zhang and Pan Zhang and Peng Zhang and Ruijie Zhang and Shuo Zhang and Songyang Zhang and Wenjian Zhang and Wenwei Zhang and Xingcheng Zhang and Xinyue Zhang and Hui Zhao and Qian Zhao and Xiaomeng Zhao and Fengzhe Zhou and Zaida Zhou and Jingming Zhuo and Yicheng Zou and Xipeng Qiu and Yu Qiao and Dahua Lin},
      year={2024},
      eprint={2403.17297},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
 }
 ```
 ## 简介
 ### InternLM3-8B-Instruct
 InternLM3，即书生·浦语大模型第3代，开源了80亿参数，面向通用使用与高阶推理的指令模型（InternLM3-8B-Instruct）。模型具备以下特点：
 - **更低的代价取得更高的性能**:
 在推理、知识类任务上取得同量级最优性能，超过Llama3.1-8B和Qwen2.5-7B。值得关注的是InternLM3只用了4万亿词元进行训练，对比同级别模型训练成本节省75%以上。
 - **深度思考能力**:
 InternLM3支持通过长思维链求解复杂推理任务的深度思考模式，同时还兼顾了用户体验更流畅的通用回复模式。
 #### 性能评测
 我们使用开源评测工具 [OpenCompass](https://github.com/internLM/OpenCompass/) 从学科综合能力、语言能力、知识能力、推理能力、理解能力五大能力维度对InternLM开展全面评测，部分评测结果如下表所示，欢迎访问[ OpenCompass 榜单 ](https://rank.opencompass.org.cn)获取更多的评测结果。
 |              | 评测集\模型                     | InternLM3-8B-Instruct | Qwen2.5-7B-Instruct | Llama3.1-8B-Instruct | GPT-4o-mini(闭源) |
 | ------------ | ------------------------------- | --------------------- | ------------------- | -------------------- | ----------------- |
 | General      | CMMLU(0-shot)                   | **83.1**              | 75.8                | 53.9                 | 66.0              |
 |              | MMLU(0-shot)                    | 76.6                  | **76.8**            | 71.8                 | 82.7              |
 |              | MMLU-Pro(0-shot)                | **57.6**              | 56.2                | 48.1                 | 64.1              |
 | Reasoning    | GPQA-Diamond(0-shot)            | **37.4**              | 33.3                | 24.2                 | 42.9              |
 |              | DROP(0-shot)                    | **83.1**              | 80.4                | 81.6                 | 85.2              |
 |              | HellaSwag(10-shot)              | **91.2**              | 85.3                | 76.7                 | 89.5              |
 |              | KOR-Bench(0-shot)               | **56.4**              | 44.6                | 47.7                 | 58.2              |
 | MATH         | MATH-500(0-shot)                | **83.0***             | 72.4                | 48.4                 | 74.0              |
 |              | AIME2024(0-shot)                | **20.0***             | 16.7                | 6.7                  | 13.3              |
 | Coding       | LiveCodeBench(2407-2409 Pass@1) | **17.8**              | 16.8                | 12.9                 | 21.8              |
 |              | HumanEval(Pass@1)               | 82.3                  | **85.4**            | 72.0                 | 86.6              |
 | Instruction  | IFEval(Prompt-Strict)           | **79.3**              | 71.7                | 75.2                 | 79.7              |
 | LongContext  | RULER(4-128K Average)           | 87.9                  | 81.4                | **88.5**             | 90.7              |
 | Chat         | AlpacaEval 2.0(LC WinRate)      | **51.1**              | 30.3                | 25.0                 | 50.7              |
 |              | WildBench(Raw Score)            | **33.1**              | 23.3                | 1.5                  | 40.3              |
 |              | MT-Bench-101(Score 1-10)        | **8.59**              | 8.49                | 8.37                 | 8.87              |
 - 表中标粗的数值表示在对比的开源模型中的最高值。
 - 以上评测结果基于 [OpenCompass](https://github.com/internLM/OpenCompass/) 获得(部分数据标注`*`代表使用深度思考模式进行评测)，具体测试细节可参见 [OpenCompass](https://github.com/internLM/OpenCompass/) 中提供的配置文件。
 - 评测数据会因 [OpenCompass](https://github.com/internLM/OpenCompass/) 的版本迭代而存在数值差异，请以 [OpenCompass](https://github.com/internLM/OpenCompass/) 最新版的评测结果为主。
 **局限性：** 尽管在训练过程中我们非常注重模型的安全性，尽力促使模型输出符合伦理和法律要求的文本，但受限于模型大小以及概率生成范式，模型可能会产生各种不符合预期的输出，例如回复内容包含偏见、歧视等有害内容，请勿传播这些内容。由于传播不良信息导致的任何后果，本项目不承担责任。
 #### 依赖
 ```python
 transformers >= 4.48
 ```
 #### 常规对话模式
 ##### Transformers 推理
 通过以下的代码加载  InternLM3 8B Instruct 模型
 ```python
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 model_dir = "internlm/internlm3-8b-instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
 # Set `torch_dtype=torch.bfloat16` to load model in bfloat16, otherwise it will be loaded as float32 and might cause OOM Error.
 model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True, torch_dtype=torch.bfloat16).cuda()
 # (Optional) If on low resource devices, you can load model in 4-bit or 8-bit to further save GPU memory via bitsandbytes.
  # InternLM3 8B in 4bit will cost nearly 8GB GPU memory.
  # pip install -U bitsandbytes
  # 8-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_8bit=True)
  # 4-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_4bit=True)
 model = model.eval()
 system_prompt = """You are an AI assistant whose name is InternLM (书生·浦语).
 - InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
 - InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文."""
 messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": "Please tell me five scenic spots in Shanghai"},
 ]
 tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
 generated_ids = model.generate(tokenized_chat, max_new_tokens=1024, temperature=1, repetition_penalty=1.005, top_k=40, top_p=0.8)
 generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(tokenized_chat, generated_ids)
 ]
 prompt = tokenizer.batch_decode(tokenized_chat)[0]
 print(prompt)
 response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 print(response)
 ```
 ##### LMDeploy 推理
 LMDeploy 是涵盖了 LLM 任务的全套轻量化、部署和服务解决方案。
 ```bash
 pip install lmdeploy
 ```
 你可以使用以下 python 代码进行本地批量推理:
 ```python
 import lmdeploy
 model_dir = "internlm/internlm3-8b-instruct"
 pipe = lmdeploy.pipeline(model_dir)
 response = pipe(["Please tell me five scenic spots in Shanghai"])
 print(response)
 ```
 或者你可以使用以下命令启动兼容 OpenAI API 的服务:
 ```bash
 lmdeploy serve api_server internlm/internlm3-8b-instruct --model-name internlm3-8b-instruct --server-port 23333 
 ```
 然后你可以向服务端发起一个聊天请求:
 ```bash
 curl http://localhost:23333/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
    "model": "internlm3-8b-instruct",
    "messages": [
    {"role": "user", "content": "介绍一下深度学习。"}
    ]
    }'
 ```
 更多信息请查看 [LMDeploy 文档](https://lmdeploy.readthedocs.io/en/latest/)
 #####  Ollama 推理
 准备工作
 ```bash
 # install ollama
 curl -fsSL https://ollama.com/install.sh | sh
 # fetch 模型
 ollama pull internlm/internlm3-8b-instruct
 # install python库
 pip install ollama
 ```
 推理代码
 ```python
 import ollama
 system_prompt = """You are an AI assistant whose name is InternLM (书生·浦语).
 - InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
 - InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文."""
 messages = [
    {
        "role": "system",
        "content": system_prompt,
    },
    {
        "role": "user",
        "content": "Please tell me five scenic spots in Shanghai"
    },
 ]
 stream = ollama.chat(
    model='internlm/internlm3-8b-instruct',
    messages=messages,
    stream=True,
 )
 for chunk in stream:
  print(chunk['message']['content'], end='', flush=True)
 ```
 #### 
 ##### vLLM 推理
 参考[文档](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) 安装 vllm 最新代码
 ```bash
 pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
 ```
 推理代码
 ```python
 from vllm import LLM, SamplingParams
 llm = LLM(model="internlm/internlm3-8b-instruct")
 sampling_params = SamplingParams(temperature=1, repetition_penalty=1.005, top_k=40, top_p=0.8)
 system_prompt = """You are an AI assistant whose name is InternLM (书生·浦语).
 - InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
 - InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文."""
 prompts = [
    {
        "role": "system",
        "content": system_prompt,
    },
    {
        "role": "user",
        "content": "Please tell me five scenic spots in Shanghai"
    },
 ]
 outputs = llm.chat(prompts,
                   sampling_params=sampling_params,
                   use_tqdm=False)
 print(outputs)
 ```
 #### 深度思考模式
 ##### 深度思考 Demo
 <img src="https://github.com/InternLM/InternLM/blob/017ba7446d20ecc3b9ab8e7b66cc034500868ab4/assets/solve_puzzle.png?raw=true" width="400"/>
 ##### 深度思考 system prompt
 ```python
 thinking_system_prompt = """You are an expert mathematician with extensive experience in mathematical competitions. You approach problems through systematic thinking and rigorous reasoning. When solving problems, follow these thought processes:
 ## Deep Understanding
 Take time to fully comprehend the problem before attempting a solution. Consider:
 - What is the real question being asked?
 - What are the given conditions and what do they tell us?
 - Are there any special restrictions or assumptions?
 - Which information is crucial and which is supplementary?
 ## Multi-angle Analysis
 Before solving, conduct thorough analysis:
 - What mathematical concepts and properties are involved?
 - Can you recall similar classic problems or solution methods?
 - Would diagrams or tables help visualize the problem?
 - Are there special cases that need separate consideration?
 ## Systematic Thinking
 Plan your solution path:
 - Propose multiple possible approaches
 - Analyze the feasibility and merits of each method
 - Choose the most appropriate method and explain why
 - Break complex problems into smaller, manageable steps
 ## Rigorous Proof
 During the solution process:
 - Provide solid justification for each step
 - Include detailed proofs for key conclusions
 - Pay attention to logical connections
 - Be vigilant about potential oversights
 ## Repeated Verification
 After completing your solution:
 - Verify your results satisfy all conditions
 - Check for overlooked special cases
 - Consider if the solution can be optimized or simplified
 - Review your reasoning process
 Remember:
 1. Take time to think thoroughly rather than rushing to an answer
 2. Rigorously prove each key conclusion
 3. Keep an open mind and try different approaches
 4. Summarize valuable problem-solving methods
 5. Maintain healthy skepticism and verify multiple times
 Your response should reflect deep mathematical understanding and precise logical thinking, making your solution path and reasoning clear to others.
 When you're ready, present your complete solution with:
 - Clear problem understanding
 - Detailed solution process
 - Key insights
 - Thorough verification
 Focus on clear, logical progression of ideas and thorough explanation of your mathematical reasoning. Provide answers in the same language as the user asking the question, repeat the final answer using a '\\boxed{}' without any units, you have [[8192]] tokens to complete the answer.
 """
 ```
 ##### Transformers 推理
 ```python
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 model_dir = "internlm/internlm3-8b-instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
 # Set `torch_dtype=torch.bfloat16` to load model in bfloat16, otherwise it will be loaded as float32 and might cause OOM Error.
 model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True, torch_dtype=torch.bfloat16).cuda()
 # (Optional) If on low resource devices, you can load model in 4-bit or 8-bit to further save GPU memory via bitsandbytes.
  # InternLM3 8B in 4bit will cost nearly 8GB GPU memory.
  # pip install -U bitsandbytes
  # 8-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_8bit=True)
  # 4-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_4bit=True)
 model = model.eval()
 messages = [
    {"role": "system", "content": thinking_system_prompt},
    {"role": "user", "content": "已知函数\(f(x)=\mathrm{e}^{x}-ax - a^{3}\)。\n（1）当\(a = 1\)时，求曲线\(y = f(x)\)在点\((1,f(1))\)处的切线方程；\n（2）若\(f(x)\)有极小值，且极小值小于\(0\)，求\(a\)的取值范围。"},
 ]
 tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
 generated_ids = model.generate(tokenized_chat, max_new_tokens=8192)
 generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(tokenized_chat, generated_ids)
 ]
 prompt = tokenizer.batch_decode(tokenized_chat)[0]
 print(prompt)
 response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 print(response)
 ```
 ##### LMDeploy 推理
 LMDeploy 是涵盖了 LLM 任务的全套轻量化、部署和服务解决方案。
 ```bash
 pip install lmdeploy
 ```
 你可以使用以下 python 代码进行本地批量推理:
 ```python
 from lmdeploy import pipeline, GenerationConfig, ChatTemplateConfig
 model_dir = "internlm/internlm3-8b-instruct"
 chat_template_config = ChatTemplateConfig(model_name='internlm3')
 pipe = pipeline(model_dir, chat_template_config=chat_template_config)
 messages = [
        {"role": "system", "content": thinking_system_prompt},
        {"role": "user", "content": "已知函数\(f(x)=\mathrm{e}^{x}-ax - a^{3}\)。\n（1）当\(a = 1\)时，求曲线\(y = f(x)\)在点\((1,f(1))\)处的切线方程；\n（2）若\(f(x)\)有极小值，且极小值小于\(0\)，求\(a\)的取值范围。"},
 ]
 response = pipe(messages, gen_config=GenerationConfig(max_new_tokens=2048))
 print(response)
 ```
 #####  Ollama 推理
 准备工作
 ```bash
 # install ollama
 curl -fsSL https://ollama.com/install.sh | sh
 # fetch 模型
 ollama pull internlm/internlm3-8b-instruct
 # install python库
 pip install ollama
 ```
 推理代码
 ```python
 import ollama
 messages = [
    {
        "role": "system",
        "content": thinking_system_prompt,
    },
    {
        "role": "user",
        "content": "Given the function\(f(x)=\mathrm{e}^{x}-ax - a^{3}\),\n(1) When \(a = 1\), find the equation of the tangent line to the curve \(y = f(x)\) at the point \((1,f(1))\).\n(2) If \(f(x)\) has a local minimum and the minimum value is less than \(0\), determine the range of values for \(a\)."
    },
 ]
 stream = ollama.chat(
    model='internlm/internlm3-8b-instruct',
    messages=messages,
    stream=True,
 )
 for chunk in stream:
  print(chunk['message']['content'], end='', flush=True)
 ```
 #### 
 ##### vLLM 推理
 参考[文档](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) 安装 vllm 最新代码
 ```bash
 pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
 ```
 推理代码
 ```python
 from vllm import LLM, SamplingParams
 llm = LLM(model="internlm/internlm3-8b-instruct")
 sampling_params = SamplingParams(temperature=1, repetition_penalty=1.005, top_k=40, top_p=0.8, max_tokens=8192)
 prompts = [
    {
        "role": "system",
        "content": thinking_system_prompt,
    },
    {
        "role": "user",
        "content": "已知函数\(f(x)=\mathrm{e}^{x}-ax - a^{3}\)。\n（1）当\(a = 1\)时，求曲线\(y = f(x)\)在点\((1,f(1))\)处的切线方程；\n（2）若\(f(x)\)有极小值，且极小值小于\(0\)，求\(a\)的取值范围。"
    },
 ]
 outputs = llm.chat(prompts,
                   sampling_params=sampling_params,
                   use_tqdm=False)
 print(outputs)
 ```
 ## 开源许可证
 本仓库的代码和权重依照 Apache-2.0 协议开源。
 ## 引用
 ```
@misc{cai2024internlm2,
      title={InternLM2 Technical Report},
      author={Zheng Cai and Maosong Cao and Haojiong Chen and Kai Chen and Keyu Chen and Xin Chen and Xun Chen and Zehui Chen and Zhi Chen and Pei Chu and Xiaoyi Dong and Haodong Duan and Qi Fan and Zhaoye Fei and Yang Gao and Jiaye Ge and Chenya Gu and Yuzhe Gu and Tao Gui and Aijia Guo and Qipeng Guo and Conghui He and Yingfan Hu and Ting Huang and Tao Jiang and Penglong Jiao and Zhenjiang Jin and Zhikai Lei and Jiaxing Li and Jingwen Li and Linyang Li and Shuaibin Li and Wei Li and Yining Li and Hongwei Liu and Jiangning Liu and Jiawei Hong and Kaiwen Liu and Kuikun Liu and Xiaoran Liu and Chengqi Lv and Haijun Lv and Kai Lv and Li Ma and Runyuan Ma and Zerun Ma and Wenchang Ning and Linke Ouyang and Jiantao Qiu and Yuan Qu and Fukai Shang and Yunfan Shao and Demin Song and Zifan Song and Zhihao Sui and Peng Sun and Yu Sun and Huanze Tang and Bin Wang and Guoteng Wang and Jiaqi Wang and Jiayu Wang and Rui Wang and Yudong Wang and Ziyi Wang and Xingjian Wei and Qizhen Weng and Fan Wu and Yingtong Xiong and Chao Xu and Ruiliang Xu and Hang Yan and Yirong Yan and Xiaogui Yang and Haochen Ye and Huaiyuan Ying and Jia Yu and Jing Yu and Yuhang Zang and Chuyu Zhang and Li Zhang and Pan Zhang and Peng Zhang and Ruijie Zhang and Shuo Zhang and Songyang Zhang and Wenjian Zhang and Wenwei Zhang and Xingcheng Zhang and Xinyue Zhang and Hui Zhao and Qian Zhao and Xiaomeng Zhao and Fengzhe Zhou and Zaida Zhou and Jingming Zhuo and Yicheng Zou and Xipeng Qiu and Yu Qiao and Dahua Lin},
      year={2024},
      eprint={2403.17297},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
 }
 ```
--- a/config.json
+++ b/config.json
@@ -0,0 +1,37 @@
 {
    "architectures": [
        "InternLM3ForCausalLM"
    ],
    "attention_dropout": 0.0,
    "auto_map": {
        "AutoConfig": "configuration_internlm3.InternLM3Config",
        "AutoModel": "modeling_internlm3.InternLM3Model",
        "AutoModelForCausalLM": "modeling_internlm3.InternLM3ForCausalLM"
    },
    "bias": false,
    "bos_token_id": 1,
    "eos_token_id": 2,
    "head_dim": 128,
    "hidden_act": "silu",
    "hidden_size": 4096,
    "initializer_range": 0.02,
    "intermediate_size": 10240,
    "max_position_embeddings": 32768,
    "model_type": "internlm3",
    "num_attention_heads": 32,
    "num_hidden_layers": 48,
    "num_key_value_heads": 2,
    "pad_token_id": 2,
    "qkv_bias": false,
    "rms_norm_eps": 1e-05,
    "rope_scaling": {
        "factor": 6.0,
        "rope_type": "dynamic"
    },
    "rope_theta": 50000000,
    "tie_word_embeddings": false,
    "torch_dtype": "bfloat16",
    "transformers_version": "4.47.1",
    "use_cache": true,
    "vocab_size": 128512
 }
--- a/configuration.json
+++ b/configuration.json
@@ -0,0 +1 @@
 {"framework": "pytorch", "task": "text-generation", "allow_remote": true}
--- a/configuration_internlm3.py
+++ b/configuration_internlm3.py
@@ -0,0 +1,197 @@
 # coding=utf-8
 # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on transformers/src/transformers/models/llama/configuration_llama.py
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ InternLM3 model configuration"""
 from transformers.configuration_utils import PretrainedConfig
 from transformers.modeling_rope_utils import rope_config_validation
 from transformers.utils import logging
 logger = logging.get_logger(__name__)
 class InternLM3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`InternLM3Model`]. It is used to instantiate
    an InternLM3 model according to the specified arguments, defining the model architecture. The defaults below are
    generic; a released checkpoint overrides them through its own `config.json`.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 128512):
            Vocabulary size of the InternLM3 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`InternLM3Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 32):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If `None` is passed explicitly, it falls back to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly. A legacy `type` key is accepted and mirrored into `rope_type`; the dictionary passed in is
            copied, so the caller's object is never modified.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key and value projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in o_proj, up_proj, down_proj and gate_proj layers.
        head_dim (`int`, *optional*):
            The attention head dimension. If None, it will default to hidden_size // num_attention_heads
    ```python
    >>> from transformers import InternLM3Model, InternLM3Config
    >>> # Initializing a InternLM3 style configuration
    >>> configuration = InternLM3Config()
    >>> # Initializing a model from the InternLM3-8B style configuration
    >>> model = InternLM3Model(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "internlm3"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Default tensor parallel plan for base model `InternLM3`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    def __init__(
        self,
        vocab_size=128512,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        qkv_bias=False,
        attention_dropout=0.0,
        bias=False,
        head_dim=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        if rope_scaling is not None:
            # Work on a shallow copy so the legacy-key normalisation below never
            # writes into a dictionary the caller may be sharing across configs.
            rope_scaling = dict(rope_scaling)
            # BC: if there is a 'type' field, mirror it to 'rope_type'.
            if "type" in rope_scaling:
                rope_scaling["rope_type"] = rope_scaling["type"]
        self.rope_scaling = rope_scaling
        self.qkv_bias = qkv_bias
        self.attention_dropout = attention_dropout
        self.bias = bias
        # An explicit head_dim wins; otherwise derive it from the hidden size.
        self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
        # Validate the correctness of rotary position embeddings parameters
        rope_config_validation(self)
        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
--- a/generation_config.json
+++ b/generation_config.json
@@ -0,0 +1,9 @@
 {
  "bos_token_id": 1,
  "eos_token_id": [
    2,
    128131
  ],
  "pad_token_id": 2,
  "transformers_version": "4.47.1"
 }
--- a/model-00001-of-00002.safetensors
+++ b/model-00001-of-00002.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:9a18eb700cd28f03c0745ac75ab53ead514cfa703fabb82c61a37b85cb593ff4
 size 9928388896
--- a/model-00002-of-00002.safetensors
+++ b/model-00002-of-00002.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:e324110df616ca5190f20469528f13421476f38f0e2597dac4d9ee0bdcede797
 size 7680144544
--- a/model.safetensors.index.json
+++ b/model.safetensors.index.json
@@ -0,0 +1,442 @@
 {
  "metadata": {
    "total_size": 17608482816
  },
  "weight_map": {
    "lm_head.weight": "model-00002-of-00002.safetensors",
    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.36.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.36.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.36.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.36.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.36.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.36.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.36.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.36.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.36.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.37.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.37.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.37.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.37.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.37.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.37.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.37.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.37.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.37.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.38.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.38.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.38.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.38.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.38.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.38.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.38.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.38.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.38.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.39.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.39.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.39.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.39.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.39.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.39.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.39.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.39.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.39.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.40.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.40.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.40.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.40.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.40.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.40.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.40.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.40.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.40.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.41.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.41.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.41.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.41.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.41.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.41.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.41.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.41.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.41.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.42.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.42.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.42.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.42.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.42.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.42.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.42.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.42.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.42.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.43.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.43.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.43.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.43.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.43.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.43.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.43.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.43.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.43.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.44.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.44.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.44.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.44.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.44.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.44.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.44.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.44.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.44.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.45.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.45.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.45.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.45.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.45.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.45.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.45.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.45.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.45.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.46.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.46.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.46.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.46.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.46.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.46.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.46.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.46.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.46.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.47.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.47.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.47.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.47.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.47.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.47.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.47.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.47.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.47.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.norm.weight": "model-00002-of-00002.safetensors"
  }
 }
--- a/modeling_internlm3.py
+++ b/modeling_internlm3.py
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -0,0 +1,54 @@
 {
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|action_start|>",
    "<|action_end|>",
    "<|interpreter|>",
    "<|plugin|>",
    "<restate>",
    "</restate>",
    "<planning>",
    "</planning>",
    "<recollect>",
    "</recollect>",
    "<execution>",
    "</execution>",
    "<review>",
    "</review>",
    "<summarize>",
    "</summarize>",
    "<retry>",
    "</retry>",
    "<conclude>",
    "</conclude>"
  ],
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
 }
--- a/tokenization_internlm3.py
+++ b/tokenization_internlm3.py
@@ -0,0 +1,294 @@
 import os
 from shutil import copyfile
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
 import sentencepiece as spm
 from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
 from transformers.utils import logging
 if TYPE_CHECKING:
    from transformers.tokenization_utils_base import TextInput
 # Module-level logger obtained through the transformers logging utilities.
 logger = logging.get_logger(__name__)
 # Maps the `vocab_file` constructor argument to the file name used when the vocabulary is saved.
 VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
 # Marker character that is checked for (and stripped from) the start of the first token when decoding.
 SPIECE_UNDERLINE = "▁"
 class InternLM3Tokenizer(PreTrainedTokenizer):
    """
    Construct an InternLM3 tokenizer backed by a SentencePiece model loaded from `vocab_file`. The default padding
    token is unset (`pad_token=None`).
    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
            attention mechanisms or loss computation.
        sp_model_kwargs (`Dict[str, Any]`, `Optional`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:
            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.
            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether or not to add an `bos_token` at the start of sequences.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
            extra spaces.
        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
            Whether or not the default system prompt for InternLM3 should be used. The flag is stored on the instance
            and forwarded to the base class; this class does not otherwise read it.
        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to add spaces between special tokens. Forwarded to the base class.
        spaces_for_interleaved_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to add spaces between special tokens that are interleaved with normal tokens when
            converting tokens back to a string.
        add_prefix_space (`bool`, *optional*, defaults to `True`):
            Whether or not the input is treated as having an initial space, so that the leading word is handled just
            as any other word. In this class the flag is consulted by `convert_tokens_to_string`, which strips the
            leading `"▁"` marker from the first token when it is set.
    """
    _auto_class = "AutoTokenizer"
    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        use_default_system_prompt=False,
        spaces_between_special_tokens=False,
        spaces_for_interleaved_special_tokens=False,
        add_prefix_space=True,
        **kwargs,
    ):
        """Load the SentencePiece model from `vocab_file` and initialise the base tokenizer."""
        # Normalise once so every later use (including pickling) sees a dict, never None.
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        # Plain strings are wrapped so the special tokens are never normalised.
        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_default_system_prompt = use_default_system_prompt
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)
        self.add_prefix_space = add_prefix_space
        self.spaces_for_interleaved_special_tokens = spaces_for_interleaved_special_tokens
        # Cache the id -> piece table so `_convert_id_to_token` is a plain dict lookup.
        vocab_size = self.sp_model.get_piece_size()
        self.decoder = {i: self.sp_model.id_to_piece(i) for i in range(vocab_size)}
        # The attributes above are set before calling the base initialiser, as in the original ordering.
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            # Forward the normalised dict (not the raw, possibly-None argument) so the value recorded by the
            # base class agrees with `self.sp_model_kwargs`.
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            use_default_system_prompt=use_default_system_prompt,
            spaces_between_special_tokens=spaces_between_special_tokens,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )
    def __getstate__(self):
        """Return a picklable state: the SentencePiece processor is replaced by its serialized proto."""
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state
    def __setstate__(self, d):
        """Restore the instance and rebuild the SentencePiece processor from the serialized proto."""
        self.__dict__.update(d)
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
    @property
    def vocab_size(self):
        """Returns the number of pieces in the SentencePiece model (added tokens are not counted here)."""
        return self.sp_model.get_piece_size()
    def get_vocab(self):
        """Returns the vocab as a dict mapping token to id, including the added tokens."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab
    def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
        """
        Args:
            text: TextInput
        Simply calls PreTrainedTokenizer's method
        """
        return super().tokenize(text, **kwargs)
    def _tokenize(self, text, **kwargs):
        """
        Args:
            text: TextInput
        Returns the list of string pieces produced by the SentencePiece model for `text`. Extra keyword arguments
        are accepted but ignored.
        """
        return self.sp_model.encode(text, out_type=str)
    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)
    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab. Unknown indices map to `""`."""
        return self.decoder.get(index, "")
    def convert_tokens_to_string(self, tokens):
        """
        Converts a sequence of tokens (string) in a single string.
        Args:
            tokens (`List[str]`):
                Tokens to join. The list is not modified.
        Returns:
            `str`: The decoded text; an empty string when `tokens` is empty.
        """
        # Nothing to decode: avoid indexing into an empty sequence below.
        if not tokens:
            return ""
        # Work on a copy so stripping the prefix marker does not alter the caller's list.
        tokens = list(tokens)
        # since we manually add the prefix space, we have to remove it when decoding
        if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space:
            tokens[0] = tokens[0][1:]
        # `all_special_tokens` does not change during the loop, so look it up once.
        special_tokens = set(self.all_special_tokens)
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using sentencepiece model
            if token in special_tokens:
                if not prev_is_special and i != 0 and self.spaces_for_interleaved_special_tokens:
                    out_string += " "
                # Flush the pending ordinary pieces, then emit the special token verbatim.
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                if (
                    prev_is_special
                    and i == 1
                    and self.add_prefix_space
                    and not token.startswith(SPIECE_UNDERLINE)
                    and self.spaces_for_interleaved_special_tokens
                ):
                    out_string += " "
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string
    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Optional[Tuple[str]]:
        """
        Save the vocabulary and special tokens file to a directory.
        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                If given, the saved file is named `<filename_prefix>-tokenizer.model`.
        Returns:
            `Tuple(str)`: Paths to the files saved, or `None` (after logging an error) when `save_directory` is not
            an existing directory.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"])
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # The original file is gone: write the model out from the in-memory processor instead.
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)
        return (out_vocab_file,)
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from one sequence or a pair of sequences by adding the BOS/EOS ids.
        Each sequence is wrapped independently: a BOS id is prepended when `add_bos_token` is set and an EOS id is
        appended when `add_eos_token` is set.
        Args:
            token_ids_0 (`List[int]`):
                List of IDs of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
        Returns:
            `List[int]`: The input IDs with the configured special tokens added.
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
        output = bos_token_id + token_ids_0 + eos_token_id
        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id
        return output
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.
        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.
        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True)
        # Mirror the layout produced by `build_inputs_with_special_tokens`.
        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []
        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + bos_token_id + ([0] * len(token_ids_1)) + eos_token_id
    def create_token_type_ids_from_sequences(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. The
        sequence pair mask has the following format:
        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```
        Each segment's length includes the BOS/EOS ids that `add_bos_token` / `add_eos_token` add around it.
        if token_ids_1 is None, only returns the first portion of the mask (0s).
        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
        return output
--- a/tokenizer.model
+++ b/tokenizer.model
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:bcacff3229854f5103ee7a85473a30ca9a8b3a68f3aae9b7479574b23ac2256b
 size 2475075
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -0,0 +1,249 @@
 {
  "add_bos_token": true,
  "add_eos_token": false,
  "add_prefix_space": true,
  "added_tokens_decoder": {
    "0": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128111": {
      "content": "<restate>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128112": {
      "content": "</restate>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128113": {
      "content": "<planning>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128114": {
      "content": "</planning>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128115": {
      "content": "<recollect>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128116": {
      "content": "</recollect>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128117": {
      "content": "<execution>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128118": {
      "content": "</execution>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128119": {
      "content": "<review>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128120": {
      "content": "</review>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128121": {
      "content": "<summarize>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128122": {
      "content": "</summarize>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128123": {
      "content": "<retry>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128124": {
      "content": "</retry>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128125": {
      "content": "<conclude>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128126": {
      "content": "</conclude>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128127": {
      "content": "<|plugin|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128128": {
      "content": "<|interpreter|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128129": {
      "content": "<|action_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128130": {
      "content": "<|action_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128131": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "128132": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|action_start|>",
    "<|action_end|>",
    "<|interpreter|>",
    "<|plugin|>",
    "<restate>",
    "</restate>",
    "<planning>",
    "</planning>",
    "<recollect>",
    "</recollect>",
    "<execution>",
    "</execution>",
    "<review>",
    "</review>",
    "<summarize>",
    "</summarize>",
    "<retry>",
    "</retry>",
    "<conclude>",
    "</conclude>"
  ],
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_internlm3.InternLM3Tokenizer",
      null
    ]
  },
  "bos_token": "<s>",
  "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
  "extra_special_tokens": {},
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "</s>",
  "sp_model_kwargs": {},
  "spaces_between_special_tokens": false,
  "tokenizer_class": "InternLM3Tokenizer",
  "unk_token": "<unk>",
  "use_default_system_prompt": false
 }
		`@@ -0,0 +1 @@`
							`{"framework": "pytorch", "task": "text-generation", "allow_remote": true}`