初始化项目,由ModelHub XC社区提供模型
Model: LLM-Research/Phi-4-mini-reasoning Source: Original Platform
This commit is contained in:
49
.gitattributes
vendored
Normal file
49
.gitattributes
vendored
Normal file
@@ -0,0 +1,49 @@
|
||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.model filter=lfs diff=lfs merge=lfs -text
|
||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
||||
*.tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
*.db* filter=lfs diff=lfs merge=lfs -text
|
||||
*.ark* filter=lfs diff=lfs merge=lfs -text
|
||||
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
|
||||
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
|
||||
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
|
||||
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||
*.gguf* filter=lfs diff=lfs merge=lfs -text
|
||||
*.ggml filter=lfs diff=lfs merge=lfs -text
|
||||
*.llamafile* filter=lfs diff=lfs merge=lfs -text
|
||||
*.pt2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
|
||||
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||
9
CODE_OF_CONDUCT.md
Normal file
9
CODE_OF_CONDUCT.md
Normal file
@@ -0,0 +1,9 @@
|
||||
# Microsoft Open Source Code of Conduct
|
||||
|
||||
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
|
||||
|
||||
Resources:
|
||||
|
||||
- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
|
||||
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
|
||||
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
|
||||
22
LICENSE
Normal file
22
LICENSE
Normal file
@@ -0,0 +1,22 @@
|
||||
Microsoft.
|
||||
Copyright (c) Microsoft Corporation.
|
||||
|
||||
MIT License
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
38
NOTICE.md
Normal file
38
NOTICE.md
Normal file
@@ -0,0 +1,38 @@
|
||||
NOTICES AND INFORMATION
|
||||
Do Not Translate or Localize
|
||||
|
||||
This software incorporates material from third parties.
|
||||
|
||||
**Component.** https://github.com/Dao-AILab/flash-attention
|
||||
|
||||
**Open Source License/Copyright Notice.**
|
||||
|
||||
BSD 3-Clause License
|
||||
|
||||
Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
BIN
Phi-4-Mini-Reasoning.pdf
Normal file
BIN
Phi-4-Mini-Reasoning.pdf
Normal file
Binary file not shown.
234
README.md
Normal file
234
README.md
Normal file
@@ -0,0 +1,234 @@
|
||||
---
|
||||
language:
|
||||
- en
|
||||
library_name: transformers
|
||||
license: mit
|
||||
license_link: https://huggingface.co/microsoft/Phi-4-mini-reasoning/resolve/main/LICENSE
|
||||
pipeline_tag: text-generation
|
||||
tags:
|
||||
- nlp
|
||||
- math
|
||||
- code
|
||||
widget:
|
||||
- messages:
|
||||
- role: user
|
||||
content: How to solve 3*x^2+4*x+5=1?
|
||||
---
|
||||
|
||||
## Model Summary
|
||||
|
||||
Phi-4-mini-reasoning is a lightweight open model built upon synthetic data with a focus on high-quality, reasoning dense data further finetuned for more advanced math reasoning capabilities.
|
||||
The model belongs to the Phi-4 model family and supports 128K token context length.
|
||||
|
||||
📰 [Phi-4-mini-reasoning Blog](https://aka.ms/phi4-mini-reasoning/blog), and [Developer Article](https://techcommunity.microsoft.com/blog/azuredevcommunityblog/make-phi-4-mini-reasoning-more-powerful-with-industry-reasoning-on-edge-devices/4409764)<br>
|
||||
📖 [Phi-4-mini-reasoning Technical Report](https://aka.ms/phi4-mini-reasoning/techreport) | [HF paper](https://huggingface.co/papers/2504.21233) <br>
|
||||
👩‍🍳 [Phi Cookbook](https://github.com/microsoft/PhiCookBook) <br>
|
||||
🏡 [Phi Portal](https://azure.microsoft.com/en-us/products/phi) <br>
|
||||
🖥️ Try It [Azure](https://aka.ms/phi4-mini-reasoning/azure) <br>
|
||||
|
||||
|
||||
🎉**Phi-4 models**: [[Phi-4-reasoning](https://huggingface.co/microsoft/Phi-4-reasoning)] | [[multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) | [onnx](https://huggingface.co/microsoft/Phi-4-multimodal-instruct-onnx)];
|
||||
[[mini-instruct](https://huggingface.co/microsoft/Phi-4-mini-instruct) | [onnx](https://huggingface.co/microsoft/Phi-4-mini-instruct-onnx)]
|
||||
|
||||
## Intended Uses
|
||||
|
||||
### Primary Use Cases
|
||||
|
||||
Phi-4-mini-reasoning is designed for multi-step, logic-intensive mathematical problem-solving tasks under memory/compute constrained environments and latency bound scenarios.
|
||||
Some of the use cases include formal proof generation, symbolic computation, advanced word problems, and a wide range of mathematical reasoning scenarios.
|
||||
These models excel at maintaining context across steps, applying structured logic, and delivering accurate, reliable solutions in domains that require deep analytical thinking.
|
||||
|
||||
### Use Case Considerations
|
||||
|
||||
This model is designed and tested for math reasoning only. It is not specifically designed or evaluated for all downstream purposes.
|
||||
Developers should consider common limitations of language models, as well as performance differences across languages, as they select use cases, and evaluate and mitigate for accuracy, safety, and fairness before using within a specific downstream use case, particularly for high-risk scenarios.
|
||||
Developers should be aware of and adhere to applicable laws or regulations (including but not limited to privacy, trade compliance laws, etc.) that are relevant to their use case.
|
||||
|
||||
***Nothing contained in this Model Card should be interpreted as or deemed a restriction or modification to the license the model is released under.***
|
||||
|
||||
## Release Notes
|
||||
|
||||
This release of Phi-4-mini-reasoning addresses user feedback and market demand for a compact reasoning model.
|
||||
It is a compact transformer-based language model optimized for mathematical reasoning, built to deliver high-quality, step-by-step problem solving in environments where computing or latency is constrained.
|
||||
The model is fine-tuned with synthetic math data from a more capable model (much larger, smarter, more accurate, and better at following instructions), which has resulted in enhanced reasoning performance.
|
||||
Phi-4-mini-reasoning balances reasoning ability with efficiency, making it potentially suitable for educational applications, embedded tutoring, and lightweight deployment on edge or mobile systems.
|
||||
If a critical issue is identified with Phi-4-mini-reasoning, it should be promptly reported through the MSRC Researcher Portal or secure@microsoft.com.
|
||||
|
||||
### Model Quality
|
||||
|
||||
To understand the capabilities, the 3.8B parameters Phi-4-mini-reasoning model was compared with a set of models over a variety of reasoning benchmarks.
|
||||
A high-level overview of the model quality is as follows:
|
||||
|
||||
| Model | AIME | MATH-500 | GPQA Diamond |
|
||||
|------------------------------------|-------|----------|--------------|
|
||||
| o1-mini* | 63.6 | 90.0 | 60.0 |
|
||||
| DeepSeek-R1-Distill-Qwen-7B | 53.3 | 91.4 | 49.5 |
|
||||
| DeepSeek-R1-Distill-Llama-8B | 43.3 | 86.9 | 47.3 |
|
||||
| Bespoke-Stratos-7B* | 20.0 | 82.0 | 37.8 |
|
||||
| OpenThinker-7B* | 31.3 | 83.0 | 42.4 |
|
||||
| Llama-3.2-3B-Instruct | 6.7 | 44.4 | 25.3 |
|
||||
| Phi-4-Mini (base model, 3.8B) | 10.0 | 71.8 | 36.9 |
|
||||
|**Phi-4-mini-reasoning (3.8B)** | **57.5** | **94.6** | **52.0** |
|
||||
|
||||
Overall, with only 3.8B parameters, the model achieves a level of reasoning ability similar to that of much larger models.
|
||||
However, it is still fundamentally limited by its size for certain tasks. The model simply does not have the capacity to store too much factual knowledge, therefore, users may experience factual incorrectness. However, it may be possible to resolve such weakness by augmenting Phi-4 with a search engine, particularly when using the model under RAG settings.
|
||||
|
||||
## Usage
|
||||
|
||||
### Tokenizer
|
||||
|
||||
Phi-4-mini-reasoning supports a vocabulary size of up to `200064` tokens. The [tokenizer files](https://huggingface.co/microsoft/Phi-4-mini-reasoning/blob/main/added_tokens.json) already provide placeholder tokens that can be used for downstream fine-tuning, but they can also be extended up to the model's vocabulary size.
|
||||
|
||||
### Input Formats
|
||||
|
||||
Given the nature of the training data, the Phi-4-mini-reasoning
|
||||
model is best suited for prompts using specific formats.
|
||||
Below is the primary format:
|
||||
|
||||
#### Chat format
|
||||
|
||||
This format is used for general conversation and instructions:
|
||||
|
||||
```yaml
|
||||
<|system|>Your name is Phi, an AI math expert developed by Microsoft.<|end|><|user|>How to solve 3*x^2+4*x+5=1?<|end|><|assistant|>
|
||||
```
|
||||
### Inference with transformers
|
||||
|
||||
Phi-4-mini-reasoning has been integrated in the `4.51.3` version of `transformers`. The current `transformers` version can be verified with: `pip list | grep transformers`.
|
||||
Python 3.8 and 3.10 will work best.
|
||||
List of required packages:
|
||||
|
||||
```
|
||||
flash_attn==2.7.4.post1
|
||||
torch==2.5.1
|
||||
transformers==4.51.3
|
||||
accelerate==1.3.0
|
||||
```
|
||||
|
||||
Phi-4-mini-reasoning is also available in [Azure AI Studio](https://aka.ms/phi-4-mini-reasoning/azure)
|
||||
|
||||
#### Example
|
||||
|
||||
After obtaining the Phi-4-mini-reasoning model checkpoints, users can use this sample code for inference.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
||||
torch.random.manual_seed(0)
|
||||
|
||||
model_id = "microsoft/Phi-4-mini-reasoning"
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_id,
|
||||
device_map="cuda",
|
||||
torch_dtype="auto",
|
||||
trust_remote_code=True,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
|
||||
messages = [{
|
||||
"role": "user",
|
||||
"content": "How to solve 3*x^2+4*x+5=1?"
|
||||
}]
|
||||
inputs = tokenizer.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs.to(model.device),
|
||||
max_new_tokens=32768,
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
do_sample=True,
|
||||
)
|
||||
outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])
|
||||
|
||||
print(outputs[0])
|
||||
```
|
||||
|
||||
## Training
|
||||
|
||||
### Model
|
||||
|
||||
+ **Architecture:** Phi-4-mini-reasoning shares the same architecture as Phi-4-Mini, which has 3.8B parameters and is a dense decoder-only Transformer model. When compared with Phi-3.5-Mini, the major changes with Phi-4-Mini are 200K vocabulary, grouped-query attention, and shared input and output embedding.<br>
|
||||
+ **Inputs:** Text. It is best suited for prompts using the chat format.<br>
|
||||
+ **Context length:** 128K tokens<br>
|
||||
+ **GPUs:** 128 H100-80G<br>
|
||||
+ **Training time:** 2 days<br>
|
||||
+ **Training data:** 150B tokens<br>
|
||||
+ **Outputs:** Generated text<br>
|
||||
+ **Dates:** Trained in February 2025<br>
|
||||
+ **Status:** This is a static model trained on offline datasets with the cutoff date of February 2025 for publicly available data.<br>
|
||||
+ **Supported languages:** English<br>
|
||||
+ **Release date:** April 2025<br>
|
||||
|
||||
### Training Datasets
|
||||
|
||||
The training data for Phi-4-mini-reasoning consists exclusively of synthetic mathematical content generated by a stronger and more advanced reasoning model, Deepseek-R1.
|
||||
The objective is to distill knowledge from this model. This synthetic dataset comprises over one million diverse math problems spanning multiple levels of difficulty (from middle school to Ph.D. level).
|
||||
For each problem in the synthetic dataset, eight distinct solutions (rollouts) were sampled, and only those verified as correct were retained, resulting in approximately 30 billion tokens of math content.
|
||||
The dataset integrates three primary components:
|
||||
1) a curated selection of high-quality, publicly available math questions and a part of the SFT(Supervised Fine-Tuning) data that was used to train the base Phi-4-Mini model;
|
||||
2) an extensive collection of synthetic math data generated by the Deepseek-R1 model, designed specifically for high-quality supervised fine-tuning and model distillation; and
|
||||
3) a balanced set of correct and incorrect answers used to construct preference data aimed at enhancing Phi-4-mini-reasoning's reasoning capabilities by learning more effective reasoning trajectories.
|
||||
|
||||
## Software
|
||||
* [PyTorch](https://github.com/pytorch/pytorch)
|
||||
* [Transformers](https://github.com/huggingface/transformers)
|
||||
* [Flash-Attention](https://github.com/Dao-AILab/flash-attention)
|
||||
|
||||
## Hardware
|
||||
Note that by default, the Phi-4-mini-reasoning model uses flash attention, which requires certain types of GPU hardware to run. We have tested on the following GPU types:
|
||||
* NVIDIA A100
|
||||
* NVIDIA H100
|
||||
|
||||
If you want to run the model on:
|
||||
* NVIDIA V100 or earlier generation GPUs: call AutoModelForCausalLM.from_pretrained() with attn_implementation="eager"
|
||||
|
||||
## Safety Evaluation and Red-Teaming
|
||||
|
||||
The Phi-4 family of models has adopted a robust safety post-training approach. This approach leverages a variety of both open-source and in-house generated datasets. The overall technique employed to do the safety alignment is a combination of SFT, DPO (Direct Preference Optimization), and RLHF (Reinforcement Learning from Human Feedback) approaches by utilizing human-labeled and synthetic English-language datasets, including publicly available datasets focusing on helpfulness and harmlessness, as well as various questions and answers targeted to multiple safety categories.
|
||||
|
||||
Phi-4-Mini-Reasoning was developed in accordance with Microsoft's responsible AI principles. Potential safety risks in the model’s responses were assessed using the Azure AI Foundry’s Risk and Safety Evaluation framework, focusing on harmful content, direct jailbreak, and model groundedness. The Phi-4-Mini-Reasoning Model Card contains additional information about our approach to safety and responsible AI considerations that developers should be aware of when using this model.
|
||||
|
||||
## Responsible AI Considerations
|
||||
|
||||
Like other language models, the Phi family of models can potentially behave in ways that are unfair, unreliable, or offensive. Some of the limiting behaviors to be aware of include:
|
||||
|
||||
+ Quality of Service: The Phi models are trained primarily on English text and some additional multilingual text. Languages other than English will experience worse performance as well as performance disparities across non-English languages. English language varieties with less representation in the training data might experience worse performance than standard American English.
|
||||
+ Multilingual performance and safety gaps: We believe it is important to make language models more widely available across different languages, but the Phi 4 models still exhibit challenges common across multilingual releases. As with any deployment of LLMs, developers will be better positioned to test for performance or safety gaps for their linguistic and cultural context and customize the model with additional fine-tuning and appropriate safeguards.
|
||||
+ Representation of Harms & Perpetuation of Stereotypes: These models can over- or under-represent groups of people, erase representation of some groups, or reinforce demeaning or negative stereotypes. Despite safety post-training, these limitations may still be present due to differing levels of representation of different groups, cultural contexts, or prevalence of examples of negative stereotypes in training data that reflect real-world patterns and societal biases.
|
||||
+ Inappropriate or Offensive Content: These models may produce other types of inappropriate or offensive content, which may make it inappropriate to deploy for sensitive contexts without additional mitigations that are specific to the case.
|
||||
+ Information Reliability: Language models can generate nonsensical content or fabricate content that might sound reasonable but is inaccurate or outdated.
|
||||
+ Election Information Reliability: The model has an elevated defect rate when responding to election-critical queries, which may result in incorrect or unauthoritative election-critical information being presented. We are working to improve the model's performance in this area. Users should verify information related to elections with the election authority in their region.
|
||||
+ Limited Scope for Code: The majority of Phi 4 training data is based in Python and uses common packages such as "typing, math, random, collections, datetime, itertools". If the model generates Python scripts that utilize other packages or scripts in other languages, it is strongly recommended that users manually verify all API uses.
|
||||
+ Long Conversation: Phi 4 models, like other models, can in some cases generate responses that are repetitive, unhelpful, or inconsistent in very long chat sessions in both English and non-English languages. Developers are encouraged to place appropriate mitigations, like limiting conversation turns to account for the possible conversational drift.
|
||||
|
||||
Developers should apply responsible AI best practices, including mapping, measuring, and mitigating risks associated with their specific use case and cultural, linguistic context. Phi 4 family of models are general purpose models. As developers plan to deploy these models for specific use cases, they are encouraged to fine-tune the models for their use case and leverage the models as part of broader AI systems with language-specific safeguards in place. Important areas for consideration include:
|
||||
|
||||
+ Allocation: Models may not be suitable for scenarios that could have consequential impact on legal status or the allocation of resources or life opportunities (ex: housing, employment, credit, etc.) without further assessments and additional debiasing techniques.
|
||||
+ High-Risk Scenarios: Developers should assess the suitability of using models in high-risk scenarios where unfair, unreliable or offensive outputs might be extremely costly or lead to harm. This includes providing advice in sensitive or expert domains where accuracy and reliability are critical (ex: legal or health advice). Additional safeguards should be implemented at the application level according to the deployment context.
|
||||
+ Misinformation: Models may produce inaccurate information. Developers should follow transparency best practices and inform end-users they are interacting with an AI system. At the application level, developers can build feedback mechanisms and pipelines to ground responses in use-case specific, contextual information, a technique known as Retrieval Augmented Generation (RAG).
|
||||
+ Generation of Harmful Content: Developers should assess outputs for their context and use available safety classifiers or custom solutions appropriate for their use case.
|
||||
+ Misuse: Other forms of misuse such as fraud, spam, or malware production may be possible, and developers should ensure that their applications do not violate applicable laws and regulations.
|
||||
|
||||
## License
|
||||
The model is licensed under the [MIT license](./LICENSE).
|
||||
|
||||
## Trademarks
|
||||
This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft’s Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party’s policies.
|
||||
|
||||
|
||||
## Appendix A: Benchmark Methodology
|
||||
|
||||
We include a brief word on methodology here - and in particular, how we think about optimizing prompts. In an ideal world, we would never change any prompts in our benchmarks to ensure it is always an apples-to-apples comparison when comparing different models. Indeed, this is our default approach, and is the case in the vast majority of models we have run to date. For all benchmarks, we use the same generation configuration, such as the max sequence length (32768) and the same temperature, for a fair comparison.
|
||||
### Benchmark datasets
|
||||
We evaluate the model with three of the most popular math benchmarks where the strongest reasoning models are competing together. Specifically:
|
||||
- Math-500: This benchmark consists of 500 challenging math problems designed to test the model's ability to perform complex mathematical reasoning and problem-solving.
|
||||
- AIME 2024: The American Invitational Mathematics Examination (AIME) is a highly regarded math competition that features a series of difficult problems aimed at assessing advanced mathematical skills and logical reasoning.
|
||||
- GPQA Diamond: The Graduate-Level Google-Proof Q&A (GPQA) Diamond benchmark focuses on evaluating the model's ability to understand and solve a wide range of mathematical questions, including both straightforward calculations and more intricate problem-solving tasks.
|
||||
|
||||
## Data Summary
|
||||
https://huggingface.co/microsoft/Phi-4-mini-reasoning/blob/main/data_summary_card.md
|
||||
41
SECURITY.md
Normal file
41
SECURITY.md
Normal file
@@ -0,0 +1,41 @@
|
||||
<!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
|
||||
|
||||
## Security
|
||||
|
||||
Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
|
||||
|
||||
If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
|
||||
|
||||
## Reporting Security Issues
|
||||
|
||||
**Please do not report security vulnerabilities through public GitHub issues.**
|
||||
|
||||
Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
|
||||
|
||||
If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
|
||||
|
||||
You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
|
||||
|
||||
Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
|
||||
|
||||
* Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
|
||||
* Full paths of source file(s) related to the manifestation of the issue
|
||||
* The location of the affected source code (tag/branch/commit or direct URL)
|
||||
* Any special configuration required to reproduce the issue
|
||||
* Step-by-step instructions to reproduce the issue
|
||||
* Proof-of-concept or exploit code (if possible)
|
||||
* Impact of the issue, including how an attacker might exploit the issue
|
||||
|
||||
This information will help us triage your report more quickly.
|
||||
|
||||
If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
|
||||
|
||||
## Preferred Languages
|
||||
|
||||
We prefer all communications to be in English.
|
||||
|
||||
## Policy
|
||||
|
||||
Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
|
||||
|
||||
<!-- END MICROSOFT SECURITY.MD BLOCK -->
|
||||
25
SUPPORT.md
Normal file
25
SUPPORT.md
Normal file
@@ -0,0 +1,25 @@
|
||||
# TODO: The maintainer of this repo has not yet edited this file
|
||||
|
||||
**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
|
||||
|
||||
- **No CSS support:** Fill out this template with information about how to file issues and get help.
|
||||
- **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
|
||||
- **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
|
||||
|
||||
*Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
|
||||
|
||||
# Support
|
||||
|
||||
## How to file issues and get help
|
||||
|
||||
This project uses GitHub Issues to track bugs and feature requests. Please search the existing
|
||||
issues before filing new issues to avoid duplicates. For new issues, file your bug or
|
||||
feature request as a new Issue.
|
||||
|
||||
For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
|
||||
FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
|
||||
CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
|
||||
|
||||
## Microsoft Support Policy
|
||||
|
||||
Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
|
||||
12
added_tokens.json
Normal file
12
added_tokens.json
Normal file
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"<|/tool_call|>": 200026,
|
||||
"<|/tool|>": 200024,
|
||||
"<|assistant|>": 200019,
|
||||
"<|end|>": 200020,
|
||||
"<|system|>": 200022,
|
||||
"<|tag|>": 200028,
|
||||
"<|tool_call|>": 200025,
|
||||
"<|tool_response|>": 200027,
|
||||
"<|tool|>": 200023,
|
||||
"<|user|>": 200021
|
||||
}
|
||||
138
config.json
Normal file
138
config.json
Normal file
@@ -0,0 +1,138 @@
|
||||
{
|
||||
"architectures": [
|
||||
"Phi3ForCausalLM"
|
||||
],
|
||||
"attention_bias": false,
|
||||
"attention_dropout": 0.0,
|
||||
"bos_token_id": 199999,
|
||||
"embd_pdrop": 0.0,
|
||||
"eos_token_id": 199999,
|
||||
"full_attn_mod": 1,
|
||||
"hidden_act": "silu",
|
||||
"hidden_size": 3072,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 8192,
|
||||
"interpolate_factor": 1,
|
||||
"lm_head_bias": false,
|
||||
"max_position_embeddings": 131072,
|
||||
"mlp_bias": false,
|
||||
"model_type": "phi3",
|
||||
"num_attention_heads": 24,
|
||||
"num_hidden_layers": 32,
|
||||
"num_key_value_heads": 8,
|
||||
"original_max_position_embeddings": 4096,
|
||||
"pad_token_id": 199999,
|
||||
"partial_rotary_factor": 0.75,
|
||||
"resid_pdrop": 0.0,
|
||||
"rms_norm_eps": 1e-05,
|
||||
"rope_scaling": {
|
||||
"long_factor": [
|
||||
1,
|
||||
1.118320672,
|
||||
1.250641126,
|
||||
1.398617824,
|
||||
1.564103225,
|
||||
1.74916897,
|
||||
1.956131817,
|
||||
2.187582649,
|
||||
2.446418898,
|
||||
2.735880826,
|
||||
3.059592084,
|
||||
3.421605075,
|
||||
3.826451687,
|
||||
4.279200023,
|
||||
4.785517845,
|
||||
5.351743533,
|
||||
5.984965424,
|
||||
6.693110555,
|
||||
7.485043894,
|
||||
8.370679318,
|
||||
9.36110372,
|
||||
10.4687158,
|
||||
11.70738129,
|
||||
13.09260651,
|
||||
14.64173252,
|
||||
16.37415215,
|
||||
18.31155283,
|
||||
20.47818807,
|
||||
22.90118105,
|
||||
25.61086418,
|
||||
28.64115884,
|
||||
32.03,
|
||||
32.1,
|
||||
32.13,
|
||||
32.23,
|
||||
32.6,
|
||||
32.61,
|
||||
32.64,
|
||||
32.66,
|
||||
32.7,
|
||||
32.71,
|
||||
32.93,
|
||||
32.97,
|
||||
33.28,
|
||||
33.49,
|
||||
33.5,
|
||||
44.16,
|
||||
47.77
|
||||
],
|
||||
"short_factor": [
|
||||
1,
|
||||
1.118320672,
|
||||
1.250641126,
|
||||
1.398617824,
|
||||
1.564103225,
|
||||
1.74916897,
|
||||
1.956131817,
|
||||
2.187582649,
|
||||
2.446418898,
|
||||
2.735880826,
|
||||
3.059592084,
|
||||
3.421605075,
|
||||
3.826451687,
|
||||
4.279200023,
|
||||
4.785517845,
|
||||
5.351743533,
|
||||
5.984965424,
|
||||
6.693110555,
|
||||
7.485043894,
|
||||
8.370679318,
|
||||
9.36110372,
|
||||
10.4687158,
|
||||
11.70738129,
|
||||
13.09260651,
|
||||
14.64173252,
|
||||
16.37415215,
|
||||
18.31155283,
|
||||
20.47818807,
|
||||
22.90118105,
|
||||
25.61086418,
|
||||
28.64115884,
|
||||
32.03,
|
||||
32.1,
|
||||
32.13,
|
||||
32.23,
|
||||
32.6,
|
||||
32.61,
|
||||
32.64,
|
||||
32.66,
|
||||
32.7,
|
||||
32.71,
|
||||
32.93,
|
||||
32.97,
|
||||
33.28,
|
||||
33.49,
|
||||
33.5,
|
||||
44.16,
|
||||
47.77
|
||||
],
|
||||
"type": "longrope"
|
||||
},
|
||||
"rope_theta": 10000.0,
|
||||
"sliding_window": 262144,
|
||||
"tie_word_embeddings": true,
|
||||
"torch_dtype": "bfloat16",
|
||||
"transformers_version": "4.50.0",
|
||||
"use_cache": true,
|
||||
"vocab_size": 200064
|
||||
}
|
||||
1
configuration.json
Normal file
1
configuration.json
Normal file
@@ -0,0 +1 @@
|
||||
{"framework": "pytorch", "task": "text-generation", "allow_remote": true}
|
||||
149
data_summary_card.md
Normal file
149
data_summary_card.md
Normal file
@@ -0,0 +1,149 @@
|
||||
|
||||
|
||||
|
||||
|
||||
# Data Summary for Phi-4-mini-reasoning, Phi-4-mini-instruct, Phi-4-mini-flash-reasoning
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## 1. General information
|
||||
|
||||
**1.0.1 Version of the Summary:** 1.0
|
||||
|
||||
|
||||
|
||||
**1.0.2 Last update:** 10-Dec-2025
|
||||
|
||||
|
||||
|
||||
## 1.1 Model Developer Identification
|
||||
|
||||
**1.1.1 Model Developer name and contact details:** Microsoft Corporation at One Microsoft Way, Redmond, WA 98052. Tel: 425-882-8080
|
||||
|
||||
|
||||
|
||||
## 1.2 Model Identification
|
||||
|
||||
**1.2.1 Versioned model name(s):** Phi-4-mini-reasoning, Phi-4-mini-instruct, Phi-4-mini-flash-reasoning
|
||||
|
||||
|
||||
|
||||
**1.2.2 Model release date:** 29-Apr-2025
|
||||
|
||||
|
||||
|
||||
## 1.3 Overall training data size and characteristics
|
||||
|
||||
### 1.3.1 Size of dataset and characteristics
|
||||
|
||||
**1.3.1.A Text training data size:** 1 billion to 10 trillion tokens
|
||||
|
||||
|
||||
|
||||
**1.3.1.B Text training data content:** The training data for Phi-4-mini-reasoning consists exclusively of synthetic mathematical content generated by a stronger and more advanced reasoning model, DeepSeek-R1. The objective is to distill knowledge from this model. This synthetic dataset comprises over one million diverse math problems spanning multiple levels of difficulty (from middle school to Ph.D. level). For each problem in the synthetic dataset, eight distinct solutions (rollouts) were sampled, and only those verified as correct were retained.
|
||||
|
||||
|
||||
|
||||
**1.3.1.C Image training data size:** Not applicable. Images are not part of the training data
|
||||
|
||||
|
||||
|
||||
**1.3.1.D Image training data content:** Not applicable
|
||||
|
||||
|
||||
|
||||
**1.3.1.E Audio training data size:** Not applicable. Audio is not part of the training data
|
||||
|
||||
|
||||
|
||||
**1.3.1.F Audio training data content:** Not applicable
|
||||
|
||||
|
||||
|
||||
**1.3.1.G Video training data size:** Not applicable. Videos are not part of the training data
|
||||
|
||||
|
||||
|
||||
**1.3.1.H Video training data content:** Not applicable
|
||||
|
||||
|
||||
|
||||
**1.3.1.I Other training data size:** Not applicable
|
||||
|
||||
|
||||
|
||||
**1.3.1.J Other training data content:** Not applicable
|
||||
|
||||
|
||||
|
||||
**1.3.2 Latest date of data acquisition/collection for model training:** February 2025
|
||||
|
||||
|
||||
|
||||
**1.3.3 Is data collection ongoing to update the model with new data collection after deployment?** No
|
||||
|
||||
|
||||
|
||||
**1.3.4 Date the training dataset was first used to train the model:** February 2025
|
||||
|
||||
|
||||
|
||||
**1.3.5 Rationale or purpose of data selection:** Datasets consist of synthetic mathematical problems and verified solutions generated by a stronger reasoning model to distill high-quality reasoning patterns and improve math problem-solving performance across difficulty levels
|
||||
|
||||
|
||||
|
||||
## 2. List of data sources
|
||||
|
||||
## 2.1 Publicly available datasets
|
||||
|
||||
**2.1.1 Have you used publicly available datasets to train the model?** Yes
|
||||
|
||||
|
||||
|
||||
## 2.2 Private non-publicly available datasets obtained from third parties
|
||||
|
||||
### 2.2.1 Datasets commercially licensed by rights holders or their representatives
|
||||
|
||||
**2.2.1.A Have you concluded transactional commercial licensing agreement(s) with rights holder(s) or with their representatives?** Not applicable
|
||||
|
||||
|
||||
|
||||
### 2.2.2 Private datasets obtained from other third-parties
|
||||
|
||||
**2.2.2.A Have you obtained private datasets from third parties that are not licensed as described in Section 2.2.1, such as data obtained from providers of private databases, or data intermediaries?** No
|
||||
|
||||
|
||||
|
||||
## 2.3 Personal Information
|
||||
|
||||
**2.3.1 Was personal data used to train the model?** Microsoft follows all relevant laws and regulations pertaining to personal information
|
||||
|
||||
|
||||
|
||||
## 2.4 Synthetic data
|
||||
|
||||
**2.4.1 Was any synthetic AI-generated data used to train the model?** Yes
|
||||
|
||||
|
||||
|
||||
## 3. Data processing aspects
|
||||
|
||||
## 3.1 Respect of reservation of rights from text and data mining exception or limitation
|
||||
|
||||
**3.1.1 Does this dataset include any data protected by copyright, trademark, or patent?** Microsoft follows all required regulations and laws for processing data protected by copyright, trademark, or patent
|
||||
|
||||
|
||||
|
||||
## 3.2 Other information
|
||||
|
||||
**3.2.1 Does the dataset include information about consumer groups without revealing individual consumer identities?** Microsoft follows all required regulations and laws for protecting consumer identities
|
||||
|
||||
|
||||
|
||||
**3.2.2 Was the dataset cleaned or modified before model training?** Yes
|
||||
|
||||
|
||||
|
||||
|
||||
11
generation_config.json
Normal file
11
generation_config.json
Normal file
@@ -0,0 +1,11 @@
|
||||
{
|
||||
"_from_model_config": true,
|
||||
"bos_token_id": 199999,
|
||||
"eos_token_id": [
|
||||
200020,
|
||||
199999
|
||||
],
|
||||
"pad_token_id": 199999,
|
||||
"transformers_version": "4.50.0",
|
||||
"use_cache": true
|
||||
}
|
||||
199743
merges.txt
Normal file
199743
merges.txt
Normal file
File diff suppressed because it is too large
Load Diff
3
model-00001-of-00002.safetensors
Normal file
3
model-00001-of-00002.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:a0c24f128e33afb9e406915229af56171e0a2353bc78c1ea1b5260a36b3e6707
|
||||
size 4903637712
|
||||
3
model-00002-of-00002.safetensors
Normal file
3
model-00002-of-00002.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:b4bfcc826b3c637333c6bd24b0dfe38fffd45eff7f4718df454366e875a12415
|
||||
size 2768428504
|
||||
201
model.safetensors.index.json
Normal file
201
model.safetensors.index.json
Normal file
@@ -0,0 +1,201 @@
|
||||
{
|
||||
"metadata": {
|
||||
"total_size": 7672043520
|
||||
},
|
||||
"weight_map": {
|
||||
"model.embed_tokens.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.18.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.19.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.19.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.20.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.20.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.norm.weight": "model-00002-of-00002.safetensors"
|
||||
}
|
||||
}
|
||||
30
special_tokens_map.json
Normal file
30
special_tokens_map.json
Normal file
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"bos_token": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"eos_token": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"pad_token": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"unk_token": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
}
|
||||
}
|
||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:f08ed885956f70d877a4d9078ec9e3119d8b68a8d579003e230be18cad66911c
|
||||
size 15524194
|
||||
111
tokenizer_config.json
Normal file
111
tokenizer_config.json
Normal file
@@ -0,0 +1,111 @@
|
||||
{
|
||||
"add_bos_token": false,
|
||||
"add_eos_token": false,
|
||||
"add_prefix_space": false,
|
||||
"added_tokens_decoder": {
|
||||
"199999": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"200018": {
|
||||
"content": "<|endofprompt|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"200019": {
|
||||
"content": "<|assistant|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": true,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"200020": {
|
||||
"content": "<|end|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": true,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"200021": {
|
||||
"content": "<|user|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": true,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"200022": {
|
||||
"content": "<|system|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": true,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"200023": {
|
||||
"content": "<|tool|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": true,
|
||||
"single_word": false,
|
||||
"special": false
|
||||
},
|
||||
"200024": {
|
||||
"content": "<|/tool|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": true,
|
||||
"single_word": false,
|
||||
"special": false
|
||||
},
|
||||
"200025": {
|
||||
"content": "<|tool_call|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": true,
|
||||
"single_word": false,
|
||||
"special": false
|
||||
},
|
||||
"200026": {
|
||||
"content": "<|/tool_call|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": true,
|
||||
"single_word": false,
|
||||
"special": false
|
||||
},
|
||||
"200027": {
|
||||
"content": "<|tool_response|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": true,
|
||||
"single_word": false,
|
||||
"special": false
|
||||
},
|
||||
"200028": {
|
||||
"content": "<|tag|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": true,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
}
|
||||
},
|
||||
"bos_token": "<|endoftext|>",
|
||||
"chat_template": "{{ '<|system|>Your name is Phi, an AI math expert developed by Microsoft.' }}{% for message in messages %}{% if message['role'] == 'system' %} {{ message['content'] }}{% if 'tools' in message and message['tools'] is not none %}{{ '<|tool|>' + message['tools'] + '<|/tool|>' }}{% endif %}{% endif %}{% endfor %}{{ '<|end|>' }}{% for message in messages %}{% if message['role'] != 'system' %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}",
|
||||
"clean_up_tokenization_spaces": false,
|
||||
"eos_token": "<|endoftext|>",
|
||||
"model_max_length": 131072,
|
||||
"pad_token": "<|endoftext|>",
|
||||
"tokenizer_class": "GPT2Tokenizer",
|
||||
"unk_token": "<|endoftext|>"
|
||||
}
|
||||
1
vocab.json
Normal file
1
vocab.json
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user