初始化项目，由ModelHub XC社区提供模型

Model: iic/InspireMusic-1.5B-Long Source: Original Platform
2026-05-18 03:14:50 +08:00
commit 82059c85f7
20 changed files with 455889 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,47 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*.tfevents* filter=lfs diff=lfs merge=lfs -text
+*.db* filter=lfs diff=lfs merge=lfs -text
+*.ark* filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.gguf* filter=lfs diff=lfs merge=lfs -text
+*.ggml filter=lfs diff=lfs merge=lfs -text
+*.llamafile* filter=lfs diff=lfs merge=lfs -text
+*.pt2 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
--- a/202
+++ b/202
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2024 Alibaba Cloud
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,335 @@
+---
+frameworks:
+- Pytorch
+license: Apache License 2.0
+tasks:
+- music-generation
+
+model-type:
+##如 gpt、phi、llama、chatglm、baichuan 等
+- qwen2.5
+
+domain:
+##如 nlp、cv、audio、multi-modal
+- music
+
+---
+<p align="center"> <a href="https://github.com/FunAudioLLM/InspireMusic" target="_blank"> <img alt="logo" src="./asset/logo.png" width="100%"></a></p>
+
+<p align="center">
+ <a href="https://funaudiollm.github.io/inspiremusic" target="_blank"><img alt="Demo" src="https://img.shields.io/badge/Demo-InspireMusic?labelColor=%20%23FDB062&label=InspireMusic&color=%20%23f79009"></a>
+<a href="https://github.com/FunAudioLLM/InspireMusic" target="_blank"><img alt="Code" src="https://img.shields.io/badge/Code-InspireMusic?labelColor=%20%237372EB&label=InspireMusic&color=%20%235462eb"></a>
+<a href="https://modelscope.cn/models/iic/InspireMusic" target="_blank"><img alt="Model" src="https://img.shields.io/badge/InspireMusic-Model-green"></a>
+<a href="https://modelscope.cn/studios/iic/InspireMusic/summary" target="_blank"><img alt="Space" src="https://img.shields.io/badge/Spaces-ModelScope-pink?labelColor=%20%237b8afb&label=Spaces&color=%20%230a5af8"></a>
+<a href="https://huggingface.co/spaces/FunAudioLLM/InspireMusic" target="_blank"><img alt="Space" src="https://img.shields.io/badge/HuggingFace-Spaces?labelColor=%20%239b8afb&label=Spaces&color=%20%237a5af8"></a>
+<a href="http://arxiv.org/abs/2503.00084" target="_blank"><img alt="Paper" src="https://img.shields.io/badge/arXiv-Paper-green"></a>
+</p>
+
+![GitHub Repo stars](https://img.shields.io/github/stars/FunAudioLLM/InspireMusic) Please support our community by starring it 感谢大家支持
+
+[**Highlights**](#highlights)
+| [**Introduction**](#introduction)
+| [**Installation**](#installation)
+| [**Quick Start**](#quick-start)
+| [**Tutorial**](https://github.com/FunAudioLLM/InspireMusic#tutorial)
+| [**Models**](#model-zoo)
+| [**Contact**](#contact)
+
+---
+<a name="highlights"></a>
+## Highlights
+**InspireMusic** focuses on music generation, song generation, and audio generation.
+- A unified toolkit designed for music, song, and audio generation.
+- Music generation tasks with high audio quality. 
+- Long-form music generation.
+
+<a name="introduction"></a>
+## Introduction
+> [!Note]
+> This repo contains the algorithm infrastructure and some simple examples. Currently, only English text prompts are supported.
+
+> [!Tip]
+> To preview the performance, please refer to [InspireMusic Demo Page](https://funaudiollm.github.io/inspiremusic).
+
+InspireMusic is a unified framework for music, song, and audio generation that integrates audio tokenization with an autoregressive transformer and a flow-matching based model. The original motivation of this toolkit is to empower common users to innovate soundscapes and enhance euphony in research through music, song, and audio crafting. The toolkit provides both training and inference code for AI-based generative models that create high-quality music. Featuring a unified framework, InspireMusic incorporates audio tokenizers with an autoregressive transformer and super-resolution flow-matching modeling, allowing for the controllable generation of music, song, and audio with both text and audio prompts. The toolkit currently supports music generation and will support song generation and audio generation in the future.
+
+## InspireMusic
+<p align="center"><table><tr><td style="text-align:center;"><img alt="Light" src="asset/InspireMusic.png" width="100%" /></td></tr><tr><td style="text-align:center;">
+Figure 1: An overview of the InspireMusic framework. We introduce InspireMusic, a unified framework for music, song, and audio generation capable of producing high-quality long-form audio. InspireMusic consists of the following three key components. <b>Audio Tokenizers</b> convert the raw audio waveform into discrete audio tokens that can be efficiently processed and trained by the autoregressive transformer model. The audio waveform at a lower sampling rate is converted to discrete tokens via a high bitrate compression audio tokenizer<a href="https://openreview.net/forum?id=yBlVlS2Fd9" target="_blank"><sup>[1]</sup></a>. <b>Autoregressive Transformer</b> model is based on Qwen2.5<a href="https://arxiv.org/abs/2412.15115" target="_blank"><sup>[2]</sup></a> as the backbone model and is trained using a next-token prediction approach on both text and audio tokens, enabling it to generate coherent and contextually relevant token sequences. The audio and text tokens are the inputs of the autoregressive model, which generates tokens via next-token prediction. <b>Super-Resolution Flow-Matching Model</b>, based on a flow modeling method, maps the generated tokens to latent features with high-resolution fine-grained acoustic details<a href="https://arxiv.org/abs/2305.02765" target="_blank"><sup>[3]</sup></a> obtained from audio at a higher sampling rate, to ensure that the acoustic information flows through the models with high fidelity. A vocoder then generates the final audio waveform from these enhanced latent features. InspireMusic supports a range of tasks including text-to-music, music continuation, music reconstruction, and music super-resolution.
+</td></tr></table></p>
+
+<a name="installation"></a>
+## Installation
+### Clone
+- Clone the repo
+``` sh
+git clone --recursive https://github.com/FunAudioLLM/InspireMusic.git
+# If you fail to clone the submodules due to network failures, run the following commands until they succeed
+cd InspireMusic
+git submodule update --recursive
+# or you can download the third_party repo Matcha-TTS manually
+cd third_party && git clone https://github.com/shivammehta25/Matcha-TTS.git
+```
+
+### Install from Source
+InspireMusic requires Python>=3.8, PyTorch>=2.0.1, flash attention==2.6.2/2.6.3, CUDA>=11.2. You can install the dependencies with the following commands:
+
+- Install Conda: please see https://docs.conda.io/en/latest/miniconda.html
+- Create Conda env:
+``` shell
+conda create -n inspiremusic python=3.8
+conda activate inspiremusic
+cd InspireMusic
+# pynini is required by WeTextProcessing, use conda to install it as it can be executed on all platforms.
+conda install -y -c conda-forge pynini==2.1.5
+pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
+# install flash attention to speedup training
+pip install flash-attn --no-build-isolation
+```
+
+- Install within the package:
+```shell
+cd InspireMusic
+# You can run to install the packages
+python setup.py install
+pip install flash-attn --no-build-isolation
+```
+We also recommend having `sox` or `ffmpeg` installed, either through your system or Anaconda:
+```shell
+# Install sox
+# ubuntu
+sudo apt-get install sox libsox-dev
+# centos
+sudo yum install sox sox-devel
+
+# Install ffmpeg
+# ubuntu
+sudo apt-get install ffmpeg
+# centos
+sudo yum install ffmpeg
+```
+
+### Use Docker
+Run the following command to build a docker image from Dockerfile provided.
+```shell
+docker build -t inspiremusic .
+```
+Run the following command to start the docker container in interactive mode.
+```shell
+docker run -ti --gpus all -v .:/workspace/InspireMusic inspiremusic
+```
+
+### Use Docker Compose
+Run the following command to build a docker compose environment and docker image from the docker-compose.yml file.
+```shell
+docker compose up -d --build
+```
+Run the following command to attach to the docker container in interactive mode.
+```shell
+docker exec -ti inspire-music bash
+```
+
+<a name="quick-start"></a>
+### Quick Start
+Here is a quick example inference script for music generation. 
+``` shell
+cd InspireMusic
+mkdir -p pretrained_models
+
+# Download the model from ONE of the sources below (both commands clone into the same directory)
+# ModelScope
+git clone https://www.modelscope.cn/iic/InspireMusic-1.5B-Long.git pretrained_models/InspireMusic-1.5B-Long
+# HuggingFace
+git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long.git pretrained_models/InspireMusic-1.5B-Long
+
+cd examples/music_generation
+# run a quick inference example
+sh infer_1.5b_long.sh
+```
+
+Here is a quick-start script that runs the whole music generation task, including the data preparation pipeline, model training, and inference.
+``` shell
+cd InspireMusic/examples/music_generation/
+sh run.sh
+```
+
+### One-line Inference
+#### Text-to-music Task
+One-line Shell script for text-to-music task.
+``` shell
+cd examples/music_generation
+# with flow matching, use one-line command to get a quick try
+python -m inspiremusic.cli.inference
+
+# customize the configuration with a one-line command like the following
+python -m inspiremusic.cli.inference --task text-to-music -m "InspireMusic-1.5B-Long" -g 0 -t "Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance." -c intro -s 0.0 -e 30.0 -r "exp/inspiremusic" -o output -f wav 
+
+# without flow matching, use one-line command to get a quick try
+python -m inspiremusic.cli.inference --task text-to-music -g 0 -t "Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance." --fast True
+```
+
+Alternatively, you can run the inference with just a few lines of Python code.
+```python
+from inspiremusic.cli.inference import InspireMusic
+from inspiremusic.cli.inference import env_variables
+if __name__ == "__main__":
+  env_variables()
+  model = InspireMusic(model_name = "InspireMusic-Base")
+  model.inference("text-to-music", "Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.")
+```
+
+#### Music Continuation Task
+One-line Shell script for music continuation task.
+```shell
+cd examples/music_generation
+# with flow matching
+python -m inspiremusic.cli.inference --task continuation -g 0 -a audio_prompt.wav
+# without flow matching
+python -m inspiremusic.cli.inference --task continuation -g 0 -a audio_prompt.wav --fast True
+```
+
+Alternatively, you can run the inference with just a few lines of Python code.
+```python
+from inspiremusic.cli.inference import InspireMusic
+from inspiremusic.cli.inference import env_variables
+if __name__ == "__main__":
+  env_variables()
+  model = InspireMusic(model_name = "InspireMusic-Base")
+  # just use audio prompt
+  model.inference("continuation", None, "audio_prompt.wav")
+  # use both text prompt and audio prompt
+  model.inference("continuation", "Continue to generate jazz music.", "audio_prompt.wav")
+```
+<a name="model-zoo"></a>
+## Models
+### Download Models
+You may download our pretrained InspireMusic models for music generation.
+```shell
+# use git to download the models; please make sure git lfs is installed.
+mkdir -p pretrained_models
+git clone https://www.modelscope.cn/iic/InspireMusic.git pretrained_models/InspireMusic
+```
+
+### Available Models
+Currently, we open-source music generation models that support 24kHz mono and 48kHz stereo audio.
+The table below presents the links to the ModelScope and Hugging Face model hubs.
+
+| Model name                           | Model Links                                                                                                                                                                                                                                                                                                                                   | Remarks                                                                                                  |
+|--------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------|
+| InspireMusic-Base-24kHz              | [![model](https://img.shields.io/badge/ModelScope-Model-green.svg)](https://modelscope.cn/models/iic/InspireMusic-Base-24kHz/summary) [![model](https://img.shields.io/badge/HuggingFace-Model-green.svg)](https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz)                                                                        | Pre-trained Music Generation Model, 24kHz mono, 30s                                                      |
+| InspireMusic-Base                    | [![model](https://img.shields.io/badge/ModelScope-Model-green.svg)](https://modelscope.cn/models/iic/InspireMusic/summary) [![model](https://img.shields.io/badge/HuggingFace-Model-green.svg)](https://huggingface.co/FunAudioLLM/InspireMusic-Base)                                                                                         | Pre-trained Music Generation Model, 48kHz, 30s                                                           |
+| InspireMusic-1.5B-24kHz              | [![model](https://img.shields.io/badge/ModelScope-Model-green.svg)](https://modelscope.cn/models/iic/InspireMusic-1.5B-24kHz/summary) [![model](https://img.shields.io/badge/HuggingFace-Model-green.svg)](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz)                                                                        | Pre-trained Music Generation 1.5B Model, 24kHz mono, 30s                                                 |
+| InspireMusic-1.5B                    | [![model](https://img.shields.io/badge/ModelScope-Model-green.svg)](https://modelscope.cn/models/iic/InspireMusic-1.5B/summary) [![model](https://img.shields.io/badge/HuggingFace-Model-green.svg)](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B)                                                                                    | Pre-trained Music Generation 1.5B Model, 48kHz, 30s                                                      |
+| InspireMusic-1.5B-Long               | [![model](https://img.shields.io/badge/ModelScope-Model-green.svg)](https://modelscope.cn/models/iic/InspireMusic-1.5B-Long/summary) [![model](https://img.shields.io/badge/HuggingFace-Model-green.svg)](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long)                                                                          | Pre-trained Music Generation 1.5B Model, 48kHz, support long-form music generation up to several minutes |
+| InspireSong-1.5B                     | [![model](https://img.shields.io/badge/ModelScope-Model-lightgrey.svg)]() [![model](https://img.shields.io/badge/HuggingFace-Model-lightgrey.svg)]()                                                                                                                                                                                          | Pre-trained Song Generation 1.5B Model, 48kHz stereo                                                     |
+| InspireAudio-1.5B                    | [![model](https://img.shields.io/badge/ModelScope-Model-lightgrey.svg)]() [![model](https://img.shields.io/badge/HuggingFace-Model-lightgrey.svg)]()                                                                                                                                                                                          | Pre-trained Audio Generation 1.5B Model, 48kHz stereo                                                    |
+| Wavtokenizer[<sup>[1]</sup>](https://openreview.net/forum?id=yBlVlS2Fd9) (75Hz) | [![model](https://img.shields.io/badge/ModelScope-Model-green.svg)](https://modelscope.cn/models/iic/InspireMusic-1.5B-Long/file/view/master?fileName=wavtokenizer%252Fmodel.pt) [![model](https://img.shields.io/badge/HuggingFace-Model-green.svg)](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long/tree/main/wavtokenizer)       | An extreme low bitrate audio tokenizer for music with one codebook at 24kHz audio.                       |
+| Music_tokenizer (75Hz)               | [![model](https://img.shields.io/badge/ModelScope-Model-green.svg)](https://modelscope.cn/models/iic/InspireMusic-1.5B-24kHz/file/view/master?fileName=music_tokenizer%252Fmodel.pt) [![model](https://img.shields.io/badge/HuggingFace-Model-green.svg)](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz/tree/main/music_tokenizer) | A music tokenizer based on HifiCodec<sup>[3]</sup> at 24kHz audio.                                       |
+| Music_tokenizer (150Hz)              | [![model](https://img.shields.io/badge/ModelScope-Model-green.svg)](https://modelscope.cn/models/iic/InspireMusic-1.5B-Long/file/view/master?fileName=music_tokenizer%252Fmodel.pt) [![model](https://img.shields.io/badge/HuggingFace-Model-green.svg)](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long/tree/main/music_tokenizer) | A music tokenizer based on HifiCodec<sup>[3]</sup> at 48kHz audio.                                       |
+
+<a name="tutorial"></a>
+## Basic Usage
+At the moment, InspireMusic contains the training and inference codes for [music generation](https://github.com/FunAudioLLM/InspireMusic/tree/main/examples/music_generation).
+
+### Training
+Here is an example of training the LLM model; BF16/FP16 training is supported.
+```shell
+torchrun --nnodes=1 --nproc_per_node=8 \
+    --rdzv_id=1024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
+    inspiremusic/bin/train.py \
+    --train_engine "torch_ddp" \
+    --config conf/inspiremusic.yaml \
+    --train_data data/train.data.list \
+    --cv_data data/dev.data.list \
+    --model llm \
+    --model_dir `pwd`/exp/music_generation/llm/ \
+    --tensorboard_dir `pwd`/tensorboard/music_generation/llm/ \
+    --ddp.dist_backend "nccl" \
+    --num_workers 8 \
+    --prefetch 100 \
+    --pin_memory \
+    --deepspeed_config ./conf/ds_stage2.json \
+    --deepspeed.save_states model+optimizer \
+    --fp16
+```
+
+Here is an example of training the flow matching model; FP16 training is not supported.
+```shell
+torchrun --nnodes=1 --nproc_per_node=8 \
+    --rdzv_id=1024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
+    inspiremusic/bin/train.py \
+    --train_engine "torch_ddp" \
+    --config conf/inspiremusic.yaml \
+    --train_data data/train.data.list \
+    --cv_data data/dev.data.list \
+    --model flow \
+    --model_dir `pwd`/exp/music_generation/flow/ \
+    --tensorboard_dir `pwd`/tensorboard/music_generation/flow/ \
+    --ddp.dist_backend "nccl" \
+    --num_workers 8 \
+    --prefetch 100 \
+    --pin_memory \
+    --deepspeed_config ./conf/ds_stage2.json \
+    --deepspeed.save_states model+optimizer
+```
+
+### Inference
+
+Here is an example script to quickly do model inference.
+```shell
+cd InspireMusic/examples/music_generation/
+sh infer.sh
+```
+Here is an example of running inference in normal mode, i.e., with the flow matching model, for the text-to-music and music continuation tasks.
+```shell
+pretrained_model_dir="pretrained_models/InspireMusic/"
+for task in 'text-to-music' 'continuation'; do
+  python inspiremusic/bin/inference.py --task $task \
+      --gpu 0 \
+      --config conf/inspiremusic.yaml \
+      --prompt_data data/test/parquet/data.list \
+      --flow_model $pretrained_model_dir/flow.pt \
+      --llm_model $pretrained_model_dir/llm.pt \
+      --music_tokenizer $pretrained_model_dir/music_tokenizer \
+      --wavtokenizer $pretrained_model_dir/wavtokenizer \
+      --result_dir `pwd`/exp/inspiremusic/${task}_test \
+      --chorus verse 
+done
+```
+Here is an example of running inference in fast mode, i.e., without the flow matching model, for the text-to-music and music continuation tasks.
+```shell
+pretrained_model_dir="pretrained_models/InspireMusic/"
+for task in 'text-to-music' 'continuation'; do
+  python inspiremusic/bin/inference.py --task $task \
+      --gpu 0 \
+      --config conf/inspiremusic.yaml \
+      --prompt_data data/test/parquet/data.list \
+      --flow_model $pretrained_model_dir/flow.pt \
+      --llm_model $pretrained_model_dir/llm.pt \
+      --music_tokenizer $pretrained_model_dir/music_tokenizer \
+      --wavtokenizer $pretrained_model_dir/wavtokenizer \
+      --result_dir `pwd`/exp/inspiremusic/${task}_test \
+      --chorus verse \
+      --fast 
+done
+```
+
+### Hardware requirements
+In previous tests on an H800 GPU, InspireMusic could generate 30 seconds of audio with a real-time factor (RTF) of around 1.6~1.8. For normal mode, we recommend using hardware with at least 24GB of GPU memory for a better experience. For fast mode, 12GB of GPU memory is enough.
+
+## Citation
+```bibtex
+@misc{InspireMusic2025,
+      title={InspireMusic: Integrating Super Resolution and Large Language Model for High-Fidelity Long-Form Music Generation}, 
+      author={Chong Zhang and Yukun Ma and Qian Chen and Wen Wang and Shengkui Zhao and Zexu Pan and Hao Wang and Chongjia Ni and Trung Hieu Nguyen and Kun Zhou and Yidi Jiang and Chaohong Tan and Zhifu Gao and Zhihao Du and Bin Ma},
+      year={2025},
+      eprint={2503.00084},
+      archivePrefix={arXiv},
+      primaryClass={cs.SD},
+      url={https://arxiv.org/abs/2503.00084}, 
+}
+```
+
+---
+## Disclaimer
+The content provided above is for research purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.
--- a/asset/.DS_Store
+++ b/asset/.DS_Store
--- a/asset/logo.png
+++ b/asset/logo.png
--- a/config.json
+++ b/config.json
@@ -0,0 +1,28 @@
+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "sliding_window": 131072,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.40.1",
+  "use_cache": true,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
--- a/configuration.json
+++ b/configuration.json
@@ -0,0 +1 @@
+{"task":"audio-generation"}
--- a/flow.pt
+++ b/flow.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3a58f28ac60b6771690c87bc0461325bb9b071975b20e520b33f61f0fcb7687
+size 306396789
--- a/generation_config.json
+++ b/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "bos_token_id": 151643,
+  "do_sample": false,
+  "eos_token_id": 151643,
+  "max_new_tokens": 2048,
+  "transformers_version": "4.37.0"
+}
--- a/inspiremusic.yaml
+++ b/inspiremusic.yaml
@@ -0,0 +1,171 @@
+# Seed Python's random module, NumPy and PyTorch (CPU and all CUDA devices) with the same value so that results can be reproduced.
+__set_seed1: !apply:random.seed [1024]
+__set_seed2: !apply:numpy.random.seed [1024]
+__set_seed3: !apply:torch.manual_seed [1024]
+__set_seed4: !apply:torch.cuda.manual_seed_all [1024]
+
+# fixed params (shared values that later sections pull in through !ref)
+sample_rate: 24000  # referenced below by hift, resample and feat_extractor
+text_encoder_input_size: 512
+llm_input_size: 1536
+llm_output_size: 1536
+
+basemodel_path: '../../pretrained_models/InspireMusic-1.5B-Long/'  # relative path; passed to the LLM encoder (pretrain_path) and to get_tokenizer (tokenizer_path)
+generator_path: '../../pretrained_models/InspireMusic-1.5B-Long/music_tokenizer'  # relative path; passed to flow as generator_model_dir
+
+# model params
+# For every class/function defined in this repo we use !<name> or !<new> for initialization, so that users can find all corresponding classes/functions from this single yaml.
+# For system/third_party classes/functions this is not required.
+llm: !new:inspiremusic.llm.llm.LLM
+    text_encoder_input_size: !ref <text_encoder_input_size>
+    llm_input_size: !ref <llm_input_size>
+    llm_output_size: !ref <llm_output_size>
+    audio_token_size: 4096  # same value as flow.vocab_size
+    length_normalized_loss: True
+    lsm_weight: 0
+    text_encoder_conf:
+        name: "none"
+    llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder  # nested argument that shares the name "llm" with its parent key
+        input_size: !ref <text_encoder_input_size>
+        pretrain_path: !ref <basemodel_path>
+        
+    sampling: !name:inspiremusic.utils.common.topk_sampling
+        top_k: 350
+    train_cfg_ratio: 0.2  # presumably the classifier-free guidance ratio used in training - TODO confirm
+    infer_cfg_ratio: 3.0  # presumably the classifier-free guidance scale used at inference - TODO confirm
+flow: !new:inspiremusic.flow.flow.MaskedDiff  # flow model built from an encoder, a length regulator and a CFM decoder
+    input_size: 256
+    output_size: 80  # same value as hift.in_channels
+    output_type: 'mel'
+    vocab_size: 4096  # same value as llm.audio_token_size
+    input_frame_rate: 75  # NOTE(review): 24000 / 320 = 75, i.e. sample_rate over the hop_length in wavtokenizer/config.yaml - confirm this is the token frame rate
+    only_mask_loss: True
+    encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
+        output_size: 512  # same value as length_regulator.channels
+        attention_heads: 4
+        linear_units: 1024
+        num_blocks: 3
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.1
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        input_size: 256  # same value as flow.input_size
+        use_cnn_module: False
+        macaron_style: False
+    length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
+        channels: 512
+        sampling_ratios: [1, 1, 1, 1]
+    decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
+        in_channels: 240
+        cfm_params: !new:omegaconf.DictConfig
+            content:
+                sigma_min: 1e-06
+                solver: 'euler'
+                t_scheduler: 'cosine'
+                training_cfg_rate: 0.2
+                inference_cfg_rate: 0.7
+                reg_loss_type: 'l1'
+        estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
+            in_channels: 1024
+            out_channels: 512
+            channels: [256, 256]
+            dropout: 0.0
+            attention_head_dim: 64
+            n_blocks: 4
+            num_mid_blocks: 8
+            num_heads: 8
+            act_fn: 'gelu'
+    generator_model_dir: !ref <generator_path>  # resolves to the music_tokenizer directory defined at the top of this file
+
+hift: !new:inspiremusic.hifigan.generator.HiFTGenerator  # generator configured with an 80-channel input and an F0 predictor
+    in_channels: 80  # same value as flow.output_size
+    base_channels: 512
+    nb_harmonics: 8
+    sampling_rate: !ref <sample_rate>  # resolves to 24000
+    nsf_alpha: 0.1
+    nsf_sigma: 0.003
+    nsf_voiced_threshold: 10
+    upsample_rates: [8, 8]  # NOTE(review): 8 * 8 * istft hop_len 4 = 256, equal to feat_extractor.hop_size - presumably intentional, confirm
+    upsample_kernel_sizes: [16, 16]
+    istft_params:
+        n_fft: 16
+        hop_len: 4
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    source_resblock_kernel_sizes: [7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+    lrelu_slope: 0.1
+    audio_limit: 0.99
+    f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
+        num_class: 1
+        in_channels: 80  # same value as hift.in_channels
+        cond_channels: 512
+
+wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator  # NOTE(review): same class as hift, constructed here with no arguments - confirm this placeholder is intended
+
+# processor functions (each !name: entry binds a function with the keyword arguments listed under it)
+parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
+get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
+    tokenizer_path: !ref <basemodel_path>  # tokenizer files are read from the base model directory
+    tokenizer_name: "qwen-2.5"
+allowed_special: 'all'
+tokenize: !name:inspiremusic.dataset.processor.tokenize
+    get_tokenizer: !ref <get_tokenizer>
+    allowed_special: !ref <allowed_special>
+filter: !name:inspiremusic.dataset.processor.filter
+    max_length: 28000
+    min_length: 0
+    token_max_length: 200
+    token_min_length: 1
+resample: !name:inspiremusic.dataset.processor.resample  # defined here but not listed in data_pipeline below
+    resample_rate: !ref <sample_rate>  # resolves to 24000
+feat_extractor: !name:matcha.utils.audio.mel_spectrogram  # third-party (matcha) mel-spectrogram function
+    n_fft: 1024
+    num_mels: 128  # NOTE(review): flow.output_size and hift.in_channels are 80 - confirm which mel dimension is actually used
+    sampling_rate: !ref <sample_rate>  # resolves to 24000
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 24000  # NOTE(review): equals sample_rate (24000), i.e. twice the Nyquist frequency of 12000 - confirm
+    center: False
+compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank  # defined here but not listed in data_pipeline below
+    feat_extractor: !ref <feat_extractor>
+parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding  # defined here but not listed in data_pipeline below
+    normalize: True
+shuffle: !name:inspiremusic.dataset.processor.shuffle
+    shuffle_size: 1000
+sort: !name:inspiremusic.dataset.processor.sort
+    sort_size: 500  # sort_size should be less than shuffle_size (1000 above)
+batch: !name:inspiremusic.dataset.processor.batch
+    batch_type: 'dynamic'
+    max_frames_in_batch: 10000 # llm 12000
+padding: !name:inspiremusic.dataset.processor.padding
+
+# dataset processor pipeline (presumably applied in the listed order - confirm in the dataset code); resample, compute_fbank and parse_embedding are not included
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <filter>, 
+    !ref <batch>,
+    !ref <padding>,
+]
+
+
+# train conf: optimizer, LR scheduler, and gradient/logging/saving settings
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 0.0001 # change to 0.001 if you want to train flow from scratch
+    scheduler: warmuplr
+    scheduler_conf:
+        warmup_steps: 5000
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2  # presumably the number of gradient-accumulation steps - TODO confirm
+    log_interval: 100
+    save_per_step: 500  # presumably a checkpoint is written every 500 steps - TODO confirm
--- a/llm.pt
+++ b/llm.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f78e470893e3d6affe5e0b15210af79d16c2ed676f7a7dc4690723bec65696c
+size 6226588239
--- a/merges.txt
+++ b/merges.txt
--- a/model.safetensors
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a961db72e75d52b18e6b0c9d379e51a26973b233385e0e127fdda7d648aec796
+size 3087467144
--- a/music_tokenizer/config.json
+++ b/music_tokenizer/config.json
@@ -0,0 +1,42 @@
+{
+    "resblock": "1",
+    "num_gpus": 8,
+    "batch_size": 140,
+    "learning_rate": 0.00002,
+    "adam_b1": 0.5,
+    "adam_b2": 0.9,
+    "lr_decay": 0.98,
+    "seed": 1234,
+
+    "upsample_rates": [8,5,4,2],
+    "upsample_kernel_sizes": [16,11,8,4],
+    "upsample_initial_channel": 512,
+    "resblock_kernel_sizes": [3,5,7,9,11,13],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5], [1,3,5], [1,3,5], [1,3,5]],
+
+    "segment_size": 48000,
+    "num_mels": 80,
+    "num_freq": 1024,
+    "n_fft": 1024,
+    "hop_size": 240,
+    "win_size": 1024,
+
+    "sampling_rate": 48000,
+
+    "n_code_groups": 2,
+    "n_codes": 1024,
+    "codebook_loss_lambda": 1.0,
+    "commitment_loss_lambda": 0.25,
+
+    "fmin": 0,
+    "fmax": 48000,
+    "fmax_for_loss": null,
+
+    "num_workers": 24,
+
+    "dist_config": {
+        "dist_backend": "nccl",
+        "dist_url": "tcp://localhost:54321",
+        "world_size": 1
+    }
+}
--- a/music_tokenizer/model.pt
+++ b/music_tokenizer/model.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba71efdc50378baf9776d607eb11566907c3810e6f221c316719c02591135626
+size 537087507
--- a/tokenizer.json
+++ b/tokenizer.json
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -0,0 +1,207 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- 
'<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}
--- a/vocab.json
+++ b/vocab.json
--- a/wavtokenizer/config.yaml
+++ b/wavtokenizer/config.yaml
@@ -0,0 +1,164 @@
+# pytorch_lightning==1.8.6
+seed_everything: 3407
+trainer:  # pytorch_lightning Trainer arguments (version noted in the header comment of this file)
+  logger:
+    class_path: pytorch_lightning.loggers.TensorBoardLogger
+    init_args:
+      save_dir: ./result/
+      name: lightning_logs
+      version: null
+      log_graph: false
+      default_hp_metric: true
+      prefix: ''
+      sub_dir: null
+      logdir: null
+      comment: ''
+      purge_step: null
+      max_queue: 10
+      flush_secs: 120
+      filename_suffix: ''
+      write_to_disk: true
+      comet_config:
+        disabled: true
+  enable_checkpointing: true
+  callbacks:
+  - class_path: pytorch_lightning.callbacks.LearningRateMonitor
+    init_args:
+      logging_interval: null
+      log_momentum: false
+  - class_path: pytorch_lightning.callbacks.ModelSummary
+    init_args:
+      max_depth: 2
+  - class_path: pytorch_lightning.callbacks.ModelCheckpoint
+    init_args:
+      dirpath: null
+      filename: wavtokenizer_checkpoint_{epoch}_{step}_{val_loss:.4f}
+      monitor: val_loss  # checkpoints are ranked by val_loss, lower is better (mode: min)
+      verbose: false
+      save_last: true
+      save_top_k: 10
+      save_weights_only: false
+      mode: min
+      auto_insert_metric_name: true
+      every_n_train_steps: 1000
+      train_time_interval: null
+      every_n_epochs: null
+      save_on_train_epoch_end: null
+  - class_path: inspiremusic.wavtokenizer.decoder.helpers.GradNormCallback  # project-local callback, no init_args given
+  default_root_dir: null
+  gradient_clip_val: null
+  gradient_clip_algorithm: null
+  num_nodes: 1
+  num_processes: null
+  devices: -1
+  gpus: null
+  auto_select_gpus: false
+  tpu_cores: null
+  ipus: null
+  enable_progress_bar: true
+  overfit_batches: 0.0
+  track_grad_norm: -1
+  check_val_every_n_epoch: 1
+  fast_dev_run: false
+  accumulate_grad_batches: null
+  max_epochs: null  # no epoch limit is set; max_steps below is the only explicit bound
+  min_epochs: null
+  max_steps: 20000000
+  min_steps: null
+  max_time: null
+  limit_train_batches: null
+  limit_val_batches: 100
+  limit_test_batches: null
+  limit_predict_batches: null
+  val_check_interval: null
+  log_every_n_steps: 1000
+  accelerator: gpu
+  strategy: ddp
+  sync_batchnorm: false
+  precision: 32
+  enable_model_summary: true
+  num_sanity_val_steps: 2
+  resume_from_checkpoint: null
+  profiler: null
+  benchmark: null
+  deterministic: null
+  reload_dataloaders_every_n_epochs: 0
+  auto_lr_find: false
+  replace_sampler_ddp: true
+  detect_anomaly: false
+  auto_scale_batch_size: false
+  plugins: null
+  amp_backend: native
+  amp_level: null
+  move_metrics_to_cpu: false
+  multiple_trainloader_mode: max_size_cycle
+  inference_mode: true
+ckpt_path: null
+data:  # data module with separate train and validation file lists
+  class_path: inspiremusic.wavtokenizer.decoder.dataset.VocosDataModule
+  init_args:
+    train_params:
+      filelist_path: train.scp  # relative path
+      sampling_rate: 24000
+      num_samples: 72000  # 72000 / 24000 = 3; presumably 3-second clips - TODO confirm
+      batch_size: 38
+      num_workers: 8
+    val_params:
+      filelist_path: test.scp  # relative path
+      sampling_rate: 24000
+      num_samples: 72000
+      batch_size: 10
+      num_workers: 8
+model:  # WavTokenizer experiment: feature extractor + backbone + ISTFT head, plus training/evaluation options
+  class_path: inspiremusic.wavtokenizer.decoder.experiment.WavTokenizer
+  init_args:
+    feature_extractor:
+      class_path: inspiremusic.wavtokenizer.decoder.feature_extractors.EncodecFeatures
+      init_args:
+        encodec_model: encodec_24khz
+        bandwidths:
+        - 6.6
+        - 6.6
+        - 6.6
+        - 6.6
+        train_codebooks: true
+        num_quantizers: 1
+        dowmsamples:  # NOTE(review): spelled "dowmsamples" (sic); presumably must match the EncodecFeatures argument name - check the class before renaming
+        - 8
+        - 5
+        - 4
+        - 2
+        vq_bins: 4096
+        vq_kmeans: 200
+    backbone:
+      class_path: inspiremusic.wavtokenizer.decoder.models.VocosBackbone
+      init_args:
+        input_channels: 512
+        dim: 768
+        intermediate_dim: 2304
+        num_layers: 12
+        layer_scale_init_value: null
+        adanorm_num_embeddings: 4
+    head:
+      class_path: inspiremusic.wavtokenizer.decoder.heads.ISTFTHead
+      init_args:
+        dim: 768  # same value as backbone dim
+        n_fft: 1280
+        hop_length: 320  # equals 8 * 5 * 4 * 2, the product of the dowmsamples values above
+        padding: same
+    resume_config: config.yaml  # relative path
+    resume_model: last.ckpt  # relative path
+    sample_rate: 24000
+    initial_learning_rate: 0.0001
+    num_warmup_steps: 0
+    mel_loss_coeff: 45.0
+    mrd_loss_coeff: 1.0
+    pretrain_mel_steps: 0
+    decay_mel_coeff: false
+    evaluate_utmos: false
+    evaluate_pesq: true
+    evaluate_periodicty: true  # NOTE(review): spelled "periodicty" (sic); presumably must match the WavTokenizer argument name - check the class before renaming
+    resume: true
+
+
+
--- a/wavtokenizer/model.pt
+++ b/wavtokenizer/model.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65dc00edbd293c0b4de81045648688207e5e69f1c32025beaaba0eb273fa851c
+size 1754883448