初始化项目,由ModelHub XC社区提供模型

Model: iic/InspireMusic-1.5B-Long
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-05-18 03:14:50 +08:00
commit 82059c85f7
20 changed files with 455889 additions and 0 deletions

47
.gitattributes vendored Normal file
View File

@@ -0,0 +1,47 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
*.tfevents* filter=lfs diff=lfs merge=lfs -text
*.db* filter=lfs diff=lfs merge=lfs -text
*.ark* filter=lfs diff=lfs merge=lfs -text
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.gguf* filter=lfs diff=lfs merge=lfs -text
*.ggml filter=lfs diff=lfs merge=lfs -text
*.llamafile* filter=lfs diff=lfs merge=lfs -text
*.pt2 filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

202
LICENSE Normal file
View File

@@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2024 Alibaba Cloud
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

335
README.md Normal file
View File

@@ -0,0 +1,335 @@
---
frameworks:
- Pytorch
license: Apache License 2.0
tasks:
- music-generation
model-type:
## e.g. gpt, phi, llama, chatglm, baichuan, etc.
- qwen2.5
domain:
## e.g. nlp, cv, audio, multi-modal
- music
---
<p align="center"> <a href="https://github.com/FunAudioLLM/InspireMusic" target="_blank"> <img alt="logo" src="./asset/logo.png" width="100%"></a></p>
<p align="center">
<a href="https://funaudiollm.github.io/inspiremusic" target="_blank"><img alt="Demo" src="https://img.shields.io/badge/Demo-InspireMusic?labelColor=%20%23FDB062&label=InspireMusic&color=%20%23f79009"></a>
<a href="https://github.com/FunAudioLLM/InspireMusic" target="_blank"><img alt="Code" src="https://img.shields.io/badge/Code-InspireMusic?labelColor=%20%237372EB&label=InspireMusic&color=%20%235462eb"></a>
<a href="https://modelscope.cn/models/iic/InspireMusic" target="_blank"><img alt="Model" src="https://img.shields.io/badge/InspireMusic-Model-green"></a>
<a href="https://modelscope.cn/studios/iic/InspireMusic/summary" target="_blank"><img alt="Space" src="https://img.shields.io/badge/Spaces-ModelScope-pink?labelColor=%20%237b8afb&label=Spaces&color=%20%230a5af8"></a>
<a href="https://huggingface.co/spaces/FunAudioLLM/InspireMusic" target="_blank"><img alt="Space" src="https://img.shields.io/badge/HuggingFace-Spaces?labelColor=%20%239b8afb&label=Spaces&color=%20%237a5af8"></a>
<a href="http://arxiv.org/abs/2503.00084" target="_blank"><img alt="Paper" src="https://img.shields.io/badge/arXiv-Paper-green"></a>
</p>
![GitHub Repo stars](https://img.shields.io/github/stars/FunAudioLLM/InspireMusic) Please support our community by starring it 感谢大家支持
[**Highlights**](#highlights)
| [**Introduction**](#introduction)
| [**Installation**](#installation)
| [**Quick Start**](#quick-start)
| [**Tutorial**](https://github.com/FunAudioLLM/InspireMusic#tutorial)
| [**Models**](#model-zoo)
| [**Contact**](#contact)
---
<a name="highlights"></a>
## Highlights
**InspireMusic** focuses on music generation, song generation, and audio generation.
- A unified toolkit designed for music, song, and audio generation.
- Music generation tasks with high audio quality.
- Long-form music generation.
<a name="introduction"></a>
## Introduction
> [!Note]
> This repo contains the algorithm infrastructure and some simple examples. Currently only support English text prompts.
> [!Tip]
> To preview the performance, please refer to [InspireMusic Demo Page](https://funaudiollm.github.io/inspiremusic).
InspireMusic is a unified framework for music, song, and audio generation that integrates audio tokenization with an autoregressive transformer and a flow-matching-based model. The original motivation for this toolkit is to empower everyday users to create innovative soundscapes and enhance euphony in research through music, song, and audio crafting. The toolkit provides both training and inference code for AI-based generative models that create high-quality music. Featuring a unified framework, InspireMusic combines audio tokenizers with an autoregressive transformer and super-resolution flow-matching modeling, allowing controllable generation of music, songs, and audio with both text and audio prompts. The toolkit currently supports music generation; support for song generation and audio generation is planned for the future.
## InspireMusic
<p align="center"><table><tr><td style="text-align:center;"><img alt="Light" src="asset/InspireMusic.png" width="100%" /></tr><tr><td style="text-align:center;">
Figure 1: An overview of the InspireMusic framework. We introduce InspireMusic, a unified framework for music, song, and audio generation capable of producing high-quality long-form audio. InspireMusic consists of the following three key components. <b>Audio Tokenizers</b> convert the raw audio waveform into discrete audio tokens that can be efficiently processed and trained by the autoregressive transformer model. The audio waveform at a lower sampling rate is converted to discrete tokens via a high-bitrate compression audio tokenizer<a href="https://openreview.net/forum?id=yBlVlS2Fd9" target="_blank"><sup>[1]</sup></a>. The <b>Autoregressive Transformer</b> model uses Qwen2.5<a href="https://arxiv.org/abs/2412.15115" target="_blank"><sup>[2]</sup></a> as the backbone model and is trained using a next-token prediction approach on both text and audio tokens, enabling it to generate coherent and contextually relevant token sequences. The audio and text tokens are the inputs to the autoregressive model, which generates tokens via next-token prediction. The <b>Super-Resolution Flow-Matching Model</b>, based on a flow modeling method, maps the generated tokens to latent features with high-resolution fine-grained acoustic details<a href="https://arxiv.org/abs/2305.02765" target="_blank"><sup>[3]</sup></a> obtained from a higher sampling rate of audio, to ensure that acoustic information flows through the models with high fidelity. A vocoder then generates the final audio waveform from these enhanced latent features. InspireMusic supports a range of tasks including text-to-music, music continuation, music reconstruction, and music super-resolution.
</td></tr></table></p>
<a name="installation"></a>
## Installation
### Clone
- Clone the repo
``` sh
git clone --recursive https://github.com/FunAudioLLM/InspireMusic.git
# If you failed to clone submodule due to network failures, please run the following command until success
cd InspireMusic
git submodule update --recursive
# or you can download the third_party repo Matcha-TTS manually
cd third_party && git clone https://github.com/shivammehta25/Matcha-TTS.git
```
### Install from Source
InspireMusic requires Python>=3.8, PyTorch>=2.0.1, flash attention==2.6.2/2.6.3, CUDA>=11.2. You can install the dependencies with the following commands:
- Install Conda: please see https://docs.conda.io/en/latest/miniconda.html
- Create Conda env:
``` shell
conda create -n inspiremusic python=3.8
conda activate inspiremusic
cd InspireMusic
# pynini is required by WeTextProcessing, use conda to install it as it can be executed on all platforms.
conda install -y -c conda-forge pynini==2.1.5
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
# install flash attention to speedup training
pip install flash-attn --no-build-isolation
```
- Install within the package:
```shell
cd InspireMusic
# You can run to install the packages
python setup.py install
pip install flash-attn --no-build-isolation
```
We also recommend having `sox` or `ffmpeg` installed, either through your system or Anaconda:
```shell
# Install sox
# ubuntu
sudo apt-get install sox libsox-dev
# centos
sudo yum install sox sox-devel
# Install ffmpeg
# ubuntu
sudo apt-get install ffmpeg
# centos
sudo yum install ffmpeg
```
### Use Docker
Run the following command to build a Docker image from the provided Dockerfile.
```shell
docker build -t inspiremusic .
```
Run the following command to start the docker container in interactive mode.
```shell
docker run -ti --gpus all -v .:/workspace/InspireMusic inspiremusic
```
### Use Docker Compose
Run the following command to build a docker compose environment and docker image from the docker-compose.yml file.
```shell
docker compose up -d --build
```
Run the following command to attach to the docker container in interactive mode.
```shell
docker exec -ti inspire-music bash
```
<a name="quick-start"></a>
### Quick Start
Here is a quick example inference script for music generation.
``` shell
cd InspireMusic
mkdir -p pretrained_models
# Download models
# Option 1: ModelScope
git clone https://www.modelscope.cn/iic/InspireMusic-1.5B-Long.git pretrained_models/InspireMusic-1.5B-Long
# Option 2: HuggingFace (alternative source; both commands clone into the same directory, so run only one of them)
git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long.git pretrained_models/InspireMusic-1.5B-Long
cd examples/music_generation
# run a quick inference example
sh infer_1.5b_long.sh
```
Here is a quick-start script that runs the music generation task end to end, including the data preparation pipeline, model training, and inference.
``` shell
cd InspireMusic/examples/music_generation/
sh run.sh
```
### One-line Inference
#### Text-to-music Task
One-line Shell script for text-to-music task.
``` shell
cd examples/music_generation
# with flow matching, use one-line command to get a quick try
python -m inspiremusic.cli.inference
# customize the configuration with a one-line command like the following
python -m inspiremusic.cli.inference --task text-to-music -m "InspireMusic-1.5B-Long" -g 0 -t "Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance." -c intro -s 0.0 -e 30.0 -r "exp/inspiremusic" -o output -f wav
# without flow matching, use one-line command to get a quick try
python -m inspiremusic.cli.inference --task text-to-music -g 0 -t "Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance." --fast True
```
Alternatively, you can run the inference with just a few lines of Python code.
```python
from inspiremusic.cli.inference import InspireMusic
from inspiremusic.cli.inference import env_variables
if __name__ == "__main__":
    env_variables()
    model = InspireMusic(model_name = "InspireMusic-Base")
    model.inference("text-to-music", "Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.")
```
#### Music Continuation Task
One-line Shell script for music continuation task.
```shell
cd examples/music_generation
# with flow matching
python -m inspiremusic.cli.inference --task continuation -g 0 -a audio_prompt.wav
# without flow matching
python -m inspiremusic.cli.inference --task continuation -g 0 -a audio_prompt.wav --fast True
```
Alternatively, you can run the inference with just a few lines of Python code.
```python
from inspiremusic.cli.inference import InspireMusic
from inspiremusic.cli.inference import env_variables
if __name__ == "__main__":
    env_variables()
    model = InspireMusic(model_name = "InspireMusic-Base")
    # just use audio prompt
    model.inference("continuation", None, "audio_prompt.wav")
    # use both text prompt and audio prompt
    model.inference("continuation", "Continue to generate jazz music.", "audio_prompt.wav")
```
<a name="model-zoo"></a>
## Models
### Download Models
You may download our pretrained InspireMusic models for music generation.
```shell
# use git to download models; please make sure git lfs is installed.
mkdir -p pretrained_models
git clone https://www.modelscope.cn/iic/InspireMusic.git pretrained_models/InspireMusic
```
### Available Models
Currently, we open-source music generation models that support 24kHz mono and 48kHz stereo audio.
The table below presents the links to the ModelScope and Huggingface model hub.
| Model name | Model Links | Remarks |
|--------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------|
| InspireMusic-Base-24kHz | [![model](https://img.shields.io/badge/ModelScope-Model-green.svg)](https://modelscope.cn/models/iic/InspireMusic-Base-24kHz/summary) [![model](https://img.shields.io/badge/HuggingFace-Model-green.svg)](https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz) | Pre-trained Music Generation Model, 24kHz mono, 30s |
| InspireMusic-Base | [![model](https://img.shields.io/badge/ModelScope-Model-green.svg)](https://modelscope.cn/models/iic/InspireMusic/summary) [![model](https://img.shields.io/badge/HuggingFace-Model-green.svg)](https://huggingface.co/FunAudioLLM/InspireMusic-Base) | Pre-trained Music Generation Model, 48kHz, 30s |
| InspireMusic-1.5B-24kHz | [![model](https://img.shields.io/badge/ModelScope-Model-green.svg)](https://modelscope.cn/models/iic/InspireMusic-1.5B-24kHz/summary) [![model](https://img.shields.io/badge/HuggingFace-Model-green.svg)](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz) | Pre-trained Music Generation 1.5B Model, 24kHz mono, 30s |
| InspireMusic-1.5B | [![model](https://img.shields.io/badge/ModelScope-Model-green.svg)](https://modelscope.cn/models/iic/InspireMusic-1.5B/summary) [![model](https://img.shields.io/badge/HuggingFace-Model-green.svg)](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B) | Pre-trained Music Generation 1.5B Model, 48kHz, 30s |
| InspireMusic-1.5B-Long | [![model](https://img.shields.io/badge/ModelScope-Model-green.svg)](https://modelscope.cn/models/iic/InspireMusic-1.5B-Long/summary) [![model](https://img.shields.io/badge/HuggingFace-Model-green.svg)](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long) | Pre-trained Music Generation 1.5B Model, 48kHz, support long-form music generation up to several minutes |
| InspireSong-1.5B | [![model](https://img.shields.io/badge/ModelScope-Model-lightgrey.svg)]() [![model](https://img.shields.io/badge/HuggingFace-Model-lightgrey.svg)]() | Pre-trained Song Generation 1.5B Model, 48kHz stereo |
| InspireAudio-1.5B | [![model](https://img.shields.io/badge/ModelScope-Model-lightgrey.svg)]() [![model](https://img.shields.io/badge/HuggingFace-Model-lightgrey.svg)]() | Pre-trained Audio Generation 1.5B Model, 48kHz stereo |
| Wavtokenizer[<sup>[1]</sup>](https://openreview.net/forum?id=yBlVlS2Fd9) (75Hz) | [![model](https://img.shields.io/badge/ModelScope-Model-green.svg)](https://modelscope.cn/models/iic/InspireMusic-1.5B-Long/file/view/master?fileName=wavtokenizer%252Fmodel.pt) [![model](https://img.shields.io/badge/HuggingFace-Model-green.svg)](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long/tree/main/wavtokenizer) | An extreme low bitrate audio tokenizer for music with one codebook at 24kHz audio. |
| Music_tokenizer (75Hz) | [![model](https://img.shields.io/badge/ModelScope-Model-green.svg)](https://modelscope.cn/models/iic/InspireMusic-1.5B-24kHz/file/view/master?fileName=music_tokenizer%252Fmodel.pt) [![model](https://img.shields.io/badge/HuggingFace-Model-green.svg)](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz/tree/main/music_tokenizer) | A music tokenizer based on HifiCodec<sup>[3]</sup> at 24kHz audio. |
| Music_tokenizer (150Hz) | [![model](https://img.shields.io/badge/ModelScope-Model-green.svg)](https://modelscope.cn/models/iic/InspireMusic-1.5B-Long/file/view/master?fileName=music_tokenizer%252Fmodel.pt) [![model](https://img.shields.io/badge/HuggingFace-Model-green.svg)](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long/tree/main/music_tokenizer) | A music tokenizer based on HifiCodec<sup>[3]</sup> at 48kHz audio. |
<a name="tutorial"></a>
## Basic Usage
At the moment, InspireMusic contains the training and inference codes for [music generation](https://github.com/FunAudioLLM/InspireMusic/tree/main/examples/music_generation).
### Training
Here is an example of training the LLM model; BF16/FP16 training is supported.
```shell
torchrun --nnodes=1 --nproc_per_node=8 \
--rdzv_id=1024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
inspiremusic/bin/train.py \
--train_engine "torch_ddp" \
--config conf/inspiremusic.yaml \
--train_data data/train.data.list \
--cv_data data/dev.data.list \
--model llm \
--model_dir `pwd`/exp/music_generation/llm/ \
--tensorboard_dir `pwd`/tensorboard/music_generation/llm/ \
--ddp.dist_backend "nccl" \
--num_workers 8 \
--prefetch 100 \
--pin_memory \
--deepspeed_config ./conf/ds_stage2.json \
--deepspeed.save_states model+optimizer \
--fp16
```
Here is an example of training the flow matching model; FP16 training is not supported.
```shell
torchrun --nnodes=1 --nproc_per_node=8 \
--rdzv_id=1024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
inspiremusic/bin/train.py \
--train_engine "torch_ddp" \
--config conf/inspiremusic.yaml \
--train_data data/train.data.list \
--cv_data data/dev.data.list \
--model flow \
--model_dir `pwd`/exp/music_generation/flow/ \
--tensorboard_dir `pwd`/tensorboard/music_generation/flow/ \
--ddp.dist_backend "nccl" \
--num_workers 8 \
--prefetch 100 \
--pin_memory \
--deepspeed_config ./conf/ds_stage2.json \
--deepspeed.save_states model+optimizer
```
### Inference
Here is an example script to quickly do model inference.
```shell
cd InspireMusic/examples/music_generation/
sh infer.sh
```
Here is an example code to run inference with normal mode, i.e., with flow matching model for text-to-music and music continuation tasks.
```shell
pretrained_model_dir="pretrained_models/InspireMusic/"
for task in 'text-to-music' 'continuation'; do
python inspiremusic/bin/inference.py --task $task \
--gpu 0 \
--config conf/inspiremusic.yaml \
--prompt_data data/test/parquet/data.list \
--flow_model $pretrained_model_dir/flow.pt \
--llm_model $pretrained_model_dir/llm.pt \
--music_tokenizer $pretrained_model_dir/music_tokenizer \
--wavtokenizer $pretrained_model_dir/wavtokenizer \
--result_dir `pwd`/exp/inspiremusic/${task}_test \
--chorus verse
done
```
Here is an example code to run inference with fast mode, i.e., without flow matching model for text-to-music and music continuation tasks.
```shell
pretrained_model_dir="pretrained_models/InspireMusic/"
for task in 'text-to-music' 'continuation'; do
python inspiremusic/bin/inference.py --task $task \
--gpu 0 \
--config conf/inspiremusic.yaml \
--prompt_data data/test/parquet/data.list \
--flow_model $pretrained_model_dir/flow.pt \
--llm_model $pretrained_model_dir/llm.pt \
--music_tokenizer $pretrained_model_dir/music_tokenizer \
--wavtokenizer $pretrained_model_dir/wavtokenizer \
--result_dir `pwd`/exp/inspiremusic/${task}_test \
--chorus verse \
--fast
done
```
### Hardware requirements
In previous tests on an H800 GPU, InspireMusic generated 30 seconds of audio with a real-time factor (RTF) of around 1.6~1.8. For normal mode, we recommend using hardware with at least 24GB of GPU memory for a better experience. For fast mode, 12GB of GPU memory is enough.
## Citation
```bibtex
@misc{InspireMusic2025,
title={InspireMusic: Integrating Super Resolution and Large Language Model for High-Fidelity Long-Form Music Generation},
author={Chong Zhang and Yukun Ma and Qian Chen and Wen Wang and Shengkui Zhao and Zexu Pan and Hao Wang and Chongjia Ni and Trung Hieu Nguyen and Kun Zhou and Yidi Jiang and Chaohong Tan and Zhifu Gao and Zhihao Du and Bin Ma},
year={2025},
eprint={2503.00084},
archivePrefix={arXiv},
primaryClass={cs.SD},
url={https://arxiv.org/abs/2503.00084},
}
```
---
## Disclaimer
The content provided above is for research purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.

BIN
asset/.DS_Store vendored Normal file

Binary file not shown.

BIN
asset/logo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 354 KiB

28
config.json Normal file
View File

@@ -0,0 +1,28 @@
{
"architectures": [
"Qwen2ForCausalLM"
],
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151643,
"hidden_act": "silu",
"hidden_size": 1536,
"initializer_range": 0.02,
"intermediate_size": 8960,
"max_position_embeddings": 131072,
"max_window_layers": 28,
"model_type": "qwen2",
"num_attention_heads": 12,
"num_hidden_layers": 28,
"num_key_value_heads": 2,
"rms_norm_eps": 1e-06,
"rope_theta": 1000000.0,
"sliding_window": 131072,
"tie_word_embeddings": true,
"torch_dtype": "bfloat16",
"transformers_version": "4.40.1",
"use_cache": true,
"use_mrope": false,
"use_sliding_window": false,
"vocab_size": 151936
}

1
configuration.json Normal file
View File

@@ -0,0 +1 @@
{"task":"audio-generation"}

3
flow.pt Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b3a58f28ac60b6771690c87bc0461325bb9b071975b20e520b33f61f0fcb7687
size 306396789

7
generation_config.json Normal file
View File

@@ -0,0 +1,7 @@
{
"bos_token_id": 151643,
"do_sample": false,
"eos_token_id": 151643,
"max_new_tokens": 2048,
"transformers_version": "4.37.0"
}

171
inspiremusic.yaml Normal file
View File

@@ -0,0 +1,171 @@
# set random seed, so that you may reproduce your result.
__set_seed1: !apply:random.seed [1024]
__set_seed2: !apply:numpy.random.seed [1024]
__set_seed3: !apply:torch.manual_seed [1024]
__set_seed4: !apply:torch.cuda.manual_seed_all [1024]
# fixed params
sample_rate: 24000
text_encoder_input_size: 512
llm_input_size: 1536
llm_output_size: 1536
basemodel_path: '../../pretrained_models/InspireMusic-1.5B-Long/'
generator_path: '../../pretrained_models/InspireMusic-1.5B-Long/music_tokenizer'
# model params
# for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find every corresponding class/function from this single yaml.
# for system/third_party class/function, we do not require this.
llm: !new:inspiremusic.llm.llm.LLM
text_encoder_input_size: !ref <text_encoder_input_size>
llm_input_size: !ref <llm_input_size>
llm_output_size: !ref <llm_output_size>
audio_token_size: 4096
length_normalized_loss: True
lsm_weight: 0
text_encoder_conf:
name: "none"
llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
input_size: !ref <text_encoder_input_size>
pretrain_path: !ref <basemodel_path>
sampling: !name:inspiremusic.utils.common.topk_sampling
top_k: 350
train_cfg_ratio: 0.2
infer_cfg_ratio: 3.0
flow: !new:inspiremusic.flow.flow.MaskedDiff
input_size: 256
output_size: 80
output_type: 'mel'
vocab_size: 4096
input_frame_rate: 75
only_mask_loss: True
encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
output_size: 512
attention_heads: 4
linear_units: 1024
num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
normalize_before: True
input_layer: 'linear'
pos_enc_layer_type: 'rel_pos_espnet'
selfattention_layer_type: 'rel_selfattn'
input_size: 256
use_cnn_module: False
macaron_style: False
length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
channels: 512
sampling_ratios: [1, 1, 1, 1]
decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
in_channels: 240
cfm_params: !new:omegaconf.DictConfig
content:
sigma_min: 1e-06
solver: 'euler'
t_scheduler: 'cosine'
training_cfg_rate: 0.2
inference_cfg_rate: 0.7
reg_loss_type: 'l1'
estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
in_channels: 1024
out_channels: 512
channels: [256, 256]
dropout: 0.0
attention_head_dim: 64
n_blocks: 4
num_mid_blocks: 8
num_heads: 8
act_fn: 'gelu'
generator_model_dir: !ref <generator_path>
hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
in_channels: 80
base_channels: 512
nb_harmonics: 8
sampling_rate: !ref <sample_rate>
nsf_alpha: 0.1
nsf_sigma: 0.003
nsf_voiced_threshold: 10
upsample_rates: [8, 8]
upsample_kernel_sizes: [16, 16]
istft_params:
n_fft: 16
hop_len: 4
resblock_kernel_sizes: [3, 7, 11]
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
source_resblock_kernel_sizes: [7, 11]
source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
lrelu_slope: 0.1
audio_limit: 0.99
f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
num_class: 1
in_channels: 80
cond_channels: 512
wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
# processor functions
parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
tokenizer_path: !ref <basemodel_path>
tokenizer_name: "qwen-2.5"
allowed_special: 'all'
tokenize: !name:inspiremusic.dataset.processor.tokenize
get_tokenizer: !ref <get_tokenizer>
allowed_special: !ref <allowed_special>
filter: !name:inspiremusic.dataset.processor.filter
max_length: 28000
min_length: 0
token_max_length: 200
token_min_length: 1
resample: !name:inspiremusic.dataset.processor.resample
resample_rate: !ref <sample_rate>
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
n_fft: 1024
num_mels: 128
sampling_rate: !ref <sample_rate>
hop_size: 256
win_size: 1024
fmin: 0
fmax: 24000
center: False
compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
feat_extractor: !ref <feat_extractor>
parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
normalize: True
shuffle: !name:inspiremusic.dataset.processor.shuffle
shuffle_size: 1000
sort: !name:inspiremusic.dataset.processor.sort
sort_size: 500 # sort_size should be less than shuffle_size
batch: !name:inspiremusic.dataset.processor.batch
batch_type: 'dynamic'
max_frames_in_batch: 10000 # llm 12000
padding: !name:inspiremusic.dataset.processor.padding
# dataset processor pipeline
data_pipeline: [
!ref <parquet_opener>,
!ref <tokenize>,
!ref <shuffle>,
!ref <sort>,
!ref <filter>,
!ref <batch>,
!ref <padding>,
]
# train conf
train_conf:
optim: adam
optim_conf:
lr: 0.0001 # change to 0.001 if you want to train flow from scratch
scheduler: warmuplr
scheduler_conf:
warmup_steps: 5000
max_epoch: 200
grad_clip: 5
accum_grad: 2
log_interval: 100
save_per_step: 500

3
llm.pt Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0f78e470893e3d6affe5e0b15210af79d16c2ed676f7a7dc4690723bec65696c
size 6226588239

151387
merges.txt Normal file

File diff suppressed because it is too large Load Diff

3
model.safetensors Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a961db72e75d52b18e6b0c9d379e51a26973b233385e0e127fdda7d648aec796
size 3087467144

View File

@@ -0,0 +1,42 @@
{
"resblock": "1",
"num_gpus": 8,
"batch_size": 140,
"learning_rate": 0.00002,
"adam_b1": 0.5,
"adam_b2": 0.9,
"lr_decay": 0.98,
"seed": 1234,
"upsample_rates": [8,5,4,2],
"upsample_kernel_sizes": [16,11,8,4],
"upsample_initial_channel": 512,
"resblock_kernel_sizes": [3,5,7,9,11,13],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5], [1,3,5], [1,3,5], [1,3,5]],
"segment_size": 48000,
"num_mels": 80,
"num_freq": 1024,
"n_fft": 1024,
"hop_size": 240,
"win_size": 1024,
"sampling_rate": 48000,
"n_code_groups": 2,
"n_codes": 1024,
"codebook_loss_lambda": 1.0,
"commitment_loss_lambda": 0.25,
"fmin": 0,
"fmax": 48000,
"fmax_for_loss": null,
"num_workers": 24,
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:54321",
"world_size": 1
}
}

3
music_tokenizer/model.pt Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ba71efdc50378baf9776d607eb11566907c3810e6f221c316719c02591135626
size 537087507

303282
tokenizer.json Normal file

File diff suppressed because it is too large Load Diff

207
tokenizer_config.json Normal file
View File

@@ -0,0 +1,207 @@
{
"add_bos_token": false,
"add_prefix_space": false,
"added_tokens_decoder": {
"151643": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151644": {
"content": "<|im_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151645": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151646": {
"content": "<|object_ref_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151647": {
"content": "<|object_ref_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151648": {
"content": "<|box_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151649": {
"content": "<|box_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151650": {
"content": "<|quad_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151651": {
"content": "<|quad_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151652": {
"content": "<|vision_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151653": {
"content": "<|vision_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151654": {
"content": "<|vision_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151655": {
"content": "<|image_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151656": {
"content": "<|video_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151657": {
"content": "<tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151658": {
"content": "</tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151659": {
"content": "<|fim_prefix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151660": {
"content": "<|fim_middle|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151661": {
"content": "<|fim_suffix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151662": {
"content": "<|fim_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151663": {
"content": "<|repo_name|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151664": {
"content": "<|file_sep|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
}
},
"additional_special_tokens": [
"<|im_start|>",
"<|im_end|>",
"<|object_ref_start|>",
"<|object_ref_end|>",
"<|box_start|>",
"<|box_end|>",
"<|quad_start|>",
"<|quad_end|>",
"<|vision_start|>",
"<|vision_end|>",
"<|vision_pad|>",
"<|image_pad|>",
"<|video_pad|>"
],
"bos_token": null,
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n 
{{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
"clean_up_tokenization_spaces": false,
"eos_token": "<|endoftext|>",
"errors": "replace",
"model_max_length": 131072,
"pad_token": "<|endoftext|>",
"split_special_tokens": false,
"tokenizer_class": "Qwen2Tokenizer",
"unk_token": null
}

1
vocab.json Normal file

File diff suppressed because one or more lines are too long

164
wavtokenizer/config.yaml Normal file
View File

@@ -0,0 +1,164 @@
# pytorch_lightning==1.8.6
seed_everything: 3407
trainer:
logger:
class_path: pytorch_lightning.loggers.TensorBoardLogger
init_args:
save_dir: ./result/
name: lightning_logs
version: null
log_graph: false
default_hp_metric: true
prefix: ''
sub_dir: null
logdir: null
comment: ''
purge_step: null
max_queue: 10
flush_secs: 120
filename_suffix: ''
write_to_disk: true
comet_config:
disabled: true
enable_checkpointing: true
callbacks:
- class_path: pytorch_lightning.callbacks.LearningRateMonitor
init_args:
logging_interval: null
log_momentum: false
- class_path: pytorch_lightning.callbacks.ModelSummary
init_args:
max_depth: 2
- class_path: pytorch_lightning.callbacks.ModelCheckpoint
init_args:
dirpath: null
filename: wavtokenizer_checkpoint_{epoch}_{step}_{val_loss:.4f}
monitor: val_loss
verbose: false
save_last: true
save_top_k: 10
save_weights_only: false
mode: min
auto_insert_metric_name: true
every_n_train_steps: 1000
train_time_interval: null
every_n_epochs: null
save_on_train_epoch_end: null
- class_path: inspiremusic.wavtokenizer.decoder.helpers.GradNormCallback
default_root_dir: null
gradient_clip_val: null
gradient_clip_algorithm: null
num_nodes: 1
num_processes: null
devices: -1
gpus: null
auto_select_gpus: false
tpu_cores: null
ipus: null
enable_progress_bar: true
overfit_batches: 0.0
track_grad_norm: -1
check_val_every_n_epoch: 1
fast_dev_run: false
accumulate_grad_batches: null
max_epochs: null
min_epochs: null
max_steps: 20000000
min_steps: null
max_time: null
limit_train_batches: null
limit_val_batches: 100
limit_test_batches: null
limit_predict_batches: null
val_check_interval: null
log_every_n_steps: 1000
accelerator: gpu
strategy: ddp
sync_batchnorm: false
precision: 32
enable_model_summary: true
num_sanity_val_steps: 2
resume_from_checkpoint: null
profiler: null
benchmark: null
deterministic: null
reload_dataloaders_every_n_epochs: 0
auto_lr_find: false
replace_sampler_ddp: true
detect_anomaly: false
auto_scale_batch_size: false
plugins: null
amp_backend: native
amp_level: null
move_metrics_to_cpu: false
multiple_trainloader_mode: max_size_cycle
inference_mode: true
ckpt_path: null
data:
class_path: inspiremusic.wavtokenizer.decoder.dataset.VocosDataModule
init_args:
train_params:
filelist_path: train.scp
sampling_rate: 24000
num_samples: 72000
batch_size: 38
num_workers: 8
val_params:
filelist_path: test.scp
sampling_rate: 24000
num_samples: 72000
batch_size: 10
num_workers: 8
model:
class_path: inspiremusic.wavtokenizer.decoder.experiment.WavTokenizer
init_args:
feature_extractor:
class_path: inspiremusic.wavtokenizer.decoder.feature_extractors.EncodecFeatures
init_args:
encodec_model: encodec_24khz
bandwidths:
- 6.6
- 6.6
- 6.6
- 6.6
train_codebooks: true
num_quantizers: 1
dowmsamples:
- 8
- 5
- 4
- 2
vq_bins: 4096
vq_kmeans: 200
backbone:
class_path: inspiremusic.wavtokenizer.decoder.models.VocosBackbone
init_args:
input_channels: 512
dim: 768
intermediate_dim: 2304
num_layers: 12
layer_scale_init_value: null
adanorm_num_embeddings: 4
head:
class_path: inspiremusic.wavtokenizer.decoder.heads.ISTFTHead
init_args:
dim: 768
n_fft: 1280
hop_length: 320
padding: same
resume_config: config.yaml
resume_model: last.ckpt
sample_rate: 24000
initial_learning_rate: 0.0001
num_warmup_steps: 0
mel_loss_coeff: 45.0
mrd_loss_coeff: 1.0
pretrain_mel_steps: 0
decay_mel_coeff: false
evaluate_utmos: false
evaluate_pesq: true
evaluate_periodicty: true
resume: true

3
wavtokenizer/model.pt Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:65dc00edbd293c0b4de81045648688207e5e69f1c32025beaaba0eb273fa851c
size 1754883448