This commit is contained in:
2025-09-09 18:15:48 +08:00
parent b2c94ecd9d
commit 5088f0b50a
602 changed files with 591000 additions and 0 deletions

View File

@@ -0,0 +1,34 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

View File

@@ -0,0 +1,71 @@
---
license: mit
---
# Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis
[Audio samples](https://charactr-platform.github.io/vocos/) |
Paper [[abs]](https://arxiv.org/abs/2306.00814) [[pdf]](https://arxiv.org/pdf/2306.00814.pdf)
Vocos is a fast neural vocoder designed to synthesize audio waveforms from acoustic features. Trained using a Generative
Adversarial Network (GAN) objective, Vocos can generate waveforms in a single forward pass. Unlike other typical
GAN-based vocoders, Vocos does not model audio samples in the time domain. Instead, it generates spectral
coefficients, facilitating rapid audio reconstruction through inverse Fourier transform.
## Installation
To use Vocos only in inference mode, install it using:
```bash
pip install vocos
```
If you wish to train the model, install it with additional dependencies:
```bash
pip install vocos[train]
```
## Usage
### Reconstruct audio from mel-spectrogram
```python
import torch
from vocos import Vocos
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
mel = torch.randn(1, 100, 256) # B, C, T
audio = vocos.decode(mel)
```
Copy-synthesis from a file:
```python
import torchaudio
y, sr = torchaudio.load(YOUR_AUDIO_FILE)
if y.size(0) > 1: # mix to mono
y = y.mean(dim=0, keepdim=True)
y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=24000)
y_hat = vocos(y)
```
## Citation
If this code contributes to your research, please cite our work:
```
@article{siuzdak2023vocos,
title={Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis},
author={Siuzdak, Hubert},
journal={arXiv preprint arXiv:2306.00814},
year={2023}
}
```
## License
The code in this repository is released under the MIT license.

View File

@@ -0,0 +1,24 @@
feature_extractor:
class_path: vocos.feature_extractors.MelSpectrogramFeatures
init_args:
sample_rate: 24000
n_fft: 1024
hop_length: 256
n_mels: 100
padding: center
backbone:
class_path: vocos.models.VocosBackbone
init_args:
input_channels: 100
dim: 512
intermediate_dim: 1536
num_layers: 8
head:
class_path: vocos.heads.ISTFTHead
init_args:
dim: 512
n_fft: 1024
hop_length: 256
padding: center

BIN
f5/charactr/vocos-mel-24khz/pytorch_model.bin (Stored with Git LFS) Normal file

Binary file not shown.