diff --git a/.github/workflows/export-vocos.yaml b/.github/workflows/export-vocos.yaml
new file mode 100644
index 00000000..16981876
--- /dev/null
+++ b/.github/workflows/export-vocos.yaml
@@ -0,0 +1,111 @@
+name: export-vocos-to-onnx
+
+on:
+  push:
+    branches:
+      - export-vocos
+
+  workflow_dispatch:
+
+concurrency:
+  group: export-vocos-to-onnx-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  export-vocos-to-onnx:
+    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
+    name: export vocos ${{ matrix.python-version }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.10"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install Python dependencies
+        shell: bash
+        run: |
+          pip install "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html kaldi_native_fbank
+
+      - name: Run
+        shell: bash
+        run: |
+          cd scripts/vocos
+          ./run.sh
+          ls -lh
+
+      - name: Collect results
+        shell: bash
+        run: |
+          cp -v scripts/vocos/vocos-22khz-univ.onnx .
+          cp -v scripts/vocos/*.wav .
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: generated-waves
+          path: ./*.wav
+
+      - name: Publish to huggingface
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 20
+          timeout_seconds: 200
+          shell: bash
+          command: |
+            git config --global user.email "csukuangfj@gmail.com"
+            git config --global user.name "Fangjun Kuang"
+
+            rm -rf huggingface
+            export GIT_LFS_SKIP_SMUDGE=1
+            export GIT_CLONE_PROTECTION_ACTIVE=false
+
+            git clone https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-onnx-models huggingface
+            cd huggingface
+            git fetch
+            git pull
+
+            d=vocoder-models
+            mkdir -p $d
+
+            cp -a ../vocos-22khz-univ.onnx $d/
+
+            git lfs track "*.onnx"
+            git add .
+
+            ls -lh
+
+            git status
+
+            git commit -m "add models"
+            git push https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-onnx-models main || true
+
+      - name: Release
+        if: github.repository_owner == 'csukuangfj'
+        uses: svenstaro/upload-release-action@v2
+        with:
+          file_glob: true
+          file: ./*.onnx
+          overwrite: true
+          repo_name: k2-fsa/sherpa-onnx
+          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
+          tag: vocoder-models
+
+      - name: Release
+        if: github.repository_owner == 'k2-fsa'
+        uses: svenstaro/upload-release-action@v2
+        with:
+          file_glob: true
+          file: ./*.onnx
+          overwrite: true
+          tag: vocoder-models
+
diff --git a/scripts/vocos/README.md b/scripts/vocos/README.md
new file mode 100644
index 00000000..a3c81f96
--- /dev/null
+++ b/scripts/vocos/README.md
@@ -0,0 +1,5 @@
+# Introduction
+
+This folder contains scripts for exporting the vocos ONNX model from
+https://huggingface.co/BSC-LT
+so that it can be used in sherpa-onnx.
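`add_meta_data.py` below embeds the metadata that sherpa-onnx reads at runtime. As a minimal sketch (not part of this PR), the metadata of the exported model can be inspected with onnxruntime, the same API `test.py` uses; it assumes `run.sh` has already produced `./vocos-22khz-univ.onnx`:

```python
#!/usr/bin/env python3
# Minimal sketch: print the custom metadata written by add_meta_data.py.
# Assumes ./vocos-22khz-univ.onnx was produced by run.sh.
import onnxruntime as ort

sess = ort.InferenceSession(
    "./vocos-22khz-univ.onnx", providers=["CPUExecutionProvider"]
)
for key, value in sess.get_modelmeta().custom_metadata_map.items():
    print(key, "=", value)
```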
diff --git a/scripts/vocos/add_meta_data.py b/scripts/vocos/add_meta_data.py
new file mode 100755
index 00000000..34efb8a2
--- /dev/null
+++ b/scripts/vocos/add_meta_data.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+# Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+
+
+import argparse
+
+import onnx
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--in-model", type=str, required=True, help="input onnx model")
+
+    parser.add_argument(
+        "--out-model", type=str, required=True, help="output onnx model"
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    print(args.in_model, args.out_model)
+
+    model = onnx.load(args.in_model)
+
+    meta_data = {
+        "model_type": "vocos",
+        "model_filename": "mel_spec_22khz_univ.onnx",
+        "sample_rate": 22050,
+        "version": 1,
+        "model_author": "BSC-LT",
+        "maintainer": "k2-fsa",
+        "n_fft": 1024,
+        "hop_length": 256,
+        "win_length": 1024,
+        "window_type": "hann",
+        "center": 1,
+        "pad_mode": "reflect",
+        "normalized": 0,
+        "url1": "https://huggingface.co/BSC-LT/vocos-mel-22khz",
+        "url2": "https://github.com/gemelo-ai/vocos",
+    }
+
+    print(model.metadata_props)
+
+    while len(model.metadata_props):
+        model.metadata_props.pop()
+
+    for key, value in meta_data.items():
+        meta = model.metadata_props.add()
+        meta.key = key
+        meta.value = str(value)
+    print("--------------------")
+
+    print(model.metadata_props)
+
+    onnx.save(model, args.out_model)
+
+    print(f"Saved to {args.out_model}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/vocos/run.sh b/scripts/vocos/run.sh
new file mode 100755
index 00000000..4b4b2b43
--- /dev/null
+++ b/scripts/vocos/run.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+set -ex
+
+if [ ! -f mel_spec_22khz_univ.onnx ]; then
+  curl -SL -O https://huggingface.co/BSC-LT/vocos-mel-22khz/resolve/main/mel_spec_22khz_univ.onnx
+fi
+
+if [ ! -f ./vocos-22khz-univ.onnx ]; then
+  python3 ./add_meta_data.py --in-model ./mel_spec_22khz_univ.onnx --out-model ./vocos-22khz-univ.onnx
+fi
+
+# The following is for testing
+if [ ! -f ./matcha-icefall-en_US-ljspeech/tokens.txt ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+  tar xf matcha-icefall-en_US-ljspeech.tar.bz2
+  rm matcha-icefall-en_US-ljspeech.tar.bz2
+fi
+
+if [ ! -f ./hifigan_v2.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+fi
+
+python3 ./test.py
+ls -lh
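`run.sh` above is what the CI workflow invokes. As a sketch of the local equivalent, the following mirrors the workflow's "Install Python dependencies" and "Run" steps, run from the repository root:

```bash
# Sketch of reproducing the CI export locally.
pip install "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 soundfile \
  kaldi_native_fbank \
  piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
cd scripts/vocos
./run.sh          # downloads the model, adds metadata, runs test.py
ls -lh ./*.onnx ./*.wav
```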
diff --git a/scripts/vocos/test.py b/scripts/vocos/test.py
new file mode 100755
index 00000000..78a7c0d4
--- /dev/null
+++ b/scripts/vocos/test.py
@@ -0,0 +1,290 @@
+#!/usr/bin/env python3
+# Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+
+import datetime as dt
+
+import kaldi_native_fbank as knf
+import numpy as np
+import onnxruntime as ort
+import soundfile as sf
+
+try:
+    from piper_phonemize import phonemize_espeak
+except Exception as ex:
+    raise RuntimeError(
+        f"{ex}\nPlease run\n"
+        "pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html"
+    )
+
+
+class OnnxVocosModel:
+    def __init__(
+        self,
+        filename: str,
+    ):
+        session_opts = ort.SessionOptions()
+        session_opts.inter_op_num_threads = 1
+        session_opts.intra_op_num_threads = 1
+
+        self.session_opts = session_opts
+        self.model = ort.InferenceSession(
+            filename,
+            sess_options=self.session_opts,
+            providers=["CPUExecutionProvider"],
+        )
+
+        print("----------vocos----------")
+        for i in self.model.get_inputs():
+            print(i)
+
+        print("-----")
+
+        for i in self.model.get_outputs():
+            print(i)
+        print()
+
+    def __call__(self, x: np.ndarray):
+        """
+        Args:
+          x: (N, feat_dim, num_frames)
+        Returns:
+          mag: (N, n_fft/2+1, num_frames)
+          x: (N, n_fft/2+1, num_frames)
+          y: (N, n_fft/2+1, num_frames)
+
+          The complex spectrum is mag * (x + j*y)
+        """
+        assert x.ndim == 3, x.shape
+        assert x.shape[0] == 1, x.shape
+
+        mag, x, y = self.model.run(
+            [
+                self.model.get_outputs()[0].name,
+                self.model.get_outputs()[1].name,
+                self.model.get_outputs()[2].name,
+            ],
+            {
+                self.model.get_inputs()[0].name: x,
+            },
+        )
+
+        return mag, x, y
+
+
+class OnnxHifiGANModel:
+    def __init__(
+        self,
+        filename: str,
+    ):
+        session_opts = ort.SessionOptions()
+        session_opts.inter_op_num_threads = 1
+        session_opts.intra_op_num_threads = 1
+
+        self.session_opts = session_opts
+        self.model = ort.InferenceSession(
+            filename,
+            sess_options=self.session_opts,
+            providers=["CPUExecutionProvider"],
+        )
+
+        print("----------hifigan----------")
+        for i in self.model.get_inputs():
+            print(i)
+
+        print("-----")
+
+        for i in self.model.get_outputs():
+            print(i)
+        print()
+
+    def __call__(self, x: np.ndarray):
+        """
+        Args:
+          x: (N, feat_dim, num_frames)
+        Returns:
+          audio: (N, num_samples)
+        """
+        assert x.ndim == 3, x.shape
+        assert x.shape[0] == 1, x.shape
+
+        audio = self.model.run(
+            [self.model.get_outputs()[0].name],
+            {
+                self.model.get_inputs()[0].name: x,
+            },
+        )[0]
+        # audio: (batch_size, num_samples)
+
+        return audio
+
+
+def load_tokens(filename):
+    token2id = dict()
+    with open(filename, encoding="utf-8") as f:
+        for line in f:
+            fields = line.strip().split()
+            if len(fields) == 1:
+                t = " "
+                idx = int(fields[0])
+            else:
+                t, idx = line.strip().split()
+            token2id[t] = int(idx)
+    return token2id
+
+
+class OnnxModel:
+    def __init__(
+        self,
+        filename: str,
+        tokens: str,
+    ):
+        self.token2id = load_tokens(tokens)
+        session_opts = ort.SessionOptions()
+        session_opts.inter_op_num_threads = 1
+        session_opts.intra_op_num_threads = 1
+
+        self.session_opts = session_opts
+        self.model = ort.InferenceSession(
+            filename,
+            sess_options=self.session_opts,
+            providers=["CPUExecutionProvider"],
+        )
+
+        print(f"{self.model.get_modelmeta().custom_metadata_map}")
+        metadata = self.model.get_modelmeta().custom_metadata_map
+        self.sample_rate = int(metadata["sample_rate"])
+
+        print("----------matcha----------")
+        for i in self.model.get_inputs():
+            print(i)
+
+        print("-----")
+
+        for i in self.model.get_outputs():
+            print(i)
+        print()
+
+    def __call__(self, x: np.ndarray):
+        """
+        Args:
+          x: (1, num_tokens), int64 token IDs
+        Returns:
+          mel: (1, feat_dim, num_frames)
+        """
+        assert x.ndim == 2, x.shape
+        assert x.shape[0] == 1, x.shape
+
+        x_lengths = np.array([x.shape[1]], dtype=np.int64)
+
+        noise_scale = np.array([1.0], dtype=np.float32)
+        length_scale = np.array([1.0], dtype=np.float32)
+
+        mel = self.model.run(
+            [self.model.get_outputs()[0].name],
+            {
+                self.model.get_inputs()[0].name: x,
+                self.model.get_inputs()[1].name: x_lengths,
+                self.model.get_inputs()[2].name: noise_scale,
+                self.model.get_inputs()[3].name: length_scale,
+            },
+        )[0]
+        # mel: (batch_size, feat_dim, num_frames)
+
+        return mel
+
+
+def main():
+    am = OnnxModel(
+        filename="./matcha-icefall-en_US-ljspeech/model-steps-3.onnx",
+        tokens="./matcha-icefall-en_US-ljspeech/tokens.txt",
+    )
+    vocoder = OnnxHifiGANModel("./hifigan_v2.onnx")
+    vocos = OnnxVocosModel("./mel_spec_22khz_univ.onnx")
+
+    text = "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
+    tokens_list = phonemize_espeak(text, "en-us")
+    print(tokens_list)
+    tokens = []
+    for t in tokens_list:
+        tokens.extend(t)
+
+    token_ids = []
+    for t in tokens:
+        if t not in am.token2id:
+            print(f"Skip OOV '{t}'")
+            continue
+        token_ids.append(am.token2id[t])
+
+    token_ids2 = [am.token2id["_"]] * (len(token_ids) * 2 + 1)
+    token_ids2[1::2] = token_ids
+    token_ids = token_ids2
+    x = np.array([token_ids], dtype=np.int64)
+
+    mel_start_t = dt.datetime.now()
+    mel = am(x)
+    mel_end_t = dt.datetime.now()
+
+    print("mel", mel.shape)
+    # mel: (1, 80, 78)
+
+    vocos_start_t = dt.datetime.now()
+    mag, x, y = vocos(mel)
+    stft_result = knf.StftResult(
+        real=(mag * x)[0].transpose().reshape(-1).tolist(),
+        imag=(mag * y)[0].transpose().reshape(-1).tolist(),
+        num_frames=mag.shape[2],
+    )
+    config = knf.StftConfig(
+        n_fft=1024,
+        hop_length=256,
+        win_length=1024,
+        window_type="hann",
+        center=True,
+        pad_mode="reflect",
+        normalized=False,
+    )
+    istft = knf.IStft(config)
+    audio_vocos = istft(stft_result)
+    vocos_end_t = dt.datetime.now()
+
+    audio_vocos = np.array(audio_vocos)
+    # audio = audio / 2
+    print("vocos max/min", np.max(audio_vocos), np.min(audio_vocos))
+
+    sf.write("vocos.wav", audio_vocos, am.sample_rate, "PCM_16")
+
+    hifigan_start_t = dt.datetime.now()
+    audio_hifigan = vocoder(mel)
+    hifigan_end_t = dt.datetime.now()
+    audio_hifigan = audio_hifigan.squeeze()
+
+    print("hifigan max/min", np.max(audio_hifigan), np.min(audio_hifigan))
+
+    sample_rate = am.sample_rate
+    sf.write("hifigan-v2.wav", audio_hifigan, sample_rate, "PCM_16")
+
+    am_t = (mel_end_t - mel_start_t).total_seconds()
+    vocos_t = (vocos_end_t - vocos_start_t).total_seconds()
+    hifigan_t = (hifigan_end_t - hifigan_start_t).total_seconds()
+
+    mean_audio_duration = (
+        (audio_vocos.shape[-1] + audio_hifigan.shape[-1]) / 2 / sample_rate
+    )
+    rtf_am = am_t / mean_audio_duration
+
+    rtf_vocos = vocos_t * sample_rate / audio_vocos.shape[-1]
+    rtf_hifigan = hifigan_t * sample_rate / audio_hifigan.shape[-1]
+
+    print(
+        "Audio duration for vocos {:.3f} s".format(audio_vocos.shape[-1] / sample_rate)
+    )
+    print(
+        "Audio duration for hifigan {:.3f} s".format(
+            audio_hifigan.shape[-1] / sample_rate
+        )
+    )
+    print("Mean audio duration: {:.3f} s".format(mean_audio_duration))
+    print("RTF for acoustic model {:.3f}".format(rtf_am))
+    print("RTF for vocos {:.3f}".format(rtf_vocos))
+    print("RTF for hifigan {:.3f}".format(rtf_hifigan))
+
+
+if __name__ == "__main__":
+    main()
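`test.py` rebuilds the waveform from the vocos outputs with kaldi_native_fbank's `IStft`, using the complex spectrum `mag * (x + j*y)`. As a hedged cross-check of that step only, the same reconstruction can be sketched with librosa (librosa is an assumption here and is not used anywhere in this PR):

```python
# Hedged sketch: reconstruct the waveform from the vocos outputs with librosa
# instead of kaldi_native_fbank. Assumes mag, x, y were produced as in test.py,
# i.e. each has shape (1, n_fft/2+1, num_frames), and uses the STFT settings
# stored in the model metadata (n_fft=1024, hop_length=256, win_length=1024,
# hann window, center=True).
import librosa
import numpy as np


def istft_with_librosa(mag: np.ndarray, x: np.ndarray, y: np.ndarray) -> np.ndarray:
    spec = (mag * (x + 1j * y))[0]  # (n_fft/2+1, num_frames), complex
    audio = librosa.istft(
        spec,
        hop_length=256,
        win_length=1024,
        n_fft=1024,
        window="hann",
        center=True,
    )
    return audio  # 1-D waveform at 22050 Hz
```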