diff --git a/.github/scripts/test-speaker-recognition-python.sh b/.github/scripts/test-speaker-recognition-python.sh index 6131983d..7d6eff9f 100755 --- a/.github/scripts/test-speaker-recognition-python.sh +++ b/.github/scripts/test-speaker-recognition-python.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -set -e +set -ex log() { # This function is from espnet @@ -21,18 +21,19 @@ model_dir=$d/wespeaker mkdir -p $model_dir pushd $model_dir models=( -en_voxceleb_CAM++.onnx -en_voxceleb_CAM++_LM.onnx -en_voxceleb_resnet152_LM.onnx -en_voxceleb_resnet221_LM.onnx -en_voxceleb_resnet293_LM.onnx -en_voxceleb_resnet34.onnx -en_voxceleb_resnet34_LM.onnx -zh_cnceleb_resnet34.onnx -zh_cnceleb_resnet34_LM.onnx +wespeaker_en_voxceleb_CAM++.onnx +wespeaker_en_voxceleb_CAM++_LM.onnx +wespeaker_en_voxceleb_resnet152_LM.onnx +wespeaker_en_voxceleb_resnet221_LM.onnx +wespeaker_en_voxceleb_resnet293_LM.onnx +wespeaker_en_voxceleb_resnet34.onnx +wespeaker_en_voxceleb_resnet34_LM.onnx +wespeaker_zh_cnceleb_resnet34.onnx +wespeaker_zh_cnceleb_resnet34_LM.onnx ) for m in ${models[@]}; do wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/$m + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/wespeaker_en_voxceleb_CAM++_LM.onnx done ls -lh popd @@ -42,13 +43,13 @@ model_dir=$d/3dspeaker mkdir -p $model_dir pushd $model_dir models=( -speech_campplus_sv_en_voxceleb_16k.onnx -speech_campplus_sv_zh-cn_16k-common.onnx -speech_eres2net_base_200k_sv_zh-cn_16k-common.onnx -speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx -speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx -speech_eres2net_sv_en_voxceleb_16k.onnx -speech_eres2net_sv_zh-cn_16k-common.onnx +3dspeaker_speech_campplus_sv_en_voxceleb_16k.onnx +3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx +3dspeaker_speech_eres2net_base_200k_sv_zh-cn_16k-common.onnx +3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx +3dspeaker_speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx +3dspeaker_speech_eres2net_sv_en_voxceleb_16k.onnx +3dspeaker_speech_eres2net_sv_zh-cn_16k-common.onnx ) for m in ${models[@]}; do wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/$m diff --git a/.github/workflows/export-nemo-speaker-verification-to-onnx.yaml b/.github/workflows/export-nemo-speaker-verification-to-onnx.yaml new file mode 100644 index 00000000..f05aa158 --- /dev/null +++ b/.github/workflows/export-nemo-speaker-verification-to-onnx.yaml @@ -0,0 +1,45 @@ +name: export-nemo-speaker-verification-to-onnx + +on: + workflow_dispatch: + +concurrency: + group: export-nemo-speaker-verification-to-onnx-${{ github.ref }} + cancel-in-progress: true + +jobs: + export-nemo-speaker-verification-to-onnx: + if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj' + name: export nemo speaker verification models to ONNX + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.10"] + + steps: + - uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Run + shell: bash + run: | + cd scripts/nemo/speaker-verification + ./run.sh + + mv -v *.onnx ../../.. + + - name: Release + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + file: ./*.onnx + overwrite: true + repo_name: k2-fsa/sherpa-onnx + repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} + tag: speaker-recongition-models diff --git a/python-api-examples/speaker-identification.py b/python-api-examples/speaker-identification.py index 20b46639..f2791a36 100755 --- a/python-api-examples/speaker-identification.py +++ b/python-api-examples/speaker-identification.py @@ -29,7 +29,7 @@ Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models to download a model. An example is given below: - wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/zh_cnceleb_resnet34.onnx + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/wespeaker_zh_cnceleb_resnet34.onnx Note that `zh` means Chinese, while `en` means English. @@ -39,7 +39,7 @@ Assume the filename of the text file is speaker.txt. python3 ./python-api-examples/speaker-identification.py \ --speaker-file ./speaker.txt \ - --model ./zh_cnceleb_resnet34.onnx + --model ./wespeaker_zh_cnceleb_resnet34.onnx """ import argparse import queue diff --git a/scripts/3dspeaker/run.sh b/scripts/3dspeaker/run.sh index f0e875dd..6961215e 100755 --- a/scripts/3dspeaker/run.sh +++ b/scripts/3dspeaker/run.sh @@ -60,4 +60,6 @@ for model in ${models[@]}; do --model ${model}.onnx \ --file1 ./speaker1_a_en_16k.wav \ --file2 ./speaker2_a_en_16k.wav + + mv ${model}.onnx 3dspeaker_${model}.onnx done diff --git a/scripts/nemo/README.md b/scripts/nemo/README.md new file mode 100644 index 00000000..a0eb837a --- /dev/null +++ b/scripts/nemo/README.md @@ -0,0 +1,7 @@ +# Introduction + +This directory contains scripts for exporting models +from [NeMo](https://github.com/NVIDIA/NeMo/) to onnx +so that you can use them in `sherpa-onnx`. + +- [./speaker-verification](./speaker-verification) contains models for speaker verification. diff --git a/scripts/nemo/speaker-verification/README.md b/scripts/nemo/speaker-verification/README.md new file mode 100644 index 00000000..78ae8775 --- /dev/null +++ b/scripts/nemo/speaker-verification/README.md @@ -0,0 +1,14 @@ +# Introduction + +This directory contains script for exporting speaker verification models +from [NeMo](https://github.com/NVIDIA/NeMo/) to onnx +so that you can use them in `sherpa-onnx`. + +Specifically, the following 4 models are exported to `sherpa-onnx` +from +[this page](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speaker_recognition/results.html#speaker-recognition-models): + + - [titanet_large](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/titanet_large), + - [titanet_small](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/titanet_small) + - [speakerverification_speakernet](https://ngc.nvidia.com/catalog/models/nvidia:nemo:speakerverification_speakernet) + - [ecapa_tdnn](https://ngc.nvidia.com/catalog/models/nvidia:nemo:ecapa_tdnn) diff --git a/scripts/nemo/speaker-verification/export-onnx.py b/scripts/nemo/speaker-verification/export-onnx.py new file mode 100755 index 00000000..aa28a3b9 --- /dev/null +++ b/scripts/nemo/speaker-verification/export-onnx.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang) + +import argparse +from typing import Dict + +import nemo.collections.asr as nemo_asr +import onnx +import torch + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model", + type=str, + required=True, + choices=[ + "speakerverification_speakernet", + "titanet_large", + "titanet_small", + "ecapa_tdnn", + ], + ) + return parser.parse_args() + + +def add_meta_data(filename: str, meta_data: Dict[str, str]): + """Add meta data to an ONNX model. It is changed in-place. + + Args: + filename: + Filename of the ONNX model to be changed. + meta_data: + Key-value pairs. + """ + model = onnx.load(filename) + for key, value in meta_data.items(): + meta = model.metadata_props.add() + meta.key = key + meta.value = str(value) + + onnx.save(model, filename) + + +@torch.no_grad() +def main(): + args = get_args() + speaker_model_config = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained( + model_name=args.model, return_config=True + ) + preprocessor_config = speaker_model_config["preprocessor"] + + print(args.model) + print(speaker_model_config) + print(preprocessor_config) + + assert preprocessor_config["n_fft"] == 512, preprocessor_config + + assert ( + preprocessor_config["_target_"] + == "nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor" + ), preprocessor_config + + assert preprocessor_config["frame_splicing"] == 1, preprocessor_config + + speaker_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained( + model_name=args.model + ) + speaker_model.eval() + filename = f"nemo_en_{args.model}.onnx" + speaker_model.export(filename) + + print(f"Adding metadata to {filename}") + + comment = "This model is from NeMo." + url = { + "titanet_large": "https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/titanet_large", + "titanet_small": "https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/titanet_small", + "speakerverification_speakernet": "https://ngc.nvidia.com/catalog/models/nvidia:nemo:speakerverification_speakernet", + "ecapa_tdnn": "https://ngc.nvidia.com/catalog/models/nvidia:nemo:ecapa_tdnn", + }[args.model] + + language = "English" + + meta_data = { + "framework": "nemo", + "language": language, + "url": url, + "comment": comment, + "sample_rate": preprocessor_config["sample_rate"], + "output_dim": speaker_model_config["decoder"]["emb_sizes"], + "feature_normalize_type": preprocessor_config["normalize"], + "window_size_ms": int(float(preprocessor_config["window_size"]) * 1000), + "window_stride_ms": int(float(preprocessor_config["window_stride"]) * 1000), + "window_type": preprocessor_config["window"], # e.g., hann + "feat_dim": preprocessor_config["features"], + } + print(meta_data) + add_meta_data(filename=filename, meta_data=meta_data) + + +if __name__ == "__main__": + main() diff --git a/scripts/nemo/speaker-verification/run.sh b/scripts/nemo/speaker-verification/run.sh new file mode 100755 index 00000000..f5a22801 --- /dev/null +++ b/scripts/nemo/speaker-verification/run.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang) + +set -ex + +function install_nemo() { + curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py + python3 get-pip.py + + pip install torch==2.1.0+cpu torchaudio==2.1.0+cpu -f https://download.pytorch.org/whl/torch_stable.html + + pip install wget text-unidecode matplotlib>=3.3.2 onnx onnxruntime pybind11 Cython einops kaldi-native-fbank soundfile + + sudo apt-get install -q -y sox libsndfile1 ffmpeg python3-pip + + BRANCH='main' + python3 -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr] +} + +install_nemo + +model_list=( +speakerverification_speakernet +titanet_large +titanet_small +# ecapa_tdnn # causes errors, see https://github.com/NVIDIA/NeMo/issues/8168 +) + +for model in ${model_list[@]}; do + python3 ./export-onnx.py --model $model +done + +ls -lh + +function download_test_data() { + wget -q https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker1_a_en_16k.wav + wget -q https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker1_b_en_16k.wav + wget -q https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker2_a_en_16k.wav +} + +download_test_data + +for model in ${model_list[@]}; do + python3 ./test-onnx.py \ + --model nemo_en_${model}.onnx \ + --file1 ./speaker1_a_en_16k.wav \ + --file2 ./speaker1_b_en_16k.wav + + python3 ./test-onnx.py \ + --model nemo_en_${model}.onnx \ + --file1 ./speaker1_a_en_16k.wav \ + --file2 ./speaker2_a_en_16k.wav +done diff --git a/scripts/nemo/speaker-verification/test-onnx.py b/scripts/nemo/speaker-verification/test-onnx.py new file mode 100755 index 00000000..891b0a89 --- /dev/null +++ b/scripts/nemo/speaker-verification/test-onnx.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 +# Copyright 2023-2024 Xiaomi Corp. (authors: Fangjun Kuang) + +""" +This script computes speaker similarity score in the range [0-1] +of two wave files using a speaker embedding model. +""" +import argparse +import wave +from pathlib import Path + +import kaldi_native_fbank as knf +import numpy as np +import onnxruntime as ort +from numpy.linalg import norm + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model", + type=str, + required=True, + help="Path to the input onnx model. Example value: model.onnx", + ) + + parser.add_argument( + "--file1", + type=str, + required=True, + help="Input wave 1", + ) + + parser.add_argument( + "--file2", + type=str, + required=True, + help="Input wave 2", + ) + + return parser.parse_args() + + +def read_wavefile(filename, expected_sample_rate: int = 16000) -> np.ndarray: + """ + Args: + filename: + Path to a wave file, which must be of 16-bit and 16kHz. + expected_sample_rate: + Expected sample rate of the wave file. + Returns: + Return a 1-D float32 array containing audio samples. Each sample is in + the range [-1, 1]. + """ + filename = str(filename) + with wave.open(filename) as f: + wave_file_sample_rate = f.getframerate() + assert wave_file_sample_rate == expected_sample_rate, ( + wave_file_sample_rate, + expected_sample_rate, + ) + + num_channels = f.getnchannels() + assert f.getsampwidth() == 2, f.getsampwidth() # it is in bytes + num_samples = f.getnframes() + samples = f.readframes(num_samples) + samples_int16 = np.frombuffer(samples, dtype=np.int16) + samples_int16 = samples_int16.reshape(-1, num_channels)[:, 0] + samples_float32 = samples_int16.astype(np.float32) + + samples_float32 = samples_float32 / 32768 + + return samples_float32 + + +def compute_features(samples: np.ndarray, model: "OnnxModel") -> np.ndarray: + fbank_opts = knf.FbankOptions() + fbank_opts.frame_opts.samp_freq = model.sample_rate + fbank_opts.frame_opts.frame_length_ms = model.window_size_ms + fbank_opts.frame_opts.frame_shift_ms = model.window_stride_ms + fbank_opts.frame_opts.dither = 0 + fbank_opts.frame_opts.remove_dc_offset = False + fbank_opts.frame_opts.window_type = model.window_type + + fbank_opts.mel_opts.num_bins = model.feat_dim + fbank_opts.mel_opts.low_freq = 0 + fbank_opts.mel_opts.is_librosa = True + + fbank = knf.OnlineFbank(fbank_opts) + fbank.accept_waveform(model.sample_rate, samples) + fbank.input_finished() + + features = [] + for i in range(fbank.num_frames_ready): + f = fbank.get_frame(i) + features.append(f) + features = np.stack(features, axis=0) + # at this point, the shape of features is (T, C) + + if model.feature_normalize_type != "": + assert model.feature_normalize_type == "per_feature" + mean = np.mean(features, axis=0, keepdims=True) + std = np.std(features, axis=0, keepdims=True) + features = (features - mean) / std + + feature_len = features.shape[0] + pad = 16 - feature_len % 16 + + if pad > 0: + padding = np.zeros((pad, features.shape[1]), dtype=np.float32) + features = np.concatenate([features, padding]) + + features = np.expand_dims(features, axis=0) + + return features, feature_len + + +class OnnxModel: + def __init__( + self, + filename: str, + ): + session_opts = ort.SessionOptions() + session_opts.inter_op_num_threads = 1 + session_opts.intra_op_num_threads = 1 + + self.session_opts = session_opts + + self.model = ort.InferenceSession( + filename, + sess_options=self.session_opts, + ) + + meta = self.model.get_modelmeta().custom_metadata_map + self.framework = meta["framework"] + self.sample_rate = int(meta["sample_rate"]) + self.output_dim = int(meta["output_dim"]) + self.feature_normalize_type = meta["feature_normalize_type"] + self.window_size_ms = int(meta["window_size_ms"]) + self.window_stride_ms = int(meta["window_stride_ms"]) + self.window_type = meta["window_type"] + self.feat_dim = int(meta["feat_dim"]) + print(meta) + + assert self.framework == "nemo", self.framework + + def __call__(self, x: np.ndarray, x_lens: int) -> np.ndarray: + """ + Args: + x: + A 2-D float32 tensor of shape (T, C). + y: + A 1-D float32 tensor containing model output. + """ + x = x.transpose(0, 2, 1) # (B, T, C) -> (B, C, T) + x_lens = np.asarray([x_lens], dtype=np.int64) + + return self.model.run( + [ + self.model.get_outputs()[1].name, + ], + { + self.model.get_inputs()[0].name: x, + self.model.get_inputs()[1].name: x_lens, + }, + )[0][0] + + +def main(): + args = get_args() + print(args) + filename = Path(args.model) + file1 = Path(args.file1) + file2 = Path(args.file2) + assert filename.is_file(), filename + assert file1.is_file(), file1 + assert file2.is_file(), file2 + + model = OnnxModel(filename) + wave1 = read_wavefile(file1, model.sample_rate) + wave2 = read_wavefile(file2, model.sample_rate) + + features1, features1_len = compute_features(wave1, model) + features2, features2_len = compute_features(wave2, model) + + output1 = model(features1, features1_len) + output2 = model(features2, features2_len) + + similarity = np.dot(output1, output2) / (norm(output1) * norm(output2)) + print(f"similarity in the range [0-1]: {similarity}") + + +if __name__ == "__main__": + main() diff --git a/scripts/wespeaker/run.sh b/scripts/wespeaker/run.sh index 14b45dda..be7aa840 100755 --- a/scripts/wespeaker/run.sh +++ b/scripts/wespeaker/run.sh @@ -24,7 +24,7 @@ ls -lh --file1 ./wespeaker-models/test_wavs/00001_spk1.wav \ --file2 ./wespeaker-models/test_wavs/00010_spk2.wav -mv voxceleb_resnet34.onnx en_voxceleb_resnet34.onnx +mv voxceleb_resnet34.onnx wespeaker_en_voxceleb_resnet34.onnx ./add_meta_data.py \ --model ./voxceleb_resnet34_LM.onnx \ @@ -38,7 +38,7 @@ mv voxceleb_resnet34.onnx en_voxceleb_resnet34.onnx --file1 ./wespeaker-models/test_wavs/00001_spk1.wav \ --file2 ./wespeaker-models/test_wavs/00010_spk2.wav -mv voxceleb_resnet34_LM.onnx en_voxceleb_resnet34_LM.onnx +mv voxceleb_resnet34_LM.onnx wespeaker_en_voxceleb_resnet34_LM.onnx ./add_meta_data.py \ --model ./voxceleb_resnet152_LM.onnx \ @@ -53,7 +53,7 @@ mv voxceleb_resnet34_LM.onnx en_voxceleb_resnet34_LM.onnx --file1 ./wespeaker-models/test_wavs/00001_spk1.wav \ --file2 ./wespeaker-models/test_wavs/00010_spk2.wav -mv voxceleb_resnet152_LM.onnx en_voxceleb_resnet152_LM.onnx +mv voxceleb_resnet152_LM.onnx wespeaker_en_voxceleb_resnet152_LM.onnx ./add_meta_data.py \ --model ./voxceleb_resnet221_LM.onnx \ @@ -68,7 +68,7 @@ mv voxceleb_resnet152_LM.onnx en_voxceleb_resnet152_LM.onnx --file1 ./wespeaker-models/test_wavs/00001_spk1.wav \ --file2 ./wespeaker-models/test_wavs/00010_spk2.wav -mv voxceleb_resnet221_LM.onnx en_voxceleb_resnet221_LM.onnx +mv voxceleb_resnet221_LM.onnx wespeaker_en_voxceleb_resnet221_LM.onnx ./add_meta_data.py \ --model ./voxceleb_resnet293_LM.onnx \ @@ -83,7 +83,7 @@ mv voxceleb_resnet221_LM.onnx en_voxceleb_resnet221_LM.onnx --file1 ./wespeaker-models/test_wavs/00001_spk1.wav \ --file2 ./wespeaker-models/test_wavs/00010_spk2.wav -mv voxceleb_resnet293_LM.onnx en_voxceleb_resnet293_LM.onnx +mv voxceleb_resnet293_LM.onnx wespeaker_en_voxceleb_resnet293_LM.onnx ./add_meta_data.py \ --model ./voxceleb_CAM++.onnx \ @@ -98,7 +98,7 @@ mv voxceleb_resnet293_LM.onnx en_voxceleb_resnet293_LM.onnx --file1 ./wespeaker-models/test_wavs/00001_spk1.wav \ --file2 ./wespeaker-models/test_wavs/00010_spk2.wav -mv voxceleb_CAM++.onnx en_voxceleb_CAM++.onnx +mv voxceleb_CAM++.onnx wespeaker_en_voxceleb_CAM++.onnx ./add_meta_data.py \ --model ./voxceleb_CAM++_LM.onnx \ @@ -113,20 +113,20 @@ mv voxceleb_CAM++.onnx en_voxceleb_CAM++.onnx --file1 ./wespeaker-models/test_wavs/00001_spk1.wav \ --file2 ./wespeaker-models/test_wavs/00010_spk2.wav -mv voxceleb_CAM++_LM.onnx en_voxceleb_CAM++_LM.onnx +mv voxceleb_CAM++_LM.onnx wespeaker_en_voxceleb_CAM++_LM.onnx ./add_meta_data.py \ --model ./cnceleb_resnet34.onnx \ --language Chinese \ --url https://wespeaker-1256283475.cos.ap-shanghai.myqcloud.com/models/cnceleb/cnceleb_resnet34.onnx -mv cnceleb_resnet34.onnx zh_cnceleb_resnet34.onnx +mv cnceleb_resnet34.onnx wespeaker_zh_cnceleb_resnet34.onnx ./add_meta_data.py \ --model ./cnceleb_resnet34_LM.onnx \ --language Chinese \ --url https://wespeaker-1256283475.cos.ap-shanghai.myqcloud.com/models/cnceleb/cnceleb_resnet34_LM.onnx -mv cnceleb_resnet34_LM.onnx zh_cnceleb_resnet34_LM.onnx +mv cnceleb_resnet34_LM.onnx wespeaker_zh_cnceleb_resnet34_LM.onnx ls -lh