diff --git a/.github/scripts/test-c-api.sh b/.github/scripts/test-c-api.sh index 08a1a1a2..afc66c10 100755 --- a/.github/scripts/test-c-api.sh +++ b/.github/scripts/test-c-api.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -set -e +set -ex log() { # This function is from espnet @@ -9,6 +9,7 @@ log() { } echo "SLID_EXE is $SLID_EXE" +echo "SID_EXE is $SID_EXE" echo "PATH: $PATH" @@ -24,3 +25,15 @@ rm sherpa-onnx-whisper-tiny.tar.bz2 $SLID_EXE rm -rf sherpa-onnx-whisper-tiny* + +log "------------------------------------------------------------" +log "Download file for speaker identification and verification " +log "------------------------------------------------------------" + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx +git clone https://github.com/csukuangfj/sr-data + +$SID_EXE + +rm -fv *.onnx +rm -rf sr-data diff --git a/.github/workflows/linux.yaml b/.github/workflows/linux.yaml index 7df1761a..b1f3fa91 100644 --- a/.github/workflows/linux.yaml +++ b/.github/workflows/linux.yaml @@ -124,11 +124,12 @@ jobs: name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }} path: build/bin/* - - name: Test spoken language identification (C API) + - name: Test C API shell: bash run: | export PATH=$PWD/build/bin:$PATH export SLID_EXE=spoken-language-identification-c-api + export SID_EXE=speaker-identification-c-api .github/scripts/test-c-api.sh diff --git a/.github/workflows/macos.yaml b/.github/workflows/macos.yaml index 4152cf6c..0d098061 100644 --- a/.github/workflows/macos.yaml +++ b/.github/workflows/macos.yaml @@ -103,11 +103,12 @@ jobs: otool -L build/bin/sherpa-onnx otool -l build/bin/sherpa-onnx - - name: Test spoken language identification (C API) + - name: Test C API shell: bash run: | export PATH=$PWD/build/bin:$PATH export SLID_EXE=spoken-language-identification-c-api + export SID_EXE=speaker-identification-c-api 
.github/scripts/test-c-api.sh diff --git a/.github/workflows/windows-x64.yaml b/.github/workflows/windows-x64.yaml index bd90119c..ea7cf745 100644 --- a/.github/workflows/windows-x64.yaml +++ b/.github/workflows/windows-x64.yaml @@ -70,11 +70,12 @@ jobs: ls -lh ./bin/Release/sherpa-onnx.exe - - name: Test spoken language identification (C API) + - name: Test C API shell: bash run: | export PATH=$PWD/build/bin/Release:$PATH export SLID_EXE=spoken-language-identification-c-api.exe + export SID_EXE=speaker-identification-c-api.exe .github/scripts/test-c-api.sh diff --git a/.gitignore b/.gitignore index 44d78dc4..ea1d57e9 100644 --- a/.gitignore +++ b/.gitignore @@ -87,3 +87,4 @@ vits-coqui-* vits-mms-* *.tar.bz2 sherpa-onnx-paraformer-trilingual-zh-cantonese-en +sr-data diff --git a/c-api-examples/CMakeLists.txt b/c-api-examples/CMakeLists.txt index fd4577f4..06956324 100644 --- a/c-api-examples/CMakeLists.txt +++ b/c-api-examples/CMakeLists.txt @@ -12,6 +12,9 @@ endif() add_executable(spoken-language-identification-c-api spoken-language-identification-c-api.c) target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api) +add_executable(speaker-identification-c-api speaker-identification-c-api.c) +target_link_libraries(speaker-identification-c-api sherpa-onnx-c-api) + if(SHERPA_ONNX_HAS_ALSA) add_subdirectory(./asr-microphone-example) elseif((UNIX AND NOT APPLE) OR LINUX) diff --git a/c-api-examples/asr-microphone-example/c-api-alsa.cc b/c-api-examples/asr-microphone-example/c-api-alsa.cc index e5d9f434..caa5d8c6 100644 --- a/c-api-examples/asr-microphone-example/c-api-alsa.cc +++ b/c-api-examples/asr-microphone-example/c-api-alsa.cc @@ -188,10 +188,11 @@ int32_t main(int32_t argc, char *argv[]) { } } - SherpaOnnxOnlineRecognizer *recognizer = CreateOnlineRecognizer(&config); - SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer); + const SherpaOnnxOnlineRecognizer *recognizer = + CreateOnlineRecognizer(&config); + const 
SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer); - SherpaOnnxDisplay *display = CreateDisplay(50); + const SherpaOnnxDisplay *display = CreateDisplay(50); int32_t segment_id = 0; const char *device_name = argv[context.index]; diff --git a/c-api-examples/decode-file-c-api.c b/c-api-examples/decode-file-c-api.c index c7ea3bfb..c00275cb 100644 --- a/c-api-examples/decode-file-c-api.c +++ b/c-api-examples/decode-file-c-api.c @@ -162,10 +162,11 @@ int32_t main(int32_t argc, char *argv[]) { } } - SherpaOnnxOnlineRecognizer *recognizer = CreateOnlineRecognizer(&config); - SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer); + const SherpaOnnxOnlineRecognizer *recognizer = + CreateOnlineRecognizer(&config); + const SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer); - SherpaOnnxDisplay *display = CreateDisplay(50); + const SherpaOnnxDisplay *display = CreateDisplay(50); int32_t segment_id = 0; const char *wav_filename = argv[context.index]; diff --git a/c-api-examples/speaker-identification-c-api.c b/c-api-examples/speaker-identification-c-api.c new file mode 100644 index 00000000..d1d428d6 --- /dev/null +++ b/c-api-examples/speaker-identification-c-api.c @@ -0,0 +1,256 @@ +// c-api-examples/speaker-identification-c-api.c +// +// Copyright (c) 2024 Xiaomi Corporation + +// We assume you have pre-downloaded the speaker embedding extractor model +// from +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models +// +// An example command to download +// "3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx" +// is given below: +// +// clang-format off +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx +// +// clang-format on +// +// Also, please download the test wave files from +// +// https://github.com/csukuangfj/sr-data + +#include +#include +#include + +#include "sherpa-onnx/c-api/c-api.h" + +static const float 
*ComputeEmbedding( + const SherpaOnnxSpeakerEmbeddingExtractor *ex, const char *wav_filename) { + const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename); + if (wave == NULL) { + fprintf(stderr, "Failed to read %s\n", wav_filename); + exit(-1); + } + + const SherpaOnnxOnlineStream *stream = + SherpaOnnxSpeakerEmbeddingExtractorCreateStream(ex); + + AcceptWaveform(stream, wave->sample_rate, wave->samples, wave->num_samples); + InputFinished(stream); + + if (!SherpaOnnxSpeakerEmbeddingExtractorIsReady(ex, stream)) { + fprintf(stderr, "The input wave file %s is too short!\n", wav_filename); + exit(-1); + } + + // we will free `v` outside of this function + const float *v = + SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding(ex, stream); + + DestroyOnlineStream(stream); + SherpaOnnxFreeWave(wave); + + // Remeber to free v to avoid memory leak + return v; +} + +int32_t main() { + SherpaOnnxSpeakerEmbeddingExtractorConfig config; + + memset(&config, 0, sizeof(config)); + + // please download the model from + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models + config.model = "./3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx"; + + config.num_threads = 1; + config.debug = 0; + config.provider = "cpu"; + + const SherpaOnnxSpeakerEmbeddingExtractor *ex = + SherpaOnnxCreateSpeakerEmbeddingExtractor(&config); + if (!ex) { + fprintf(stderr, "Failed to create speaker embedding extractor"); + return -1; + } + + int32_t dim = SherpaOnnxSpeakerEmbeddingExtractorDim(ex); + + const SherpaOnnxSpeakerEmbeddingManager *manager = + SherpaOnnxCreateSpeakerEmbeddingManager(dim); + + // Please download the test data from + // https://github.com/csukuangfj/sr-data + const char *spk1_1 = "./sr-data/enroll/fangjun-sr-1.wav"; + const char *spk1_2 = "./sr-data/enroll/fangjun-sr-2.wav"; + const char *spk1_3 = "./sr-data/enroll/fangjun-sr-3.wav"; + + const char *spk2_1 = "./sr-data/enroll/leijun-sr-1.wav"; + const char *spk2_2 = 
"./sr-data/enroll/leijun-sr-2.wav"; + + const float *spk1_vec[4] = {NULL}; + spk1_vec[0] = ComputeEmbedding(ex, spk1_1); + spk1_vec[1] = ComputeEmbedding(ex, spk1_2); + spk1_vec[2] = ComputeEmbedding(ex, spk1_3); + + const float *spk2_vec[3] = {NULL}; + spk2_vec[0] = ComputeEmbedding(ex, spk2_1); + spk2_vec[1] = ComputeEmbedding(ex, spk2_2); + + if (!SherpaOnnxSpeakerEmbeddingManagerAddList(manager, "fangjun", spk1_vec)) { + fprintf(stderr, "Failed to register fangjun\n"); + exit(-1); + } + + if (!SherpaOnnxSpeakerEmbeddingManagerContains(manager, "fangjun")) { + fprintf(stderr, "Failed to find fangjun\n"); + exit(-1); + } + + if (!SherpaOnnxSpeakerEmbeddingManagerAddList(manager, "leijun", spk2_vec)) { + fprintf(stderr, "Failed to register leijun\n"); + exit(-1); + } + + if (!SherpaOnnxSpeakerEmbeddingManagerContains(manager, "leijun")) { + fprintf(stderr, "Failed to find leijun\n"); + exit(-1); + } + + if (SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(manager) != 2) { + fprintf(stderr, "There should be two speakers: fangjun and leijun\n"); + exit(-1); + } + + const char *const *all_speakers = + SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(manager); + const char *const *p = all_speakers; + fprintf(stderr, "list of registered speakers\n-----\n"); + while (p[0]) { + fprintf(stderr, "speaker: %s\n", p[0]); + ++p; + } + fprintf(stderr, "----\n"); + + SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(all_speakers); + + const char *test1 = "./sr-data/test/fangjun-test-sr-1.wav"; + const char *test2 = "./sr-data/test/leijun-test-sr-1.wav"; + const char *test3 = "./sr-data/test/liudehua-test-sr-1.wav"; + + const float *v1 = ComputeEmbedding(ex, test1); + const float *v2 = ComputeEmbedding(ex, test2); + const float *v3 = ComputeEmbedding(ex, test3); + + float threshold = 0.6; + + const char *name1 = + SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v1, threshold); + if (name1) { + fprintf(stderr, "%s: Found %s\n", test1, name1); + 
SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name1); + } else { + fprintf(stderr, "%s: Not found\n", test1); + } + + const char *name2 = + SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v2, threshold); + if (name2) { + fprintf(stderr, "%s: Found %s\n", test2, name2); + SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name2); + } else { + fprintf(stderr, "%s: Not found\n", test2); + } + + const char *name3 = + SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v3, threshold); + if (name3) { + fprintf(stderr, "%s: Found %s\n", test3, name3); + SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name3); + } else { + fprintf(stderr, "%s: Not found\n", test3); + } + + int32_t ok = SherpaOnnxSpeakerEmbeddingManagerVerify(manager, "fangjun", v1, + threshold); + if (ok) { + fprintf(stderr, "%s matches fangjun\n", test1); + } else { + fprintf(stderr, "%s does NOT match fangjun\n", test1); + } + + ok = SherpaOnnxSpeakerEmbeddingManagerVerify(manager, "fangjun", v2, + threshold); + if (ok) { + fprintf(stderr, "%s matches fangjun\n", test2); + } else { + fprintf(stderr, "%s does NOT match fangjun\n", test2); + } + + fprintf(stderr, "Removing fangjun\n"); + if (!SherpaOnnxSpeakerEmbeddingManagerRemove(manager, "fangjun")) { + fprintf(stderr, "Failed to remove fangjun\n"); + exit(-1); + } + + if (SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(manager) != 1) { + fprintf(stderr, "There should be only 1 speaker left\n"); + exit(-1); + } + + name1 = SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v1, threshold); + if (name1) { + fprintf(stderr, "%s: Found %s\n", test1, name1); + SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name1); + } else { + fprintf(stderr, "%s: Not found\n", test1); + } + + fprintf(stderr, "Removing leijun\n"); + if (!SherpaOnnxSpeakerEmbeddingManagerRemove(manager, "leijun")) { + fprintf(stderr, "Failed to remove leijun\n"); + exit(-1); + } + + if (SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(manager) != 0) { + fprintf(stderr, "There should be only 1 speaker left\n"); + 
exit(-1); + } + + name2 = SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v2, threshold); + if (name2) { + fprintf(stderr, "%s: Found %s\n", test2, name2); + SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name2); + } else { + fprintf(stderr, "%s: Not found\n", test2); + } + + all_speakers = SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(manager); + + p = all_speakers; + fprintf(stderr, "list of registered speakers\n-----\n"); + while (p[0]) { + fprintf(stderr, "speaker: %s\n", p[0]); + ++p; + } + fprintf(stderr, "----\n"); + + SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(all_speakers); + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v1); + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v2); + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v3); + + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk1_vec[0]); + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk1_vec[1]); + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk1_vec[2]); + + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk2_vec[0]); + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk2_vec[1]); + + SherpaOnnxDestroySpeakerEmbeddingManager(manager); + SherpaOnnxDestroySpeakerEmbeddingExtractor(ex); + + return 0; +} diff --git a/c-api-examples/spoken-language-identification-c-api.c b/c-api-examples/spoken-language-identification-c-api.c index b9911303..0c640c47 100644 --- a/c-api-examples/spoken-language-identification-c-api.c +++ b/c-api-examples/spoken-language-identification-c-api.c @@ -1,3 +1,6 @@ +// c-api-examples/spoken-language-identification-c-api.c +// +// Copyright (c) 2024 Xiaomi Corporation // We assume you have pre-downloaded the whisper multi-lingual models // from https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models diff --git a/ios-swift/SherpaOnnx/SherpaOnnx/ViewController.swift b/ios-swift/SherpaOnnx/SherpaOnnx/ViewController.swift index 7153a354..200f6a20 100644 --- a/ios-swift/SherpaOnnx/SherpaOnnx/ViewController.swift +++ 
b/ios-swift/SherpaOnnx/SherpaOnnx/ViewController.swift @@ -83,7 +83,7 @@ class ViewController: UIViewController { // Please select one model that is best suitable for you. // // You can also modify Model.swift to add new pre-trained models from - // https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html + // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html // let modelConfig = getBilingualStreamZhEnZipformer20230220() // let modelConfig = getZhZipformer20230615() diff --git a/ios-swiftui/SherpaOnnxTts/SherpaOnnxTts/ContentView.swift b/ios-swiftui/SherpaOnnxTts/SherpaOnnxTts/ContentView.swift index fca8db7b..df1b3d34 100644 --- a/ios-swiftui/SherpaOnnxTts/SherpaOnnxTts/ContentView.swift +++ b/ios-swiftui/SherpaOnnxTts/SherpaOnnxTts/ContentView.swift @@ -4,7 +4,7 @@ // // Created by fangjun on 2023/11/23. // -// Speech-to-text with Next-gen Kaldi on iOS without Internet connection +// Text-to-speech with Next-gen Kaldi on iOS without Internet connection import SwiftUI import AVFoundation diff --git a/python-api-examples/offline-tts-play.py b/python-api-examples/offline-tts-play.py index 02442e69..ca3d66de 100755 --- a/python-api-examples/offline-tts-play.py +++ b/python-api-examples/offline-tts-play.py @@ -183,7 +183,7 @@ event = threading.Event() first_message_time = None -def generated_audio_callback(samples: np.ndarray): +def generated_audio_callback(samples: np.ndarray, progress: float): """This function is called whenever max_num_sentences sentences have been processed. 
diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index 14550fc8..685091c1 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -16,6 +16,8 @@ #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/offline-recognizer.h" #include "sherpa-onnx/csrc/online-recognizer.h" +#include "sherpa-onnx/csrc/speaker-embedding-extractor.h" +#include "sherpa-onnx/csrc/speaker-embedding-manager.h" #include "sherpa-onnx/csrc/spoken-language-identification.h" #include "sherpa-onnx/csrc/voice-activity-detector.h" #include "sherpa-onnx/csrc/wave-reader.h" @@ -114,7 +116,7 @@ SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer( return recognizer; } -void DestroyOnlineRecognizer(SherpaOnnxOnlineRecognizer *recognizer) { +void DestroyOnlineRecognizer(const SherpaOnnxOnlineRecognizer *recognizer) { delete recognizer; } @@ -132,25 +134,28 @@ SherpaOnnxOnlineStream *CreateOnlineStreamWithHotwords( return stream; } -void DestroyOnlineStream(SherpaOnnxOnlineStream *stream) { delete stream; } +void DestroyOnlineStream(const SherpaOnnxOnlineStream *stream) { + delete stream; +} -void AcceptWaveform(SherpaOnnxOnlineStream *stream, int32_t sample_rate, +void AcceptWaveform(const SherpaOnnxOnlineStream *stream, int32_t sample_rate, const float *samples, int32_t n) { stream->impl->AcceptWaveform(sample_rate, samples, n); } -int32_t IsOnlineStreamReady(SherpaOnnxOnlineRecognizer *recognizer, - SherpaOnnxOnlineStream *stream) { +int32_t IsOnlineStreamReady(const SherpaOnnxOnlineRecognizer *recognizer, + const SherpaOnnxOnlineStream *stream) { return recognizer->impl->IsReady(stream->impl.get()); } -void DecodeOnlineStream(SherpaOnnxOnlineRecognizer *recognizer, - SherpaOnnxOnlineStream *stream) { +void DecodeOnlineStream(const SherpaOnnxOnlineRecognizer *recognizer, + const SherpaOnnxOnlineStream *stream) { recognizer->impl->DecodeStream(stream->impl.get()); } -void DecodeMultipleOnlineStreams(SherpaOnnxOnlineRecognizer *recognizer, - 
SherpaOnnxOnlineStream **streams, int32_t n) { +void DecodeMultipleOnlineStreams(const SherpaOnnxOnlineRecognizer *recognizer, + const SherpaOnnxOnlineStream **streams, + int32_t n) { std::vector ss(n); for (int32_t i = 0; i != n; ++i) { ss[i] = streams[i]->impl.get(); @@ -159,7 +164,8 @@ void DecodeMultipleOnlineStreams(SherpaOnnxOnlineRecognizer *recognizer, } const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult( - SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream) { + const SherpaOnnxOnlineRecognizer *recognizer, + const SherpaOnnxOnlineStream *stream) { sherpa_onnx::OnlineRecognizerResult result = recognizer->impl->GetResult(stream->impl.get()); const auto &text = result.text; @@ -232,29 +238,30 @@ void DestroyOnlineRecognizerResult(const SherpaOnnxOnlineRecognizerResult *r) { } } -void Reset(SherpaOnnxOnlineRecognizer *recognizer, - SherpaOnnxOnlineStream *stream) { +void Reset(const SherpaOnnxOnlineRecognizer *recognizer, + const SherpaOnnxOnlineStream *stream) { recognizer->impl->Reset(stream->impl.get()); } -void InputFinished(SherpaOnnxOnlineStream *stream) { +void InputFinished(const SherpaOnnxOnlineStream *stream) { stream->impl->InputFinished(); } -int32_t IsEndpoint(SherpaOnnxOnlineRecognizer *recognizer, - SherpaOnnxOnlineStream *stream) { +int32_t IsEndpoint(const SherpaOnnxOnlineRecognizer *recognizer, + const SherpaOnnxOnlineStream *stream) { return recognizer->impl->IsEndpoint(stream->impl.get()); } -SherpaOnnxDisplay *CreateDisplay(int32_t max_word_per_line) { +const SherpaOnnxDisplay *CreateDisplay(int32_t max_word_per_line) { SherpaOnnxDisplay *ans = new SherpaOnnxDisplay; ans->impl = std::make_unique(max_word_per_line); return ans; } -void DestroyDisplay(SherpaOnnxDisplay *display) { delete display; } +void DestroyDisplay(const SherpaOnnxDisplay *display) { delete display; } -void SherpaOnnxPrint(SherpaOnnxDisplay *display, int32_t idx, const char *s) { +void SherpaOnnxPrint(const SherpaOnnxDisplay *display, 
int32_t idx, + const char *s) { display->impl->Print(idx, s); } @@ -808,9 +815,8 @@ int32_t SherpaOnnxOfflineTtsNumSpeakers(const SherpaOnnxOfflineTts *tts) { } static const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateInternal( - const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, - float speed, std::function callback) -{ + const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, + std::function callback) { sherpa_onnx::GeneratedAudio audio = tts->impl->Generate(text, sid, speed, callback); @@ -833,36 +839,37 @@ static const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateInternal( const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerate( const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed) { - return SherpaOnnxOfflineTtsGenerateInternal( tts, text, sid, speed, nullptr ); + return SherpaOnnxOfflineTtsGenerateInternal(tts, text, sid, speed, nullptr); } const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateWithCallback( const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, SherpaOnnxGeneratedAudioCallback callback) { - auto wrapper = [callback](const float *samples, int32_t n, float /*progress*/) { - callback(samples, n ); - }; + auto wrapper = [callback](const float *samples, int32_t n, + float /*progress*/) { callback(samples, n); }; - return SherpaOnnxOfflineTtsGenerateInternal( tts, text, sid, speed, wrapper ); + return SherpaOnnxOfflineTtsGenerateInternal(tts, text, sid, speed, wrapper); } -const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateWithProgressCallback( +const SherpaOnnxGeneratedAudio * +SherpaOnnxOfflineTtsGenerateWithProgressCallback( const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, SherpaOnnxGeneratedAudioProgressCallback callback) { auto wrapper = [callback](const float *samples, int32_t n, float progress) { - callback(samples, n, progress ); + callback(samples, n, progress); }; - return 
SherpaOnnxOfflineTtsGenerateInternal( tts, text, sid, speed, wrapper ); + return SherpaOnnxOfflineTtsGenerateInternal(tts, text, sid, speed, wrapper); } const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateWithCallbackWithArg( const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, SherpaOnnxGeneratedAudioCallbackWithArg callback, void *arg) { - auto wrapper = [callback, arg](const float *samples, int32_t n, float /*progress*/) { + auto wrapper = [callback, arg](const float *samples, int32_t n, + float /*progress*/) { callback(samples, n, arg); }; - return SherpaOnnxOfflineTtsGenerateInternal( tts, text, sid, speed, wrapper ); + return SherpaOnnxOfflineTtsGenerateInternal(tts, text, sid, speed, wrapper); } void SherpaOnnxDestroyOfflineTtsGeneratedAudio( @@ -972,3 +979,200 @@ void SherpaOnnxDestroySpokenLanguageIdentificationResult( delete r; } } + +struct SherpaOnnxSpeakerEmbeddingExtractor { + std::unique_ptr impl; +}; + +const SherpaOnnxSpeakerEmbeddingExtractor * +SherpaOnnxCreateSpeakerEmbeddingExtractor( + const SherpaOnnxSpeakerEmbeddingExtractorConfig *config) { + sherpa_onnx::SpeakerEmbeddingExtractorConfig c; + c.model = SHERPA_ONNX_OR(config->model, ""); + + c.num_threads = SHERPA_ONNX_OR(config->num_threads, 1); + c.debug = SHERPA_ONNX_OR(config->debug, 0); + c.provider = SHERPA_ONNX_OR(config->provider, "cpu"); + + if (config->debug) { + SHERPA_ONNX_LOGE("%s\n", c.ToString().c_str()); + } + + if (!c.Validate()) { + SHERPA_ONNX_LOGE("Errors in config!"); + return nullptr; + } + + auto p = new SherpaOnnxSpeakerEmbeddingExtractor; + + p->impl = std::make_unique(c); + + return p; +} + +void SherpaOnnxDestroySpeakerEmbeddingExtractor( + const SherpaOnnxSpeakerEmbeddingExtractor *p) { + delete p; +} + +int32_t SherpaOnnxSpeakerEmbeddingExtractorDim( + const SherpaOnnxSpeakerEmbeddingExtractor *p) { + return p->impl->Dim(); +} + +const SherpaOnnxOnlineStream *SherpaOnnxSpeakerEmbeddingExtractorCreateStream( + const 
SherpaOnnxSpeakerEmbeddingExtractor *p) { + SherpaOnnxOnlineStream *stream = + new SherpaOnnxOnlineStream(p->impl->CreateStream()); + return stream; +} + +int32_t SherpaOnnxSpeakerEmbeddingExtractorIsReady( + const SherpaOnnxSpeakerEmbeddingExtractor *p, + const SherpaOnnxOnlineStream *s) { + return p->impl->IsReady(s->impl.get()); +} + +const float *SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding( + const SherpaOnnxSpeakerEmbeddingExtractor *p, + const SherpaOnnxOnlineStream *s) { + std::vector v = p->impl->Compute(s->impl.get()); + float *ans = new float[v.size()]; + std::copy(v.begin(), v.end(), ans); + return ans; +} + +void SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(const float *v) { + delete[] v; +} + +struct SherpaOnnxSpeakerEmbeddingManager { + std::unique_ptr impl; +}; + +const SherpaOnnxSpeakerEmbeddingManager * +SherpaOnnxCreateSpeakerEmbeddingManager(int32_t dim) { + auto p = new SherpaOnnxSpeakerEmbeddingManager; + p->impl = std::make_unique(dim); + return p; +} + +void SherpaOnnxDestroySpeakerEmbeddingManager( + const SherpaOnnxSpeakerEmbeddingManager *p) { + delete p; +} + +int32_t SherpaOnnxSpeakerEmbeddingManagerAdd( + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name, + const float *v) { + return p->impl->Add(name, v); +} + +int32_t SherpaOnnxSpeakerEmbeddingManagerAddList( + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name, + const float **v) { + int32_t n = 0; + auto q = v; + while (q && q[0]) { + ++n; + ++q; + } + + if (n == 0) { + SHERPA_ONNX_LOGE("Empty embedding!"); + return 0; + } + + std::vector> vec(n); + int32_t dim = p->impl->Dim(); + + for (int32_t i = 0; i != n; ++i) { + vec[i] = std::vector(v[i], v[i] + dim); + } + + return p->impl->Add(name, vec); +} + +int32_t SherpaOnnxSpeakerEmbeddingManagerAddListFlattened( + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name, + const float *v, int32_t n) { + std::vector> vec(n); + + int32_t dim = p->impl->Dim(); + + for (int32_t i = 0; i != n; ++i, 
v += dim) { + vec[i] = std::vector(v, v + dim); + } + + return p->impl->Add(name, vec); +} + +int32_t SherpaOnnxSpeakerEmbeddingManagerRemove( + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name) { + return p->impl->Remove(name); +} + +const char *SherpaOnnxSpeakerEmbeddingManagerSearch( + const SherpaOnnxSpeakerEmbeddingManager *p, const float *v, + float threshold) { + auto r = p->impl->Search(v, threshold); + if (r.empty()) { + return nullptr; + } + + char *name = new char[r.size() + 1]; + std::copy(r.begin(), r.end(), name); + name[r.size()] = '\0'; + + return name; +} + +void SherpaOnnxSpeakerEmbeddingManagerFreeSearch(const char *name) { + delete[] name; +} + +int32_t SherpaOnnxSpeakerEmbeddingManagerVerify( + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name, + const float *v, float threshold) { + return p->impl->Verify(name, v, threshold); +} + +int32_t SherpaOnnxSpeakerEmbeddingManagerContains( + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name) { + return p->impl->Contains(name); +} + +int32_t SherpaOnnxSpeakerEmbeddingManagerNumSpeakers( + const SherpaOnnxSpeakerEmbeddingManager *p) { + return p->impl->NumSpeakers(); +} + +const char *const *SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers( + const SherpaOnnxSpeakerEmbeddingManager *manager) { + std::vector all_speakers = manager->impl->GetAllSpeakers(); + int32_t num_speakers = all_speakers.size(); + char **p = new char *[num_speakers + 1]; + p[num_speakers] = nullptr; + + int32_t i = 0; + for (const auto &name : all_speakers) { + p[i] = new char[name.size() + 1]; + std::copy(name.begin(), name.end(), p[i]); + p[i][name.size()] = '\0'; + + i += 1; + } + return p; +} + +void SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers( + const char *const *names) { + auto p = names; + + while (p && p[0]) { + delete[] p[0]; + ++p; + } + + delete[] names; +} diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 8c86f353..66c33bf2 100644 --- 
a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -186,7 +186,7 @@ SHERPA_ONNX_API SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer( /// /// @param p A pointer returned by CreateOnlineRecognizer() SHERPA_ONNX_API void DestroyOnlineRecognizer( - SherpaOnnxOnlineRecognizer *recognizer); + const SherpaOnnxOnlineRecognizer *recognizer); /// Create an online stream for accepting wave samples. /// @@ -208,7 +208,7 @@ SHERPA_ONNX_API SherpaOnnxOnlineStream *CreateOnlineStreamWithHotwords( /// Destroy an online stream. /// /// @param stream A pointer returned by CreateOnlineStream() -SHERPA_ONNX_API void DestroyOnlineStream(SherpaOnnxOnlineStream *stream); +SHERPA_ONNX_API void DestroyOnlineStream(const SherpaOnnxOnlineStream *stream); /// Accept input audio samples and compute the features. /// The user has to invoke DecodeOnlineStream() to run the neural network and @@ -221,7 +221,7 @@ SHERPA_ONNX_API void DestroyOnlineStream(SherpaOnnxOnlineStream *stream); /// @param samples A pointer to a 1-D array containing audio samples. /// The range of samples has to be normalized to [-1, 1]. /// @param n Number of elements in the samples array. -SHERPA_ONNX_API void AcceptWaveform(SherpaOnnxOnlineStream *stream, +SHERPA_ONNX_API void AcceptWaveform(const SherpaOnnxOnlineStream *stream, int32_t sample_rate, const float *samples, int32_t n); @@ -230,8 +230,9 @@ SHERPA_ONNX_API void AcceptWaveform(SherpaOnnxOnlineStream *stream, /// /// @param recognizer A pointer returned by CreateOnlineRecognizer /// @param stream A pointer returned by CreateOnlineStream -SHERPA_ONNX_API int32_t IsOnlineStreamReady( - SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream); +SHERPA_ONNX_API int32_t +IsOnlineStreamReady(const SherpaOnnxOnlineRecognizer *recognizer, + const SherpaOnnxOnlineStream *stream); /// Call this function to run the neural network model and decoding. 
// @@ -243,8 +244,9 @@ SHERPA_ONNX_API int32_t IsOnlineStreamReady( /// DecodeOnlineStream(recognizer, stream); /// } /// -SHERPA_ONNX_API void DecodeOnlineStream(SherpaOnnxOnlineRecognizer *recognizer, - SherpaOnnxOnlineStream *stream); +SHERPA_ONNX_API void DecodeOnlineStream( + const SherpaOnnxOnlineRecognizer *recognizer, + const SherpaOnnxOnlineStream *stream); /// This function is similar to DecodeOnlineStream(). It decodes multiple /// OnlineStream in parallel. @@ -257,8 +259,8 @@ SHERPA_ONNX_API void DecodeOnlineStream(SherpaOnnxOnlineRecognizer *recognizer, /// CreateOnlineRecognizer() /// @param n Number of elements in the given streams array. SHERPA_ONNX_API void DecodeMultipleOnlineStreams( - SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream **streams, - int32_t n); + const SherpaOnnxOnlineRecognizer *recognizer, + const SherpaOnnxOnlineStream **streams, int32_t n); /// Get the decoding results so far for an OnlineStream. /// @@ -268,7 +270,8 @@ SHERPA_ONNX_API void DecodeMultipleOnlineStreams( /// DestroyOnlineRecognizerResult() to free the returned pointer to /// avoid memory leak. SHERPA_ONNX_API const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult( - SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream); + const SherpaOnnxOnlineRecognizer *recognizer, + const SherpaOnnxOnlineStream *stream); /// Destroy the pointer returned by GetOnlineStreamResult(). /// @@ -281,35 +284,36 @@ SHERPA_ONNX_API void DestroyOnlineRecognizerResult( /// /// @param recognizer A pointer returned by CreateOnlineRecognizer(). /// @param stream A pointer returned by CreateOnlineStream -SHERPA_ONNX_API void Reset(SherpaOnnxOnlineRecognizer *recognizer, - SherpaOnnxOnlineStream *stream); +SHERPA_ONNX_API void Reset(const SherpaOnnxOnlineRecognizer *recognizer, + const SherpaOnnxOnlineStream *stream); /// Signal that no more audio samples would be available. /// After this call, you cannot call AcceptWaveform() any more. 
/// /// @param stream A pointer returned by CreateOnlineStream() -SHERPA_ONNX_API void InputFinished(SherpaOnnxOnlineStream *stream); +SHERPA_ONNX_API void InputFinished(const SherpaOnnxOnlineStream *stream); /// Return 1 if an endpoint has been detected. /// /// @param recognizer A pointer returned by CreateOnlineRecognizer() /// @param stream A pointer returned by CreateOnlineStream() /// @return Return 1 if an endpoint is detected. Return 0 otherwise. -SHERPA_ONNX_API int32_t IsEndpoint(SherpaOnnxOnlineRecognizer *recognizer, - SherpaOnnxOnlineStream *stream); +SHERPA_ONNX_API int32_t IsEndpoint(const SherpaOnnxOnlineRecognizer *recognizer, + const SherpaOnnxOnlineStream *stream); // for displaying results on Linux/macOS. SHERPA_ONNX_API typedef struct SherpaOnnxDisplay SherpaOnnxDisplay; /// Create a display object. Must be freed using DestroyDisplay to avoid /// memory leak. -SHERPA_ONNX_API SherpaOnnxDisplay *CreateDisplay(int32_t max_word_per_line); +SHERPA_ONNX_API const SherpaOnnxDisplay *CreateDisplay( + int32_t max_word_per_line); -SHERPA_ONNX_API void DestroyDisplay(SherpaOnnxDisplay *display); +SHERPA_ONNX_API void DestroyDisplay(const SherpaOnnxDisplay *display); /// Print the result. 
-SHERPA_ONNX_API void SherpaOnnxPrint(SherpaOnnxDisplay *display, int32_t idx, - const char *s); +SHERPA_ONNX_API void SherpaOnnxPrint(const SherpaOnnxDisplay *display, + int32_t idx, const char *s); // ============================================================ // For offline ASR (i.e., non-streaming ASR) // ============================================================ @@ -769,7 +773,7 @@ typedef void (*SherpaOnnxGeneratedAudioCallbackWithArg)(const float *samples, int32_t n, void *arg); typedef void (*SherpaOnnxGeneratedAudioProgressCallback)(const float *samples, - int32_t n, float p); + int32_t n, float p); SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts; @@ -839,7 +843,9 @@ SHERPA_ONNX_API const SherpaOnnxWave *SherpaOnnxReadWave(const char *filename); SHERPA_ONNX_API void SherpaOnnxFreeWave(const SherpaOnnxWave *wave); -// Spoken language identification +// ============================================================ +// For spoken language identification +// ============================================================ SHERPA_ONNX_API typedef struct SherpaOnnxSpokenLanguageIdentificationWhisperConfig { @@ -893,6 +899,169 @@ SherpaOnnxSpokenLanguageIdentificationCompute( SHERPA_ONNX_API void SherpaOnnxDestroySpokenLanguageIdentificationResult( const SherpaOnnxSpokenLanguageIdentificationResult *r); +// ============================================================ +// For speaker embedding extraction +// ============================================================ +SHERPA_ONNX_API typedef struct SherpaOnnxSpeakerEmbeddingExtractorConfig { + const char *model; + int32_t num_threads; + int32_t debug; + const char *provider; +} SherpaOnnxSpeakerEmbeddingExtractorConfig; + +SHERPA_ONNX_API typedef struct SherpaOnnxSpeakerEmbeddingExtractor + SherpaOnnxSpeakerEmbeddingExtractor; + +// The user has to invoke SherpaOnnxDestroySpeakerEmbeddingExtractor() +// to free the returned pointer to avoid memory leak +SHERPA_ONNX_API const 
SherpaOnnxSpeakerEmbeddingExtractor * +SherpaOnnxCreateSpeakerEmbeddingExtractor( + const SherpaOnnxSpeakerEmbeddingExtractorConfig *config); + +SHERPA_ONNX_API void SherpaOnnxDestroySpeakerEmbeddingExtractor( + const SherpaOnnxSpeakerEmbeddingExtractor *p); + +SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingExtractorDim( + const SherpaOnnxSpeakerEmbeddingExtractor *p); + +// The user has to invoke DestroyOnlineStream() to free the returned pointer +// to avoid memory leak +SHERPA_ONNX_API const SherpaOnnxOnlineStream * +SherpaOnnxSpeakerEmbeddingExtractorCreateStream( + const SherpaOnnxSpeakerEmbeddingExtractor *p); + +// Return 1 if the stream has enough feature frames for computing embeddings. +// Return 0 otherwise. +SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingExtractorIsReady( + const SherpaOnnxSpeakerEmbeddingExtractor *p, + const SherpaOnnxOnlineStream *s); + +// Compute the embedding of the stream. +// +// @return Return a pointer pointing to an array containing the embedding. +// The length of the array is `dim` as returned by +// SherpaOnnxSpeakerEmbeddingExtractorDim(p) +// +// The user has to invoke SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding() +// to free the returned pointer to avoid memory leak. 
+SHERPA_ONNX_API const float * +SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding( + const SherpaOnnxSpeakerEmbeddingExtractor *p, + const SherpaOnnxOnlineStream *s); + +SHERPA_ONNX_API void SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding( + const float *v); + +SHERPA_ONNX_API typedef struct SherpaOnnxSpeakerEmbeddingManager + SherpaOnnxSpeakerEmbeddingManager; + +// The user has to invoke SherpaOnnxDestroySpeakerEmbeddingManager() +// to free the returned pointer to avoid memory leak +SHERPA_ONNX_API const SherpaOnnxSpeakerEmbeddingManager * +SherpaOnnxCreateSpeakerEmbeddingManager(int32_t dim); + +SHERPA_ONNX_API void SherpaOnnxDestroySpeakerEmbeddingManager( + const SherpaOnnxSpeakerEmbeddingManager *p); + +// Register the embedding of a user +// +// @param name The name of the user +// @param v Pointer to an array containing the embedding. The length of the +// array must be equal to `dim` used to construct the manager `p`. +// +// @return Return 1 if added successfully. Return 0 on error +SHERPA_ONNX_API int32_t +SherpaOnnxSpeakerEmbeddingManagerAdd(const SherpaOnnxSpeakerEmbeddingManager *p, + const char *name, const float *v); + +// @param v Pointer to an array of embeddings. If there are n embeddings, then +// v[0] is the pointer to the 0-th array containing the embeddings +// v[1] is the pointer to the 1-st array containing the embeddings +// v[n-1] is the pointer to the last array containing the embeddings +// v[n] is a NULL pointer +// @return Return 1 if added successfully. Return 0 on error +SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingManagerAddList( + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name, + const float **v); + +// Similar to SherpaOnnxSpeakerEmbeddingManagerAddList() but the memory +// is flattened. +// +// The length of the input array should be `n * dim`. +// +// @return Return 1 if added successfully.
Return 0 on error +SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingManagerAddListFlattened( + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name, + const float *v, int32_t n); + +// Remove a user. +// @param name The name of the user to remove. +// @return Return 1 if removed successfully; return 0 on error. +// +// Note if the user does not exist, it also returns 0. +SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingManagerRemove( + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name); + +// Search if an existing user's embedding matches the given one. +// +// @param v Pointer to an array containing the embedding. The dim +// of the array must be equal to `dim` used to construct the manager `p`. +// @param threshold A value between 0 and 1. If the similarity score exceeds +// this threshold, we say a match is found. +// @return Returns the name of the user if found. Return NULL if not found. +// If not NULL, the caller has to invoke +// SherpaOnnxSpeakerEmbeddingManagerFreeSearch() to free the returned +// pointer to avoid memory leak. +SHERPA_ONNX_API const char *SherpaOnnxSpeakerEmbeddingManagerSearch( + const SherpaOnnxSpeakerEmbeddingManager *p, const float *v, + float threshold); + +SHERPA_ONNX_API void SherpaOnnxSpeakerEmbeddingManagerFreeSearch( + const char *name); + +// Check whether the input embedding matches the embedding of the input +// speaker. +// +// It is for speaker verification. +// +// @param name The target speaker name. +// @param v The input embedding to check. +// @param threshold A value between 0 and 1. +// @return Return 1 if it matches. Otherwise, it returns 0. +SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingManagerVerify( + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name, + const float *v, float threshold); + +// Return 1 if the user with the name is in the manager. +// Return 0 if the user does not exist.
+SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingManagerContains( + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name); + +// Return number of speakers in the manager. +SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingManagerNumSpeakers( + const SherpaOnnxSpeakerEmbeddingManager *p); + +// Return the name of all speakers in the manager. +// +// @return Return an array of pointers `ans`. If there are n speakers, then +// - ans[0] contains the name of the 0-th speaker +// - ans[1] contains the name of the 1-st speaker +// - ans[n-1] contains the name of the last speaker +// - ans[n] is NULL +// If there are no users at all, then ans[0] is NULL. In any case, +// `ans` is not NULL. +// +// Each name is NULL-terminated +// +// The caller has to invoke SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers() +// to free the returned pointer to avoid memory leak. +SHERPA_ONNX_API const char *const * +SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers( + const SherpaOnnxSpeakerEmbeddingManager *p); + +SHERPA_ONNX_API void SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers( + const char *const *names); + #if defined(__GNUC__) #pragma GCC diagnostic pop #endif diff --git a/sherpa-onnx/csrc/offline-tts-vits-impl.h b/sherpa-onnx/csrc/offline-tts-vits-impl.h index cdd33e18..6bcfc0ca 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-impl.h +++ b/sherpa-onnx/csrc/offline-tts-vits-impl.h @@ -168,7 +168,8 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { ans.samples.insert(ans.samples.end(), audio.samples.begin(), audio.samples.end()); if (callback) { - callback(audio.samples.data(), audio.samples.size(), b * 1.0 / num_batches); + callback(audio.samples.data(), audio.samples.size(), + b * 1.0 / num_batches); // Caution(fangjun): audio is freed when the callback returns, so users // should copy the data if they want to access the data after // the callback returns to avoid segmentation fault. 
diff --git a/sherpa-onnx/csrc/offline-tts.h b/sherpa-onnx/csrc/offline-tts.h index c39dfdae..354057bf 100644 --- a/sherpa-onnx/csrc/offline-tts.h +++ b/sherpa-onnx/csrc/offline-tts.h @@ -54,8 +54,8 @@ struct GeneratedAudio { class OfflineTtsImpl; -using GeneratedAudioCallback = - std::function; +using GeneratedAudioCallback = std::function; class OfflineTts { public: diff --git a/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc b/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc index 2cb17bd6..33b4f193 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc @@ -44,7 +44,8 @@ static void Handler(int32_t /*sig*/) { fprintf(stderr, "\nCaught Ctrl + C. Exiting\n"); } -static void AudioGeneratedCallback(const float *s, int32_t n) { +static void AudioGeneratedCallback(const float *s, int32_t n, + float /*progress*/) { if (n > 0) { std::lock_guard lock(g_buffer.mutex); g_buffer.samples.push({s, s + n}); diff --git a/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play.cc b/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play.cc index c6dee345..8c919702 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play.cc @@ -47,7 +47,8 @@ static void Handler(int32_t /*sig*/) { fprintf(stderr, "\nCaught Ctrl + C. 
Exiting\n"); } -static void AudioGeneratedCallback(const float *s, int32_t n, float /*progress*/) { +static void AudioGeneratedCallback(const float *s, int32_t n, + float /*progress*/) { if (n > 0) { Samples samples; samples.data = std::vector{s, s + n}; diff --git a/sherpa-onnx/csrc/sherpa-onnx-offline-tts.cc b/sherpa-onnx/csrc/sherpa-onnx-offline-tts.cc index aeab20ff..6f5a1225 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-offline-tts.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-offline-tts.cc @@ -9,9 +9,8 @@ #include "sherpa-onnx/csrc/parse-options.h" #include "sherpa-onnx/csrc/wave-writer.h" -void audioCallback(const float *samples, int32_t n, float progress) -{ - printf( "sample=%d, progress=%f\n", n, progress ); +void audioCallback(const float *samples, int32_t n, float progress) { + printf("sample=%d, progress=%f\n", n, progress); } int main(int32_t argc, char *argv[]) { diff --git a/sherpa-onnx/csrc/speaker-embedding-manager.cc b/sherpa-onnx/csrc/speaker-embedding-manager.cc index fd60f4fd..e067a2eb 100644 --- a/sherpa-onnx/csrc/speaker-embedding-manager.cc +++ b/sherpa-onnx/csrc/speaker-embedding-manager.cc @@ -93,7 +93,7 @@ class SpeakerEmbeddingManager::Impl { int32_t num_rows = embedding_matrix_.rows(); if (row_idx < num_rows - 1) { - embedding_matrix_.block(row_idx, 0, num_rows - -1 - row_idx, dim_) = + embedding_matrix_.block(row_idx, 0, num_rows - 1 - row_idx, dim_) = embedding_matrix_.bottomRows(num_rows - 1 - row_idx); } diff --git a/sherpa-onnx/jni/jni.cc b/sherpa-onnx/jni/jni.cc index 5d874bc6..281fd4ee 100644 --- a/sherpa-onnx/jni/jni.cc +++ b/sherpa-onnx/jni/jni.cc @@ -795,9 +795,10 @@ class SherpaOnnxOfflineTts { explicit SherpaOnnxOfflineTts(const OfflineTtsConfig &config) : tts_(config) {} - GeneratedAudio Generate( - const std::string &text, int64_t sid = 0, float speed = 1.0, - std::function callback = nullptr) const { + GeneratedAudio Generate(const std::string &text, int64_t sid = 0, + float speed = 1.0, + std::function + callback = nullptr) const 
{ return tts_.Generate(text, sid, speed, callback); } diff --git a/sherpa-onnx/python/csrc/offline-tts.cc b/sherpa-onnx/python/csrc/offline-tts.cc index 82006330..ff31ded9 100644 --- a/sherpa-onnx/python/csrc/offline-tts.cc +++ b/sherpa-onnx/python/csrc/offline-tts.cc @@ -55,14 +55,16 @@ void PybindOfflineTts(py::module *m) { .def( "generate", [](const PyClass &self, const std::string &text, int64_t sid, - float speed, std::function, float)> callback) + float speed, + std::function, float)> callback) -> GeneratedAudio { if (!callback) { return self.Generate(text, sid, speed); } - std::function callback_wrapper = - [callback](const float *samples, int32_t n, float progress) { + std::function + callback_wrapper = [callback](const float *samples, int32_t n, + float progress) { // CAUTION(fangjun): we have to copy samples since it is // freed once the call back returns.