Support including TTS conditionally. (#699)

This commit is contained in:
Fangjun Kuang
2024-03-26 17:21:35 +08:00
committed by GitHub
parent bd66f7a7d0
commit 4e040c596e
14 changed files with 413 additions and 77 deletions

View File

@@ -15,13 +15,16 @@
#include "sherpa-onnx/csrc/keyword-spotter.h"
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/offline-recognizer.h"
#include "sherpa-onnx/csrc/offline-tts.h"
#include "sherpa-onnx/csrc/online-recognizer.h"
#include "sherpa-onnx/csrc/spoken-language-identification.h"
#include "sherpa-onnx/csrc/voice-activity-detector.h"
#include "sherpa-onnx/csrc/wave-reader.h"
#include "sherpa-onnx/csrc/wave-writer.h"
#if SHERPA_ONNX_ENABLE_TTS == 1
#include "sherpa-onnx/csrc/offline-tts.h"
#endif
// Wrapper struct holding the C++ online recognizer; `impl` is the sole owner
// of the sherpa_onnx::OnlineRecognizer instance.
struct SherpaOnnxOnlineRecognizer {
std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl;
};
@@ -742,6 +745,7 @@ void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) {
p->impl->Reset();
}
#if SHERPA_ONNX_ENABLE_TTS == 1
// Wrapper struct holding the C++ offline TTS engine; `impl` is the sole owner
// of the sherpa_onnx::OfflineTts instance.
struct SherpaOnnxOfflineTts {
std::unique_ptr<sherpa_onnx::OfflineTts> impl;
};
@@ -857,6 +861,7 @@ void SherpaOnnxDestroyOfflineTtsGeneratedAudio(
delete p;
}
}
#endif // SHERPA_ONNX_ENABLE_TTS == 1
int32_t SherpaOnnxWriteWave(const float *samples, int32_t n,
int32_t sample_rate, const char *filename) {

View File

@@ -43,7 +43,6 @@ set(sources
offline-transducer-model-config.cc
offline-transducer-model.cc
offline-transducer-modified-beam-search-decoder.cc
offline-tts-character-frontend.cc
offline-wenet-ctc-model-config.cc
offline-wenet-ctc-model.cc
offline-whisper-greedy-search-decoder.cc
@@ -79,7 +78,6 @@ set(sources
packed-sequence.cc
pad-sequence.cc
parse-options.cc
piper-phonemize-lexicon.cc
provider.cc
resample.cc
session.cc
@@ -99,6 +97,7 @@ set(sources
vad-model.cc
voice-activity-detector.cc
wave-reader.cc
wave-writer.cc
)
# speaker embedding extractor
@@ -110,15 +109,18 @@ list(APPEND sources
speaker-embedding-manager.cc
)
list(APPEND sources
lexicon.cc
offline-tts-impl.cc
offline-tts-model-config.cc
offline-tts-vits-model-config.cc
offline-tts-vits-model.cc
offline-tts.cc
wave-writer.cc
)
# The TTS source files below are appended to `sources` only when
# SHERPA_ONNX_ENABLE_TTS is enabled.
if(SHERPA_ONNX_ENABLE_TTS)
list(APPEND sources
lexicon.cc
offline-tts-character-frontend.cc
offline-tts-impl.cc
offline-tts-model-config.cc
offline-tts-vits-model-config.cc
offline-tts-vits-model.cc
offline-tts.cc
piper-phonemize-lexicon.cc
)
endif()
if(SHERPA_ONNX_ENABLE_CHECK)
list(APPEND sources log.cc)
@@ -130,14 +132,21 @@ if(APPLE)
)
endif()
# When ANDROID_NDK is set, link sherpa-onnx-core against the `android` and
# `log` libraries.
if(ANDROID_NDK)
target_link_libraries(sherpa-onnx-core android log)
endif()
target_link_libraries(sherpa-onnx-core kaldi-native-fbank-core)
# Link sherpa-onnx-core against kaldi-native-fbank-core and kaldi-decoder-core.
target_link_libraries(sherpa-onnx-core
kaldi-native-fbank-core
kaldi-decoder-core
)
target_link_libraries(sherpa-onnx-core kaldi-decoder-core)
# With SHERPA_ONNX_ENABLE_GPU, additionally link the onnxruntime CUDA and
# shared provider libraries.
if(SHERPA_ONNX_ENABLE_GPU)
target_link_libraries(sherpa-onnx-core
onnxruntime_providers_cuda
onnxruntime_providers_shared
)
endif()
if(BUILD_SHARED_LIBS)
target_link_libraries(sherpa-onnx-core onnxruntime)
@@ -151,15 +160,10 @@ if(NOT BUILD_SHARED_LIBS AND APPLE)
target_link_libraries(sherpa-onnx-core "-framework Foundation")
endif()
if(SHERPA_ONNX_ENABLE_GPU)
target_link_libraries(sherpa-onnx-core
onnxruntime_providers_cuda
onnxruntime_providers_shared
)
# Link piper_phonemize when SHERPA_ONNX_ENABLE_TTS is enabled.
if(SHERPA_ONNX_ENABLE_TTS)
target_link_libraries(sherpa-onnx-core piper_phonemize)
endif()
target_link_libraries(sherpa-onnx-core piper_phonemize)
if(SHERPA_ONNX_ENABLE_CHECK)
target_compile_definitions(sherpa-onnx-core PUBLIC SHERPA_ONNX_ENABLE_CHECK=1)
@@ -185,17 +189,24 @@ if(SHERPA_ONNX_ENABLE_BINARY)
add_executable(sherpa-onnx-keyword-spotter sherpa-onnx-keyword-spotter.cc)
add_executable(sherpa-onnx-offline sherpa-onnx-offline.cc)
add_executable(sherpa-onnx-offline-parallel sherpa-onnx-offline-parallel.cc)
add_executable(sherpa-onnx-offline-tts sherpa-onnx-offline-tts.cc)
add_executable(sherpa-onnx-offline-language-identification sherpa-onnx-offline-language-identification.cc)
# The sherpa-onnx-offline-tts executable is defined only when
# SHERPA_ONNX_ENABLE_TTS is enabled.
if(SHERPA_ONNX_ENABLE_TTS)
add_executable(sherpa-onnx-offline-tts sherpa-onnx-offline-tts.cc)
endif()
set(main_exes
sherpa-onnx
sherpa-onnx-keyword-spotter
sherpa-onnx-offline
sherpa-onnx-offline-parallel
sherpa-onnx-offline-tts
sherpa-onnx-offline-language-identification
)
# Add the TTS executable to `main_exes` only when SHERPA_ONNX_ENABLE_TTS is
# enabled, i.e. only when that target has been defined.
if(SHERPA_ONNX_ENABLE_TTS)
list(APPEND main_exes
sherpa-onnx-offline-tts
)
endif()
foreach(exe IN LISTS main_exes)
target_link_libraries(${exe} sherpa-onnx-core)
@@ -235,17 +246,27 @@ endif()
if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY)
add_executable(sherpa-onnx-alsa sherpa-onnx-alsa.cc alsa.cc)
add_executable(sherpa-onnx-keyword-spotter-alsa sherpa-onnx-keyword-spotter-alsa.cc alsa.cc)
add_executable(sherpa-onnx-offline-tts-play-alsa sherpa-onnx-offline-tts-play-alsa.cc alsa-play.cc)
add_executable(sherpa-onnx-alsa-offline sherpa-onnx-alsa-offline.cc alsa.cc)
add_executable(sherpa-onnx-alsa-offline-speaker-identification sherpa-onnx-alsa-offline-speaker-identification.cc alsa.cc)
# The ALSA playback TTS executable is defined only when
# SHERPA_ONNX_ENABLE_TTS is enabled.
if(SHERPA_ONNX_ENABLE_TTS)
add_executable(sherpa-onnx-offline-tts-play-alsa sherpa-onnx-offline-tts-play-alsa.cc alsa-play.cc)
endif()
set(exes
sherpa-onnx-alsa
sherpa-onnx-keyword-spotter-alsa
sherpa-onnx-alsa-offline
sherpa-onnx-offline-tts-play-alsa
sherpa-onnx-alsa-offline-speaker-identification
)
# Add the ALSA playback TTS executable to `exes` only when
# SHERPA_ONNX_ENABLE_TTS is enabled.
if(SHERPA_ONNX_ENABLE_TTS)
list(APPEND exes
sherpa-onnx-offline-tts-play-alsa
)
endif()
foreach(exe IN LISTS exes)
target_link_libraries(${exe} sherpa-onnx-core)
endforeach()
@@ -279,10 +300,12 @@ if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY)
endif()
if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY)
add_executable(sherpa-onnx-offline-tts-play
sherpa-onnx-offline-tts-play.cc
microphone.cc
)
# The sherpa-onnx-offline-tts-play executable (built from
# sherpa-onnx-offline-tts-play.cc and microphone.cc) is defined only when
# SHERPA_ONNX_ENABLE_TTS is enabled.
if(SHERPA_ONNX_ENABLE_TTS)
add_executable(sherpa-onnx-offline-tts-play
sherpa-onnx-offline-tts-play.cc
microphone.cc
)
endif()
add_executable(sherpa-onnx-keyword-spotter-microphone
sherpa-onnx-keyword-spotter-microphone.cc
@@ -325,10 +348,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY)
sherpa-onnx-keyword-spotter-microphone
sherpa-onnx-microphone-offline
sherpa-onnx-microphone-offline-speaker-identification
sherpa-onnx-offline-tts-play
sherpa-onnx-vad-microphone
sherpa-onnx-vad-microphone-offline-asr
)
# Add sherpa-onnx-offline-tts-play to `exes` only when
# SHERPA_ONNX_ENABLE_TTS is enabled.
if(SHERPA_ONNX_ENABLE_TTS)
list(APPEND exes
sherpa-onnx-offline-tts-play
)
endif()
foreach(exe IN LISTS exes)
target_link_libraries(${exe} ${PA_LIB} sherpa-onnx-core)
endforeach()
@@ -369,10 +397,8 @@ if(SHERPA_ONNX_ENABLE_WEBSOCKET AND SHERPA_ONNX_ENABLE_BINARY)
target_link_libraries(sherpa-onnx-online-websocket-client sherpa-onnx-core)
if(NOT WIN32)
target_link_libraries(sherpa-onnx-online-websocket-server -pthread)
target_compile_options(sherpa-onnx-online-websocket-server PRIVATE -Wno-deprecated-declarations)
target_link_libraries(sherpa-onnx-online-websocket-client -pthread)
target_compile_options(sherpa-onnx-online-websocket-client PRIVATE -Wno-deprecated-declarations)
endif()
@@ -384,7 +410,6 @@ if(SHERPA_ONNX_ENABLE_WEBSOCKET AND SHERPA_ONNX_ENABLE_BINARY)
target_link_libraries(sherpa-onnx-offline-websocket-server sherpa-onnx-core)
if(NOT WIN32)
target_link_libraries(sherpa-onnx-offline-websocket-server -pthread)
target_compile_options(sherpa-onnx-offline-websocket-server PRIVATE -Wno-deprecated-declarations)
endif()
@@ -422,13 +447,17 @@ if(SHERPA_ONNX_ENABLE_TESTS)
context-graph-test.cc
packed-sequence-test.cc
pad-sequence-test.cc
piper-phonemize-test.cc
slice-test.cc
stack-test.cc
transpose-test.cc
unbind-test.cc
utfcpp-test.cc
)
# piper-phonemize-test.cc is added to the test sources only when
# SHERPA_ONNX_ENABLE_TTS is enabled.
if(SHERPA_ONNX_ENABLE_TTS)
list(APPEND sherpa_onnx_test_srcs
piper-phonemize-test.cc
)
endif()
list(APPEND sherpa_onnx_test_srcs
speaker-embedding-manager-test.cc

View File

@@ -24,7 +24,6 @@
#include "sherpa-onnx/csrc/keyword-spotter.h"
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/offline-recognizer.h"
#include "sherpa-onnx/csrc/offline-tts.h"
#include "sherpa-onnx/csrc/online-recognizer.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
#include "sherpa-onnx/csrc/speaker-embedding-extractor.h"
@@ -33,6 +32,10 @@
#include "sherpa-onnx/csrc/wave-reader.h"
#include "sherpa-onnx/csrc/wave-writer.h"
#if SHERPA_ONNX_ENABLE_TTS == 1
#include "sherpa-onnx/csrc/offline-tts.h"
#endif
#define SHERPA_ONNX_EXTERN_C extern "C"
namespace sherpa_onnx {
@@ -629,8 +632,8 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) {
env->ReleaseStringUTFChars(s, p);
fid = env->GetFieldID(whisper_config_cls, "tailPaddings", "I");
ans.model_config.whisper.tail_paddings = env->GetIntField(whisper_config,
fid);
ans.model_config.whisper.tail_paddings =
env->GetIntField(whisper_config, fid);
return ans;
}
@@ -782,6 +785,7 @@ static VadModelConfig GetVadModelConfig(JNIEnv *env, jobject config) {
return ans;
}
#if SHERPA_ONNX_ENABLE_TTS == 1
class SherpaOnnxOfflineTts {
public:
#if __ANDROID_API__ >= 9
@@ -878,6 +882,7 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
return ans;
}
#endif
} // namespace sherpa_onnx
@@ -1209,6 +1214,15 @@ Java_com_k2fsa_sherpa_onnx_SpeakerEmbeddingManager_allSpeakerNames(
return obj_arr;
}
// Boxes a native int32_t into a java.lang.Integer by invoking Integer(int).
//
// see
// https://stackoverflow.com/questions/29043872/android-jni-return-multiple-variables
//
// @param env   JNI environment of the calling thread.
// @param value The value to box.
// @return A local reference to the new Integer, or nullptr if the class or
//         its constructor could not be resolved or the object could not be
//         created (a Java exception is pending in that case).
static jobject NewInteger(JNIEnv *env, int32_t value) {
  jclass cls = env->FindClass("java/lang/Integer");
  if (cls == nullptr) {
    // Passing a null class to GetMethodID is not allowed; bail out instead.
    return nullptr;
  }

  jobject ans = nullptr;
  jmethodID constructor = env->GetMethodID(cls, "<init>", "(I)V");
  if (constructor != nullptr) {
    // jint is the JNI type corresponding to the Java `int` parameter.
    ans = env->NewObject(cls, constructor, static_cast<jint>(value));
  }

  // Release the class reference right away so that repeated calls within one
  // native method do not accumulate local references.
  env->DeleteLocalRef(cls);

  return ans;
}
#if SHERPA_ONNX_ENABLE_TTS == 1
SHERPA_ONNX_EXTERN_C
JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_OfflineTts_new(
JNIEnv *env, jobject /*obj*/, jobject asset_manager, jobject _config) {
@@ -1265,14 +1279,6 @@ JNIEXPORT jint JNICALL Java_com_k2fsa_sherpa_onnx_OfflineTts_getNumSpeakers(
->NumSpeakers();
}
// Boxes a native int32_t into a java.lang.Integer by invoking Integer(int).
//
// see
// https://stackoverflow.com/questions/29043872/android-jni-return-multiple-variables
//
// @param env   JNI environment of the calling thread.
// @param value The value to box.
// @return A local reference to the new Integer, or nullptr if the class or
//         its constructor could not be resolved or the object could not be
//         created (a Java exception is pending in that case).
static jobject NewInteger(JNIEnv *env, int32_t value) {
  jclass cls = env->FindClass("java/lang/Integer");
  if (cls == nullptr) {
    // Passing a null class to GetMethodID is not allowed; bail out instead.
    return nullptr;
  }

  jobject ans = nullptr;
  jmethodID constructor = env->GetMethodID(cls, "<init>", "(I)V");
  if (constructor != nullptr) {
    // jint is the JNI type corresponding to the Java `int` parameter.
    ans = env->NewObject(cls, constructor, static_cast<jint>(value));
  }

  // Release the class reference right away so that repeated calls within one
  // native method do not accumulate local references.
  env->DeleteLocalRef(cls);

  return ans;
}
SHERPA_ONNX_EXTERN_C
JNIEXPORT jobjectArray JNICALL
Java_com_k2fsa_sherpa_onnx_OfflineTts_generateImpl(JNIEnv *env, jobject /*obj*/,
@@ -1336,6 +1342,7 @@ Java_com_k2fsa_sherpa_onnx_OfflineTts_generateWithCallbackImpl(
return obj_arr;
}
#endif
SHERPA_ONNX_EXTERN_C
JNIEXPORT jboolean JNICALL Java_com_k2fsa_sherpa_onnx_GeneratedAudio_saveImpl(

View File

@@ -15,9 +15,6 @@ set(srcs
offline-stream.cc
offline-tdnn-model-config.cc
offline-transducer-model-config.cc
offline-tts-model-config.cc
offline-tts-vits-model-config.cc
offline-tts.cc
offline-wenet-ctc-model-config.cc
offline-whisper-model-config.cc
offline-zipformer-ctc-model-config.cc
@@ -44,6 +41,14 @@ else()
list(APPEND srcs faked-alsa.cc)
endif()
# The TTS binding sources are appended to `srcs` only when
# SHERPA_ONNX_ENABLE_TTS is enabled.
if(SHERPA_ONNX_ENABLE_TTS)
list(APPEND srcs
offline-tts-model-config.cc
offline-tts-vits-model-config.cc
offline-tts.cc
)
endif()
pybind11_add_module(_sherpa_onnx ${srcs})
if(APPLE)

View File

@@ -15,7 +15,6 @@
#include "sherpa-onnx/python/csrc/offline-model-config.h"
#include "sherpa-onnx/python/csrc/offline-recognizer.h"
#include "sherpa-onnx/python/csrc/offline-stream.h"
#include "sherpa-onnx/python/csrc/offline-tts.h"
#include "sherpa-onnx/python/csrc/online-lm-config.h"
#include "sherpa-onnx/python/csrc/online-model-config.h"
#include "sherpa-onnx/python/csrc/online-recognizer.h"
@@ -27,6 +26,10 @@
#include "sherpa-onnx/python/csrc/vad-model.h"
#include "sherpa-onnx/python/csrc/voice-activity-detector.h"
#if SHERPA_ONNX_ENABLE_TTS == 1
#include "sherpa-onnx/python/csrc/offline-tts.h"
#endif
namespace sherpa_onnx {
PYBIND11_MODULE(_sherpa_onnx, m) {
@@ -53,7 +56,10 @@ PYBIND11_MODULE(_sherpa_onnx, m) {
PybindCircularBuffer(&m);
PybindVoiceActivityDetector(&m);
#if SHERPA_ONNX_ENABLE_TTS == 1
PybindOfflineTts(&m);
#endif
PybindSpeakerEmbeddingExtractor(&m);
PybindSpeakerEmbeddingManager(&m);
PybindSpokenLanguageIdentification(&m);