Support including TTS conditionally. (#699)

2024-03-26 17:21:35 +08:00
parent bd66f7a7d0
commit 4e040c596e
14 changed files with 413 additions and 77 deletions
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -15,13 +15,16 @@
 #include "sherpa-onnx/csrc/keyword-spotter.h"
 #include "sherpa-onnx/csrc/macros.h"
 #include "sherpa-onnx/csrc/offline-recognizer.h"
-#include "sherpa-onnx/csrc/offline-tts.h"
 #include "sherpa-onnx/csrc/online-recognizer.h"
 #include "sherpa-onnx/csrc/spoken-language-identification.h"
 #include "sherpa-onnx/csrc/voice-activity-detector.h"
 #include "sherpa-onnx/csrc/wave-reader.h"
 #include "sherpa-onnx/csrc/wave-writer.h"

+#if SHERPA_ONNX_ENABLE_TTS == 1
+#include "sherpa-onnx/csrc/offline-tts.h"
+#endif
+
 struct SherpaOnnxOnlineRecognizer {
  std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl;
 };
@@ -742,6 +745,7 @@ void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) {
  p->impl->Reset();
 }

+#if SHERPA_ONNX_ENABLE_TTS == 1
 struct SherpaOnnxOfflineTts {
  std::unique_ptr<sherpa_onnx::OfflineTts> impl;
 };
@@ -857,6 +861,7 @@ void SherpaOnnxDestroyOfflineTtsGeneratedAudio(
    delete p;
  }
 }
+#endif  // SHERPA_ONNX_ENABLE_TTS == 1

 int32_t SherpaOnnxWriteWave(const float *samples, int32_t n,
                            int32_t sample_rate, const char *filename) {
--- a/sherpa-onnx/csrc/CMakeLists.txt
+++ b/sherpa-onnx/csrc/CMakeLists.txt
@@ -43,7 +43,6 @@ set(sources
  offline-transducer-model-config.cc
  offline-transducer-model.cc
  offline-transducer-modified-beam-search-decoder.cc
-  offline-tts-character-frontend.cc
  offline-wenet-ctc-model-config.cc
  offline-wenet-ctc-model.cc
  offline-whisper-greedy-search-decoder.cc
@@ -79,7 +78,6 @@ set(sources
  packed-sequence.cc
  pad-sequence.cc
  parse-options.cc
-  piper-phonemize-lexicon.cc
  provider.cc
  resample.cc
  session.cc
@@ -99,6 +97,7 @@ set(sources
  vad-model.cc
  voice-activity-detector.cc
  wave-reader.cc
+  wave-writer.cc
 )

 # speaker embedding extractor
@@ -110,15 +109,18 @@ list(APPEND sources
  speaker-embedding-manager.cc
 )

-list(APPEND sources
-  lexicon.cc
-  offline-tts-impl.cc
-  offline-tts-model-config.cc
-  offline-tts-vits-model-config.cc
-  offline-tts-vits-model.cc
-  offline-tts.cc
-  wave-writer.cc
-)
+if(SHERPA_ONNX_ENABLE_TTS)  # append TTS-only sources when TTS is enabled
+  list(APPEND sources
+    lexicon.cc
+    offline-tts-character-frontend.cc
+    offline-tts-impl.cc
+    offline-tts-model-config.cc
+    offline-tts-vits-model-config.cc
+    offline-tts-vits-model.cc
+    offline-tts.cc
+    piper-phonemize-lexicon.cc
+  )
+endif()  # SHERPA_ONNX_ENABLE_TTS

 if(SHERPA_ONNX_ENABLE_CHECK)
  list(APPEND sources log.cc)
@@ -130,14 +132,21 @@ if(APPLE)
  )
 endif()

-
 if(ANDROID_NDK)
  target_link_libraries(sherpa-onnx-core android log)
 endif()

-target_link_libraries(sherpa-onnx-core kaldi-native-fbank-core)
+target_link_libraries(sherpa-onnx-core
+  kaldi-native-fbank-core
+  kaldi-decoder-core
+)

-target_link_libraries(sherpa-onnx-core kaldi-decoder-core)
+if(SHERPA_ONNX_ENABLE_GPU)
+  target_link_libraries(sherpa-onnx-core
+    onnxruntime_providers_cuda
+    onnxruntime_providers_shared
+  )
+endif()

 if(BUILD_SHARED_LIBS)
  target_link_libraries(sherpa-onnx-core onnxruntime)
@@ -151,15 +160,10 @@ if(NOT BUILD_SHARED_LIBS AND APPLE)
  target_link_libraries(sherpa-onnx-core "-framework Foundation")
 endif()

-if(SHERPA_ONNX_ENABLE_GPU)
-  target_link_libraries(sherpa-onnx-core
-    onnxruntime_providers_cuda
-    onnxruntime_providers_shared
-  )
+if(SHERPA_ONNX_ENABLE_TTS)
+  target_link_libraries(sherpa-onnx-core piper_phonemize)
 endif()

-target_link_libraries(sherpa-onnx-core piper_phonemize)
-
 if(SHERPA_ONNX_ENABLE_CHECK)
  target_compile_definitions(sherpa-onnx-core PUBLIC SHERPA_ONNX_ENABLE_CHECK=1)

@@ -185,17 +189,24 @@ if(SHERPA_ONNX_ENABLE_BINARY)
  add_executable(sherpa-onnx-keyword-spotter sherpa-onnx-keyword-spotter.cc)
  add_executable(sherpa-onnx-offline sherpa-onnx-offline.cc)
  add_executable(sherpa-onnx-offline-parallel sherpa-onnx-offline-parallel.cc)
-  add_executable(sherpa-onnx-offline-tts sherpa-onnx-offline-tts.cc)
  add_executable(sherpa-onnx-offline-language-identification sherpa-onnx-offline-language-identification.cc)

+  if(SHERPA_ONNX_ENABLE_TTS)
+    add_executable(sherpa-onnx-offline-tts sherpa-onnx-offline-tts.cc)
+  endif()
+
  set(main_exes
    sherpa-onnx
    sherpa-onnx-keyword-spotter
    sherpa-onnx-offline
    sherpa-onnx-offline-parallel
-    sherpa-onnx-offline-tts
    sherpa-onnx-offline-language-identification
  )
+  if(SHERPA_ONNX_ENABLE_TTS)
+    list(APPEND main_exes
+      sherpa-onnx-offline-tts
+    )
+  endif()

  foreach(exe IN LISTS main_exes)
    target_link_libraries(${exe} sherpa-onnx-core)
@@ -235,17 +246,27 @@ endif()
 if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY)
  add_executable(sherpa-onnx-alsa sherpa-onnx-alsa.cc alsa.cc)
  add_executable(sherpa-onnx-keyword-spotter-alsa sherpa-onnx-keyword-spotter-alsa.cc alsa.cc)
-  add_executable(sherpa-onnx-offline-tts-play-alsa sherpa-onnx-offline-tts-play-alsa.cc alsa-play.cc)
  add_executable(sherpa-onnx-alsa-offline sherpa-onnx-alsa-offline.cc alsa.cc)
  add_executable(sherpa-onnx-alsa-offline-speaker-identification sherpa-onnx-alsa-offline-speaker-identification.cc alsa.cc)

+
+  if(SHERPA_ONNX_ENABLE_TTS)
+    add_executable(sherpa-onnx-offline-tts-play-alsa sherpa-onnx-offline-tts-play-alsa.cc alsa-play.cc)
+  endif()
+
  set(exes
    sherpa-onnx-alsa
    sherpa-onnx-keyword-spotter-alsa
    sherpa-onnx-alsa-offline
-    sherpa-onnx-offline-tts-play-alsa
    sherpa-onnx-alsa-offline-speaker-identification
  )
+
+  if(SHERPA_ONNX_ENABLE_TTS)
+    list(APPEND exes
+      sherpa-onnx-offline-tts-play-alsa
+    )
+  endif()
+
  foreach(exe IN LISTS exes)
    target_link_libraries(${exe} sherpa-onnx-core)
  endforeach()
@@ -279,10 +300,12 @@ if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY)
 endif()

 if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY)
-  add_executable(sherpa-onnx-offline-tts-play
-    sherpa-onnx-offline-tts-play.cc
-    microphone.cc
-  )
+  if(SHERPA_ONNX_ENABLE_TTS)
+    add_executable(sherpa-onnx-offline-tts-play
+      sherpa-onnx-offline-tts-play.cc
+      microphone.cc
+    )
+  endif()

  add_executable(sherpa-onnx-keyword-spotter-microphone
    sherpa-onnx-keyword-spotter-microphone.cc
@@ -325,10 +348,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY)
    sherpa-onnx-keyword-spotter-microphone
    sherpa-onnx-microphone-offline
    sherpa-onnx-microphone-offline-speaker-identification
-    sherpa-onnx-offline-tts-play
    sherpa-onnx-vad-microphone
    sherpa-onnx-vad-microphone-offline-asr
  )
+  if(SHERPA_ONNX_ENABLE_TTS)
+    list(APPEND exes
+      sherpa-onnx-offline-tts-play
+    )
+  endif()
+
  foreach(exe IN LISTS exes)
    target_link_libraries(${exe} ${PA_LIB} sherpa-onnx-core)
  endforeach()
@@ -369,10 +397,8 @@ if(SHERPA_ONNX_ENABLE_WEBSOCKET AND SHERPA_ONNX_ENABLE_BINARY)
  target_link_libraries(sherpa-onnx-online-websocket-client sherpa-onnx-core)

  if(NOT WIN32)
-    target_link_libraries(sherpa-onnx-online-websocket-server -pthread)
    target_compile_options(sherpa-onnx-online-websocket-server PRIVATE -Wno-deprecated-declarations)

-    target_link_libraries(sherpa-onnx-online-websocket-client -pthread)
    target_compile_options(sherpa-onnx-online-websocket-client PRIVATE -Wno-deprecated-declarations)
  endif()

@@ -384,7 +410,6 @@ if(SHERPA_ONNX_ENABLE_WEBSOCKET AND SHERPA_ONNX_ENABLE_BINARY)
  target_link_libraries(sherpa-onnx-offline-websocket-server sherpa-onnx-core)

  if(NOT WIN32)
-    target_link_libraries(sherpa-onnx-offline-websocket-server -pthread)
    target_compile_options(sherpa-onnx-offline-websocket-server PRIVATE -Wno-deprecated-declarations)
  endif()

@@ -422,13 +447,17 @@ if(SHERPA_ONNX_ENABLE_TESTS)
    context-graph-test.cc
    packed-sequence-test.cc
    pad-sequence-test.cc
-    piper-phonemize-test.cc
    slice-test.cc
    stack-test.cc
    transpose-test.cc
    unbind-test.cc
    utfcpp-test.cc
  )
+  if(SHERPA_ONNX_ENABLE_TTS)
+    list(APPEND sherpa_onnx_test_srcs
+      piper-phonemize-test.cc
+    )
+  endif()

  list(APPEND sherpa_onnx_test_srcs
    speaker-embedding-manager-test.cc
--- a/sherpa-onnx/jni/jni.cc
+++ b/sherpa-onnx/jni/jni.cc
@@ -24,7 +24,6 @@
 #include "sherpa-onnx/csrc/keyword-spotter.h"
 #include "sherpa-onnx/csrc/macros.h"
 #include "sherpa-onnx/csrc/offline-recognizer.h"
-#include "sherpa-onnx/csrc/offline-tts.h"
 #include "sherpa-onnx/csrc/online-recognizer.h"
 #include "sherpa-onnx/csrc/onnx-utils.h"
 #include "sherpa-onnx/csrc/speaker-embedding-extractor.h"
@@ -33,6 +32,10 @@
 #include "sherpa-onnx/csrc/wave-reader.h"
 #include "sherpa-onnx/csrc/wave-writer.h"

+#if SHERPA_ONNX_ENABLE_TTS == 1
+#include "sherpa-onnx/csrc/offline-tts.h"
+#endif
+
 #define SHERPA_ONNX_EXTERN_C extern "C"

 namespace sherpa_onnx {
@@ -629,8 +632,8 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) {
  env->ReleaseStringUTFChars(s, p);

  fid = env->GetFieldID(whisper_config_cls, "tailPaddings", "I");
-  ans.model_config.whisper.tail_paddings = env->GetIntField(whisper_config,
-                                                            fid);
+  ans.model_config.whisper.tail_paddings =
+      env->GetIntField(whisper_config, fid);

  return ans;
 }
@@ -782,6 +785,7 @@ static VadModelConfig GetVadModelConfig(JNIEnv *env, jobject config) {
  return ans;
 }

+#if SHERPA_ONNX_ENABLE_TTS == 1
 class SherpaOnnxOfflineTts {
 public:
 #if __ANDROID_API__ >= 9
@@ -878,6 +882,7 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {

  return ans;
 }
+#endif  // SHERPA_ONNX_ENABLE_TTS == 1

 }  // namespace sherpa_onnx

@@ -1209,6 +1214,15 @@ Java_com_k2fsa_sherpa_onnx_SpeakerEmbeddingManager_allSpeakerNames(
  return obj_arr;
 }

+// Wraps `value` in a new java.lang.Integer built via its (I)V constructor. See
+// https://stackoverflow.com/questions/29043872/android-jni-return-multiple-variables
+static jobject NewInteger(JNIEnv *env, int32_t value) {
+  jclass cls = env->FindClass("java/lang/Integer");  // not null-checked
+  jmethodID constructor = env->GetMethodID(cls, "<init>", "(I)V");
+  return env->NewObject(cls, constructor, value);
+}
+
+#if SHERPA_ONNX_ENABLE_TTS == 1
 SHERPA_ONNX_EXTERN_C
 JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_OfflineTts_new(
    JNIEnv *env, jobject /*obj*/, jobject asset_manager, jobject _config) {
@@ -1265,14 +1279,6 @@ JNIEXPORT jint JNICALL Java_com_k2fsa_sherpa_onnx_OfflineTts_getNumSpeakers(
      ->NumSpeakers();
 }

-// see
-// https://stackoverflow.com/questions/29043872/android-jni-return-multiple-variables
-static jobject NewInteger(JNIEnv *env, int32_t value) {
-  jclass cls = env->FindClass("java/lang/Integer");
-  jmethodID constructor = env->GetMethodID(cls, "<init>", "(I)V");
-  return env->NewObject(cls, constructor, value);
-}
-
 SHERPA_ONNX_EXTERN_C
 JNIEXPORT jobjectArray JNICALL
 Java_com_k2fsa_sherpa_onnx_OfflineTts_generateImpl(JNIEnv *env, jobject /*obj*/,
@@ -1336,6 +1342,7 @@ Java_com_k2fsa_sherpa_onnx_OfflineTts_generateWithCallbackImpl(

  return obj_arr;
 }
+#endif  // SHERPA_ONNX_ENABLE_TTS == 1

 SHERPA_ONNX_EXTERN_C
 JNIEXPORT jboolean JNICALL Java_com_k2fsa_sherpa_onnx_GeneratedAudio_saveImpl(
--- a/sherpa-onnx/python/csrc/CMakeLists.txt
+++ b/sherpa-onnx/python/csrc/CMakeLists.txt
@@ -15,9 +15,6 @@ set(srcs
  offline-stream.cc
  offline-tdnn-model-config.cc
  offline-transducer-model-config.cc
-  offline-tts-model-config.cc
-  offline-tts-vits-model-config.cc
-  offline-tts.cc
  offline-wenet-ctc-model-config.cc
  offline-whisper-model-config.cc
  offline-zipformer-ctc-model-config.cc
@@ -44,6 +41,14 @@ else()
  list(APPEND srcs faked-alsa.cc)
 endif()

+if(SHERPA_ONNX_ENABLE_TTS)
+  list(APPEND srcs
+    offline-tts-model-config.cc
+    offline-tts-vits-model-config.cc
+    offline-tts.cc
+  )
+endif()
+
 pybind11_add_module(_sherpa_onnx ${srcs})

 if(APPLE)
--- a/sherpa-onnx/python/csrc/sherpa-onnx.cc
+++ b/sherpa-onnx/python/csrc/sherpa-onnx.cc
@@ -15,7 +15,6 @@
 #include "sherpa-onnx/python/csrc/offline-model-config.h"
 #include "sherpa-onnx/python/csrc/offline-recognizer.h"
 #include "sherpa-onnx/python/csrc/offline-stream.h"
-#include "sherpa-onnx/python/csrc/offline-tts.h"
 #include "sherpa-onnx/python/csrc/online-lm-config.h"
 #include "sherpa-onnx/python/csrc/online-model-config.h"
 #include "sherpa-onnx/python/csrc/online-recognizer.h"
@@ -27,6 +26,10 @@
 #include "sherpa-onnx/python/csrc/vad-model.h"
 #include "sherpa-onnx/python/csrc/voice-activity-detector.h"

+#if SHERPA_ONNX_ENABLE_TTS == 1
+#include "sherpa-onnx/python/csrc/offline-tts.h"
+#endif
+
 namespace sherpa_onnx {

 PYBIND11_MODULE(_sherpa_onnx, m) {
@@ -53,7 +56,10 @@ PYBIND11_MODULE(_sherpa_onnx, m) {
  PybindCircularBuffer(&m);
  PybindVoiceActivityDetector(&m);

+#if SHERPA_ONNX_ENABLE_TTS == 1
  PybindOfflineTts(&m);
+#endif
+
  PybindSpeakerEmbeddingExtractor(&m);
  PybindSpeakerEmbeddingManager(&m);
  PybindSpokenLanguageIdentification(&m);