Add CXX API for MatchaTTS models (#1676)

2025-01-03 14:16:36 +08:00
parent 9aa4897a9e
commit 648903834b
12 changed files with 403 additions and 8 deletions
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -1114,7 +1114,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig(
  return tts_config;
 }

-SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
+const SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
    const SherpaOnnxOfflineTtsConfig *config) {
  auto tts_config = GetOfflineTtsConfig(config);

@@ -1130,7 +1130,9 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
  return tts;
 }

-void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts) { delete tts; }
+// Free a TTS object. Applying `delete` to a null pointer is a no-op in C++,
+// so passing nullptr here is harmless.
+void SherpaOnnxDestroyOfflineTts(const SherpaOnnxOfflineTts *tts) {
+  delete tts;
+}

 int32_t SherpaOnnxOfflineTtsSampleRate(const SherpaOnnxOfflineTts *tts) {
  return tts->impl->SampleRate();
--- a/sherpa-onnx/c-api/c-api.h
+++ b/sherpa-onnx/c-api/c-api.h
@@ -950,11 +950,12 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts;

 // Create an instance of offline TTS. The user has to use DestroyOfflineTts()
 // to free the returned pointer to avoid memory leak.
-SHERPA_ONNX_API SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
+SHERPA_ONNX_API const SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
    const SherpaOnnxOfflineTtsConfig *config);

 // Free the pointer returned by SherpaOnnxCreateOfflineTts()
-SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts);
+SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts(
+    const SherpaOnnxOfflineTts *tts);

 // Return the sample rate of the current TTS object
 SHERPA_ONNX_API int32_t
@@ -984,7 +985,6 @@ SHERPA_ONNX_API
 const SherpaOnnxGeneratedAudio *
 SherpaOnnxOfflineTtsGenerateWithProgressCallback(
    const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
-
    SherpaOnnxGeneratedAudioProgressCallback callback);

 SHERPA_ONNX_API
--- a/sherpa-onnx/c-api/cxx-api.cc
+++ b/sherpa-onnx/c-api/cxx-api.cc
@@ -24,6 +24,11 @@ Wave ReadWave(const std::string &filename) {
  return ans;
 }

+// Save `wave` to `filename` by delegating to SherpaOnnxWriteWave() from the
+// C API. The status returned by the C function is converted to bool:
+// non-zero becomes true, zero becomes false.
+bool WriteWave(const std::string &filename, const Wave &wave) {
+  const auto &samples = wave.samples;
+  const auto status = SherpaOnnxWriteWave(samples.data(), samples.size(),
+                                          wave.sample_rate, filename.c_str());
+  return status != 0;
+}
+
 OnlineStream::OnlineStream(const SherpaOnnxOnlineStream *p)
    : MoveOnly<OnlineStream, SherpaOnnxOnlineStream>(p) {}

@@ -311,4 +316,73 @@ OfflineRecognizerResult OfflineRecognizer::GetResult(
  return ans;
 }

+// Translate the C++ `config` into the C API configuration struct and create
+// the underlying SherpaOnnxOfflineTts handle from it.
+//
+// String fields are passed as c_str() pointers into `config`; they stay valid
+// for the duration of the SherpaOnnxCreateOfflineTts() call because `config`
+// is a reference owned by the caller.
+OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) {
+  struct SherpaOnnxOfflineTtsConfig c_config;
+  // Zero every field first so that anything not assigned below is 0/null.
+  memset(&c_config, 0, sizeof(c_config));
+
+  // VITS model settings
+  const auto &vits = config.model.vits;
+  auto &c_vits = c_config.model.vits;
+  c_vits.model = vits.model.c_str();
+  c_vits.lexicon = vits.lexicon.c_str();
+  c_vits.tokens = vits.tokens.c_str();
+  c_vits.data_dir = vits.data_dir.c_str();
+  c_vits.dict_dir = vits.dict_dir.c_str();
+  c_vits.noise_scale = vits.noise_scale;
+  c_vits.noise_scale_w = vits.noise_scale_w;
+  c_vits.length_scale = vits.length_scale;
+
+  // Matcha model settings
+  const auto &matcha = config.model.matcha;
+  auto &c_matcha = c_config.model.matcha;
+  c_matcha.acoustic_model = matcha.acoustic_model.c_str();
+  c_matcha.vocoder = matcha.vocoder.c_str();
+  c_matcha.lexicon = matcha.lexicon.c_str();
+  c_matcha.tokens = matcha.tokens.c_str();
+  c_matcha.data_dir = matcha.data_dir.c_str();
+  c_matcha.dict_dir = matcha.dict_dir.c_str();
+  c_matcha.noise_scale = matcha.noise_scale;
+  c_matcha.length_scale = matcha.length_scale;
+
+  // Settings shared by all model types
+  auto &c_model = c_config.model;
+  c_model.num_threads = config.model.num_threads;
+  c_model.debug = config.model.debug;
+  c_model.provider = config.model.provider.c_str();
+
+  // Top-level TTS settings
+  c_config.rule_fsts = config.rule_fsts.c_str();
+  c_config.rule_fars = config.rule_fars.c_str();
+  c_config.max_num_sentences = config.max_num_sentences;
+
+  return OfflineTts(SherpaOnnxCreateOfflineTts(&c_config));
+}
+
+// Wrap the raw C handle `p` by handing it to the MoveOnly base class.
+OfflineTts::OfflineTts(const SherpaOnnxOfflineTts *p)
+    : MoveOnly<OfflineTts, SherpaOnnxOfflineTts>(p) {}
+
+// Release the C handle `p` via SherpaOnnxDestroyOfflineTts().
+void OfflineTts::Destroy(const SherpaOnnxOfflineTts *p) const {
+  SherpaOnnxDestroyOfflineTts(p);
+}
+
+// Return whatever SherpaOnnxOfflineTtsSampleRate() reports for the wrapped
+// handle p_.
+int32_t OfflineTts::SampleRate() const {
+  return SherpaOnnxOfflineTtsSampleRate(p_);
+}
+
+// Return whatever SherpaOnnxOfflineTtsNumSpeakers() reports for the wrapped
+// handle p_.
+int32_t OfflineTts::NumSpeakers() const {
+  return SherpaOnnxOfflineTtsNumSpeakers(p_);
+}
+
+// Synthesize `text` and return a copy of the generated samples together with
+// their sample rate.
+//
+// When `callback` is null, SherpaOnnxOfflineTtsGenerate() is used and `arg`
+// is ignored. Otherwise the callback variant of the C API is called and
+// `arg` is passed along with `callback`.
+//
+// If the C API hands back a null pointer, an empty GeneratedAudio with
+// sample_rate == 0 is returned instead of dereferencing the null pointer.
+GeneratedAudio OfflineTts::Generate(const std::string &text,
+                                    int32_t sid /*= 0*/, float speed /*= 1.0*/,
+                                    OfflineTtsCallback callback /*= nullptr*/,
+                                    void *arg /*= nullptr*/) const {
+  const SherpaOnnxGeneratedAudio *audio = nullptr;
+  if (!callback) {
+    audio = SherpaOnnxOfflineTtsGenerate(p_, text.c_str(), sid, speed);
+  } else {
+    audio = SherpaOnnxOfflineTtsGenerateWithProgressCallbackWithArg(
+        p_, text.c_str(), sid, speed, callback, arg);
+  }
+
+  GeneratedAudio ans;
+  // Set explicitly so the early return below never exposes an indeterminate
+  // value.
+  ans.sample_rate = 0;
+
+  if (!audio) {
+    // Nothing was generated; there is also nothing to free.
+    return ans;
+  }
+
+  if (audio->samples && audio->n > 0) {
+    // Copy the samples: the C buffer is released right below.
+    ans.samples.assign(audio->samples, audio->samples + audio->n);
+  }
+  ans.sample_rate = audio->sample_rate;
+
+  SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio);
+  return ans;
+}
+
 }  // namespace sherpa_onnx::cxx
--- a/sherpa-onnx/c-api/cxx-api.h
+++ b/sherpa-onnx/c-api/cxx-api.h
@@ -97,6 +97,10 @@ struct Wave {

 SHERPA_ONNX_API Wave ReadWave(const std::string &filename);

+// Return true on success;
+// Return false on failure
+SHERPA_ONNX_API bool WriteWave(const std::string &filename, const Wave &wave);
+
 template <typename Derived, typename T>
 class SHERPA_ONNX_API MoveOnly {
 public:
@@ -307,6 +311,91 @@ class SHERPA_ONNX_API OfflineRecognizer
  explicit OfflineRecognizer(const SherpaOnnxOfflineRecognizer *p);
 };

+// ============================================================================
+// Non-streaming TTS
+// ============================================================================
+// Settings for a VITS TTS model. OfflineTts::Create() copies every field
+// into the field of the same name in the C API configuration; string fields
+// are passed as c_str() pointers.
+struct OfflineTtsVitsModelConfig {
+  // NOTE(review): these strings look like file/directory paths -- confirm
+  // against the C API documentation.
+  std::string model;
+  std::string lexicon;
+  std::string tokens;
+  std::string data_dir;
+  std::string dict_dir;
+
+  float noise_scale = 0.667;
+  float noise_scale_w = 0.8;
+  float length_scale = 1.0;  // < 1, faster in speed; > 1, slower in speed
+};
+
+// Settings for a Matcha TTS model, which takes an acoustic model and a
+// vocoder. OfflineTts::Create() copies every field into the field of the
+// same name in the C API configuration; string fields are passed as c_str()
+// pointers.
+struct OfflineTtsMatchaModelConfig {
+  // NOTE(review): these strings look like file/directory paths -- confirm
+  // against the C API documentation.
+  std::string acoustic_model;
+  std::string vocoder;
+  std::string lexicon;
+  std::string tokens;
+  std::string data_dir;
+  std::string dict_dir;
+
+  float noise_scale = 0.667;
+  float length_scale = 1.0;  // < 1, faster in speed; > 1, slower in speed
+};
+
+// Model-level settings: one sub-config per supported model family plus the
+// options shared by both.
+struct OfflineTtsModelConfig {
+  OfflineTtsVitsModelConfig vits;
+  OfflineTtsMatchaModelConfig matcha;
+  int32_t num_threads = 1;
+  bool debug = false;
+  std::string provider = "cpu";
+};
+
+// Top-level configuration accepted by OfflineTts::Create().
+struct OfflineTtsConfig {
+  OfflineTtsModelConfig model;
+  std::string rule_fsts;
+  std::string rule_fars;
+  // The documentation of OfflineTts::Generate() ties the progress callback
+  // to this value: it is invoked whenever this many sentences have been
+  // processed.
+  int32_t max_num_sentences = 1;
+};
+
+// Audio returned by OfflineTts::Generate().
+struct GeneratedAudio {
+  std::vector<float> samples;  // in the range [-1, 1]
+  // Initialized to 0 so that a default-constructed object never carries an
+  // indeterminate sample rate.
+  int32_t sample_rate = 0;
+};
+
+// Signature of the progress callback accepted by OfflineTts::Generate().
+//
+// @param samples     Pointer to audio samples.
+// @param num_samples Number of elements in `samples`.
+// @param progress    Progress value reported by the library.
+// @param arg         Presumably the `arg` pointer given to
+//                    OfflineTts::Generate(); the C API implementation that
+//                    invokes the callback is not visible here -- confirm.
+//
+// Return 1 to continue generating
+// Return 0 to stop generating
+using OfflineTtsCallback = int32_t (*)(const float *samples,
+                                       int32_t num_samples, float progress,
+                                       void *arg);
+
+// Move-only C++ wrapper around the C API type SherpaOnnxOfflineTts.
+// Instances are obtained from Create(); the constructor is private.
+class SHERPA_ONNX_API OfflineTts
+    : public MoveOnly<OfflineTts, SherpaOnnxOfflineTts> {
+ public:
+  // Create a TTS object from `config`.
+  static OfflineTts Create(const OfflineTtsConfig &config);
+
+  // Free the C handle `p` by calling SherpaOnnxDestroyOfflineTts().
+  void Destroy(const SherpaOnnxOfflineTts *p) const;
+
+  // Return the sample rate of the generated audio
+  int32_t SampleRate() const;
+
+  // Number of supported speakers.
+  // If it supports only a single speaker, then it returns 0 or 1.
+  int32_t NumSpeakers() const;
+
+  // @param text A string containing words separated by spaces
+  // @param sid Speaker ID. Used only for multi-speaker models, e.g., models
+  //            trained using the VCTK dataset. It is not used for
+  //            single-speaker models, e.g., models trained using the ljspeech
+  //            dataset.
+  // @param speed The speed for the generated speech. E.g., 2 means 2x faster.
+  // @param callback If not NULL, it is called whenever config.max_num_sentences
+  //                 sentences have been processed. The callback is called in
+  //                 the current thread.
+  // @param arg Passed to the C API together with callback. It is ignored
+  //            when callback is NULL.
+  GeneratedAudio Generate(const std::string &text, int32_t sid = 0,
+                          float speed = 1.0,
+                          OfflineTtsCallback callback = nullptr,
+                          void *arg = nullptr) const;
+
+ private:
+  // Takes the raw C handle; only Create() is expected to call this.
+  explicit OfflineTts(const SherpaOnnxOfflineTts *p);
+};
+
 }  // namespace sherpa_onnx::cxx

 #endif  // SHERPA_ONNX_C_API_CXX_API_H_