Use espeak-ng for coqui-ai/TTS VITS English models. (#466)

2023-12-06 11:00:38 +08:00
parent 3b90e85ef2
commit 23cf92daf7
10 changed files with 230 additions and 93 deletions
--- a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc
+++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc
@@ -57,10 +57,17 @@ static std::unordered_map<char32_t, int32_t> ReadTokens(std::istream &is) {

    s = conv.from_bytes(sym);
    if (s.size() != 1) {
+      // for tokens.txt from coqui-ai/TTS, the last token is <BLNK>
+      if (s.size() == 6 && s[0] == '<' && s[1] == 'B' && s[2] == 'L' &&
+          s[3] == 'N' && s[4] == 'K' && s[5] == '>') {
+        continue;
+      }
+
      SHERPA_ONNX_LOGE("Error when reading tokens at Line %s. size: %d",
                       line.c_str(), static_cast<int32_t>(s.size()));
      exit(-1);
    }
+
    char32_t c = s[0];

    if (token2id.count(c)) {
@@ -77,7 +84,7 @@ static std::unordered_map<char32_t, int32_t> ReadTokens(std::istream &is) {

 // see the function "phonemes_to_ids" from
 // https://github.com/rhasspy/piper/blob/master/notebooks/piper_inference_(ONNX).ipynb
-static std::vector<int64_t> PhonemesToIds(
+static std::vector<int64_t> PiperPhonemesToIds(
    const std::unordered_map<char32_t, int32_t> &token2id,
    const std::vector<piper::Phoneme> &phonemes) {
  // see
@@ -104,6 +111,65 @@ static std::vector<int64_t> PhonemesToIds(
  return ans;
 }

+// Converts phonemes to token IDs following the coqui-ai/TTS tokenizer.
+// Emits [bos] [blank] (id [blank])* [comma] [eos]; the bos/eos and blank
+// entries depend on meta_data. Phonemes not in token2id are skipped.
+static std::vector<int64_t> CoquiPhonemesToIds(
+    const std::unordered_map<char32_t, int32_t> &token2id,
+    const std::vector<piper::Phoneme> &phonemes,
+    const OfflineTtsVitsModelMetaData &meta_data) {
+  // see
+  // https://github.com/coqui-ai/TTS/blob/dev/TTS/tts/utils/text/tokenizer.py#L87
+  int32_t use_eos_bos = meta_data.use_eos_bos;
+  int32_t bos_id = meta_data.bos_id;
+  int32_t eos_id = meta_data.eos_id;
+  int32_t blank_id = meta_data.blank_id;
+  int32_t add_blank = meta_data.add_blank;
+
+  // Look the comma up with find(): at() would throw if tokens.txt lacks ','.
+  auto comma_iter = token2id.find(',');
+
+  // Worst case: bos + leading blank + (id, blank) per phoneme + comma + eos.
+  std::vector<int64_t> ans;
+  ans.reserve(add_blank ? phonemes.size() * 2 + 4 : phonemes.size() + 3);
+
+  if (use_eos_bos) {
+    ans.push_back(bos_id);
+  }
+
+  if (add_blank) {
+    ans.push_back(blank_id);
+  }
+
+  for (auto p : phonemes) {
+    auto iter = token2id.find(p);
+    if (iter == token2id.end()) {
+      SHERPA_ONNX_LOGE("Skip unknown phonemes. Unicode codepoint: \\U+%04x.",
+                       static_cast<uint32_t>(p));
+      continue;
+    }
+
+    ans.push_back(iter->second);
+    if (add_blank) {
+      // interleave a blank after every known phoneme
+      ans.push_back(blank_id);
+    }
+  }
+
+  // add a comma at the end of a sentence so that we can have a longer pause.
+  if (comma_iter != token2id.end()) {
+    ans.push_back(comma_iter->second);
+  } else {
+    SHERPA_ONNX_LOGE("',' is not in tokens.txt. Skip the trailing pause.");
+  }
+
+  if (use_eos_bos) {
+    ans.push_back(eos_id);
+  }
+
+  return ans;
+}
+
 void InitEspeak(const std::string &data_dir) {
  static std::once_flag init_flag;
  std::call_once(init_flag, [data_dir]() {
@@ -119,21 +185,23 @@ void InitEspeak(const std::string &data_dir) {
  });
 }

-PiperPhonemizeLexicon::PiperPhonemizeLexicon(const std::string &tokens,
-                                             const std::string &data_dir)
-    : data_dir_(data_dir) {
+PiperPhonemizeLexicon::PiperPhonemizeLexicon(
+    const std::string &tokens, const std::string &data_dir,
+    const OfflineTtsVitsModelMetaData &meta_data)
+    : meta_data_(meta_data) {
  {
    std::ifstream is(tokens);
    token2id_ = ReadTokens(is);
  }

-  InitEspeak(data_dir_);
+  InitEspeak(data_dir);
 }

 #if __ANDROID_API__ >= 9
-PiperPhonemizeLexicon::PiperPhonemizeLexicon(AAssetManager *mgr,
-                                             const std::string &tokens,
-                                             const std::string &data_dir) {
+PiperPhonemizeLexicon::PiperPhonemizeLexicon(
+    AAssetManager *mgr, const std::string &tokens, const std::string &data_dir,
+    const OfflineTtsVitsModelMetaData &meta_data)
+    : meta_data_(meta_data) {
  {
    auto buf = ReadFile(mgr, tokens);
    std::istrstream is(buf.data(), buf.size());
@@ -141,8 +209,9 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon(AAssetManager *mgr,
  }

  // We should copy the directory of espeak-ng-data from the asset to
-  // some internal or external storage and then pass the directory to data_dir.
-  InitEspeak(data_dir_);
+  // some internal or external storage and then pass the directory to
+  // data_dir.
+  InitEspeak(data_dir);
 }
 #endif

@@ -160,9 +229,21 @@ std::vector<std::vector<int64_t>> PiperPhonemizeLexicon::ConvertTextToTokenIds(
  std::vector<std::vector<int64_t>> ans;

  std::vector<int64_t> phoneme_ids;
-  for (const auto &p : phonemes) {
-    phoneme_ids = PhonemesToIds(token2id_, p);
-    ans.push_back(std::move(phoneme_ids));
+
+  if (meta_data_.is_piper) {
+    for (const auto &p : phonemes) {
+      phoneme_ids = PiperPhonemesToIds(token2id_, p);
+      ans.push_back(std::move(phoneme_ids));
+    }
+  } else if (meta_data_.is_coqui) {
+    for (const auto &p : phonemes) {
+      phoneme_ids = CoquiPhonemesToIds(token2id_, p, meta_data_);
+      ans.push_back(std::move(phoneme_ids));
+    }
+
+  } else {
+    SHERPA_ONNX_LOGE("Unsupported model");
+    exit(-1);
  }

  return ans;