Remove spaces after punctuations for TTS (#1666)
This commit is contained in:
@@ -6,6 +6,7 @@
|
||||
|
||||
#include <fstream>
|
||||
#include <regex> // NOLINT
|
||||
#include <unordered_set>
|
||||
#include <utility>
|
||||
|
||||
#include "cppjieba/Jieba.hpp"
|
||||
@@ -16,6 +17,14 @@
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
static bool IsPunct(const std::string &s) {
|
||||
static const std::unordered_set<std::string> puncts = {
|
||||
",", ".", "!", "?", ":", "\"", "'", ",",
|
||||
"。", "!", "?", "“", "”", "‘", "’",
|
||||
};
|
||||
return puncts.count(s);
|
||||
}
|
||||
|
||||
class JiebaLexicon::Impl {
|
||||
public:
|
||||
Impl(const std::string &lexicon, const std::string &tokens,
|
||||
@@ -67,8 +76,13 @@ class JiebaLexicon::Impl {
|
||||
jieba_->Cut(text, words, is_hmm);
|
||||
|
||||
if (debug_) {
|
||||
SHERPA_ONNX_LOGE("input text: %s", text.c_str());
|
||||
SHERPA_ONNX_LOGE("after replacing punctuations: %s", s.c_str());
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("input text:\n%{public}s", text.c_str());
|
||||
SHERPA_ONNX_LOGE("after replacing punctuations:\n%{public}s", s.c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("input text:\n%s", text.c_str());
|
||||
SHERPA_ONNX_LOGE("after replacing punctuations:\n%s", s.c_str());
|
||||
#endif
|
||||
|
||||
std::ostringstream os;
|
||||
std::string sep = "";
|
||||
@@ -77,7 +91,52 @@ class JiebaLexicon::Impl {
|
||||
sep = "_";
|
||||
}
|
||||
|
||||
SHERPA_ONNX_LOGE("after jieba processing: %s", os.str().c_str());
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("after jieba processing:\n%{public}s", os.str().c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("after jieba processing:\n%s", os.str().c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
// remove spaces after punctuations
|
||||
std::vector<std::string> words2 = std::move(words);
|
||||
words.reserve(words2.size());
|
||||
|
||||
for (int32_t i = 0; i < words2.size(); ++i) {
|
||||
if (i == 0) {
|
||||
words.push_back(std::move(words2[i]));
|
||||
} else if (words2[i] == " ") {
|
||||
if (words.back() == " " || IsPunct(words.back())) {
|
||||
continue;
|
||||
} else {
|
||||
words.push_back(std::move(words2[i]));
|
||||
}
|
||||
} else if (IsPunct(words2[i])) {
|
||||
if (words.back() == " " || IsPunct(words.back())) {
|
||||
continue;
|
||||
} else {
|
||||
words.push_back(std::move(words2[i]));
|
||||
}
|
||||
} else {
|
||||
words.push_back(std::move(words2[i]));
|
||||
}
|
||||
}
|
||||
|
||||
if (debug_) {
|
||||
std::ostringstream os;
|
||||
std::string sep = "";
|
||||
for (const auto &w : words) {
|
||||
os << sep << w;
|
||||
sep = "_";
|
||||
}
|
||||
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("after removing spaces after punctuations:\n%{public}s",
|
||||
os.str().c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("after removing spaces after punctuations:\n%s",
|
||||
os.str().c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
std::vector<TokenIDs> ans;
|
||||
@@ -86,7 +145,11 @@ class JiebaLexicon::Impl {
|
||||
for (const auto &w : words) {
|
||||
auto ids = ConvertWordToIds(w);
|
||||
if (ids.empty()) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("Ignore OOV '%{public}s'", w.c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("Ignore OOV '%s'", w.c_str());
|
||||
#endif
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -173,8 +236,15 @@ class JiebaLexicon::Impl {
|
||||
ToLowerCase(&word);
|
||||
|
||||
if (word2ids_.count(word)) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE(
|
||||
"Duplicated word: %{public}s at line %{public}d:%{public}s. Ignore "
|
||||
"it.",
|
||||
word.c_str(), line_num, line.c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("Duplicated word: %s at line %d:%s. Ignore it.",
|
||||
word.c_str(), line_num, line.c_str());
|
||||
#endif
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user