Remove spaces after punctuations for TTS (#1666)
This commit is contained in:
@@ -6,6 +6,7 @@
|
|||||||
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <regex> // NOLINT
|
#include <regex> // NOLINT
|
||||||
|
#include <unordered_set>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
|
||||||
#include "cppjieba/Jieba.hpp"
|
#include "cppjieba/Jieba.hpp"
|
||||||
@@ -16,6 +17,14 @@
|
|||||||
|
|
||||||
namespace sherpa_onnx {
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
|
// Returns true when `s` is exactly one punctuation token.
//
// The whole string is compared against a fixed table; there is no
// per-character scan, so "" and multi-token strings such as ",," are not
// punctuation. The table holds both the ASCII marks and their full-width
// Chinese counterparts, because a token can carry either form.
//
// @param s A single token.
// @return true if `s` equals one of the entries in the table below.
static bool IsPunct(const std::string &s) {
  // Built once on first call and shared by all subsequent calls.
  static const std::unordered_set<std::string> puncts = {
      // ASCII punctuation
      ",", ".", "!", "?", ":", "\"", "'",
      // Full-width / CJK punctuation. These must be the real full-width code
      // points; ASCII look-alikes here would only duplicate the row above.
      "，", "。", "！", "？", "“", "”", "‘", "’",
  };

  return puncts.count(s) != 0;
}
|
||||||
|
|
||||||
class JiebaLexicon::Impl {
|
class JiebaLexicon::Impl {
|
||||||
public:
|
public:
|
||||||
Impl(const std::string &lexicon, const std::string &tokens,
|
Impl(const std::string &lexicon, const std::string &tokens,
|
||||||
@@ -67,8 +76,13 @@ class JiebaLexicon::Impl {
|
|||||||
jieba_->Cut(text, words, is_hmm);
|
jieba_->Cut(text, words, is_hmm);
|
||||||
|
|
||||||
if (debug_) {
|
if (debug_) {
|
||||||
SHERPA_ONNX_LOGE("input text: %s", text.c_str());
|
#if __OHOS__
|
||||||
SHERPA_ONNX_LOGE("after replacing punctuations: %s", s.c_str());
|
SHERPA_ONNX_LOGE("input text:\n%{public}s", text.c_str());
|
||||||
|
SHERPA_ONNX_LOGE("after replacing punctuations:\n%{public}s", s.c_str());
|
||||||
|
#else
|
||||||
|
SHERPA_ONNX_LOGE("input text:\n%s", text.c_str());
|
||||||
|
SHERPA_ONNX_LOGE("after replacing punctuations:\n%s", s.c_str());
|
||||||
|
#endif
|
||||||
|
|
||||||
std::ostringstream os;
|
std::ostringstream os;
|
||||||
std::string sep = "";
|
std::string sep = "";
|
||||||
@@ -77,7 +91,52 @@ class JiebaLexicon::Impl {
|
|||||||
sep = "_";
|
sep = "_";
|
||||||
}
|
}
|
||||||
|
|
||||||
SHERPA_ONNX_LOGE("after jieba processing: %s", os.str().c_str());
|
#if __OHOS__
|
||||||
|
SHERPA_ONNX_LOGE("after jieba processing:\n%{public}s", os.str().c_str());
|
||||||
|
#else
|
||||||
|
SHERPA_ONNX_LOGE("after jieba processing:\n%s", os.str().c_str());
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// remove spaces after punctuations
|
||||||
|
std::vector<std::string> words2 = std::move(words);
|
||||||
|
words.reserve(words2.size());
|
||||||
|
|
||||||
|
for (int32_t i = 0; i < words2.size(); ++i) {
|
||||||
|
if (i == 0) {
|
||||||
|
words.push_back(std::move(words2[i]));
|
||||||
|
} else if (words2[i] == " ") {
|
||||||
|
if (words.back() == " " || IsPunct(words.back())) {
|
||||||
|
continue;
|
||||||
|
} else {
|
||||||
|
words.push_back(std::move(words2[i]));
|
||||||
|
}
|
||||||
|
} else if (IsPunct(words2[i])) {
|
||||||
|
if (words.back() == " " || IsPunct(words.back())) {
|
||||||
|
continue;
|
||||||
|
} else {
|
||||||
|
words.push_back(std::move(words2[i]));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
words.push_back(std::move(words2[i]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (debug_) {
|
||||||
|
std::ostringstream os;
|
||||||
|
std::string sep = "";
|
||||||
|
for (const auto &w : words) {
|
||||||
|
os << sep << w;
|
||||||
|
sep = "_";
|
||||||
|
}
|
||||||
|
|
||||||
|
#if __OHOS__
|
||||||
|
SHERPA_ONNX_LOGE("after removing spaces after punctuations:\n%{public}s",
|
||||||
|
os.str().c_str());
|
||||||
|
#else
|
||||||
|
SHERPA_ONNX_LOGE("after removing spaces after punctuations:\n%s",
|
||||||
|
os.str().c_str());
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<TokenIDs> ans;
|
std::vector<TokenIDs> ans;
|
||||||
@@ -86,7 +145,11 @@ class JiebaLexicon::Impl {
|
|||||||
for (const auto &w : words) {
|
for (const auto &w : words) {
|
||||||
auto ids = ConvertWordToIds(w);
|
auto ids = ConvertWordToIds(w);
|
||||||
if (ids.empty()) {
|
if (ids.empty()) {
|
||||||
|
#if __OHOS__
|
||||||
|
SHERPA_ONNX_LOGE("Ignore OOV '%{public}s'", w.c_str());
|
||||||
|
#else
|
||||||
SHERPA_ONNX_LOGE("Ignore OOV '%s'", w.c_str());
|
SHERPA_ONNX_LOGE("Ignore OOV '%s'", w.c_str());
|
||||||
|
#endif
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -173,8 +236,15 @@ class JiebaLexicon::Impl {
|
|||||||
ToLowerCase(&word);
|
ToLowerCase(&word);
|
||||||
|
|
||||||
if (word2ids_.count(word)) {
|
if (word2ids_.count(word)) {
|
||||||
|
#if __OHOS__
|
||||||
|
SHERPA_ONNX_LOGE(
|
||||||
|
"Duplicated word: %{public}s at line %{public}d:%{public}s. Ignore "
|
||||||
|
"it.",
|
||||||
|
word.c_str(), line_num, line.c_str());
|
||||||
|
#else
|
||||||
SHERPA_ONNX_LOGE("Duplicated word: %s at line %d:%s. Ignore it.",
|
SHERPA_ONNX_LOGE("Duplicated word: %s at line %d:%s. Ignore it.",
|
||||||
word.c_str(), line_num, line.c_str());
|
word.c_str(), line_num, line.c_str());
|
||||||
|
#endif
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user