Use piper-phonemize to convert text to token IDs (#453)

This commit is contained in:
Fangjun Kuang
2023-11-30 23:57:43 +08:00
committed by GitHub
parent db41778e99
commit 62dc3c3e46
55 changed files with 1048 additions and 192 deletions

View File

@@ -129,8 +129,8 @@ Lexicon::Lexicon(AAssetManager *mgr, const std::string &lexicon,
}
#endif
std::vector<int64_t> Lexicon::ConvertTextToTokenIds(
const std::string &text) const {
std::vector<std::vector<int64_t>> Lexicon::ConvertTextToTokenIds(
const std::string &text, const std::string & /*voice*/ /*= ""*/) const {
switch (language_) {
case Language::kEnglish:
return ConvertTextToTokenIdsEnglish(text);
@@ -150,7 +150,7 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds(
return {};
}
std::vector<int64_t> Lexicon::ConvertTextToTokenIdsChinese(
std::vector<std::vector<int64_t>> Lexicon::ConvertTextToTokenIdsChinese(
const std::string &text) const {
std::vector<std::string> words;
if (pattern_) {
@@ -245,10 +245,10 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsChinese(
ans.push_back(eos);
}
return ans;
return {ans};
}
std::vector<int64_t> Lexicon::ConvertTextToTokenIdsEnglish(
std::vector<std::vector<int64_t>> Lexicon::ConvertTextToTokenIdsEnglish(
const std::string &_text) const {
std::string text(_text);
ToLowerCase(&text);
@@ -301,7 +301,7 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsEnglish(
ans.push_back(token2id_.at("$")); // eos
}
return ans;
return {ans};
}
void Lexicon::InitTokens(std::istream &is) { token2id_ = ReadTokens(is); }