Support Spanish in TTS (#396)

This commit is contained in:
Fangjun Kuang
2023-10-28 11:09:34 +08:00
committed by GitHub
parent 69e985f701
commit 64ab1ea9f8
3 changed files with 43 additions and 6 deletions

View File

@@ -131,6 +131,8 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds(
return ConvertTextToTokenIdsEnglish(text); return ConvertTextToTokenIdsEnglish(text);
case Language::kGerman: case Language::kGerman:
return ConvertTextToTokenIdsGerman(text); return ConvertTextToTokenIdsGerman(text);
case Language::kSpanish:
return ConvertTextToTokenIdsSpanish(text);
case Language::kChinese: case Language::kChinese:
return ConvertTextToTokenIdsChinese(text); return ConvertTextToTokenIdsChinese(text);
default: default:
@@ -250,6 +252,8 @@ void Lexicon::InitLanguage(const std::string &_lang) {
language_ = Language::kEnglish; language_ = Language::kEnglish;
} else if (lang == "german") { } else if (lang == "german") {
language_ = Language::kGerman; language_ = Language::kGerman;
} else if (lang == "spanish") {
language_ = Language::kSpanish;
} else if (lang == "chinese") { } else if (lang == "chinese") {
language_ = Language::kChinese; language_ = Language::kChinese;
} else { } else {

View File

@@ -41,6 +41,11 @@ class Lexicon {
return ConvertTextToTokenIdsEnglish(text); return ConvertTextToTokenIdsEnglish(text);
} }
std::vector<int64_t> ConvertTextToTokenIdsSpanish(
const std::string &text) const {
return ConvertTextToTokenIdsEnglish(text);
}
std::vector<int64_t> ConvertTextToTokenIdsEnglish( std::vector<int64_t> ConvertTextToTokenIdsEnglish(
const std::string &text) const; const std::string &text) const;
@@ -56,6 +61,7 @@ class Lexicon {
enum class Language { enum class Language {
kEnglish, kEnglish,
kGerman, kGerman,
kSpanish,
kChinese, kChinese,
kUnknown, kUnknown,
}; };

View File

@@ -164,7 +164,7 @@ template bool SplitStringToFloats(const std::string &full, const char *delim,
std::vector<double> *out); std::vector<double> *out);
// Returns true if `c` is classified as punctuation by std::ispunct, with the
// apostrophe (') explicitly excluded.
// NOTE(review): the apostrophe is presumably excluded so that contractions
// such as "don't" are not split -- confirm against the callers.
static bool IsPunct(char c) {
  // std::ispunct has undefined behavior when given a negative value other
  // than EOF. `char` may be signed, and bytes of multi-byte UTF-8 sequences
  // are >= 0x80, so convert through unsigned char before the call.
  return c != '\'' && std::ispunct(static_cast<unsigned char>(c)) != 0;
}
static bool IsGermanUmlauts(const std::string &words) { static bool IsGermanUmlauts(const std::string &word) {
// ä 0xC3 0xA4 // ä 0xC3 0xA4
// ö 0xC3 0xB6 // ö 0xC3 0xB6
// ü 0xC3 0xBC // ü 0xC3 0xBC
@@ -173,12 +173,12 @@ static bool IsGermanUmlauts(const std::string &words) {
// Ü 0xC3 0x9C // Ü 0xC3 0x9C
// ß 0xC3 0x9F // ß 0xC3 0x9F
if (words.size() != 2 || static_cast<uint8_t>(words[0]) != 0xc3) { if (word.size() != 2 || static_cast<uint8_t>(word[0]) != 0xc3) {
return false; return false;
} }
auto c = static_cast<uint8_t>(words[1]); auto c = static_cast<uint8_t>(word[1]);
if (c == 0xa4 || c == 0xb6 || c == 0xbC || c == 0x84 || c == 0x96 || if (c == 0xa4 || c == 0xb6 || c == 0xbc || c == 0x84 || c == 0x96 ||
c == 0x9c || c == 0x9f) { c == 0x9c || c == 0x9f) {
return true; return true;
} }
@@ -186,6 +186,33 @@ static bool IsGermanUmlauts(const std::string &words) {
return false; return false;
} }
// see https://www.tandem.net/blog/spanish-accents
//
// Returns true if `word` consists of exactly one Spanish accented letter
// (lowercase or uppercase) encoded in UTF-8, i.e., two bytes whose first
// byte is 0xC3. Returns false for anything else, including the empty string
// and strings of any other length.
static bool IsSpanishDiacritic(const std::string &word) {
  // lowercase
  //
  // á 0xC3 0xA1
  // é 0xC3 0xA9
  // í 0xC3 0xAD
  // ó 0xC3 0xB3
  // ú 0xC3 0xBA
  // ü 0xC3 0xBC
  // ñ 0xC3 0xB1
  //
  // uppercase
  //
  // Á 0xC3 0x81
  // É 0xC3 0x89
  // Í 0xC3 0x8D
  // Ó 0xC3 0x93
  // Ú 0xC3 0x9A
  // Ü 0xC3 0x9C
  // Ñ 0xC3 0x91
  if (word.size() != 2 || static_cast<uint8_t>(word[0]) != 0xc3) {
    return false;
  }

  // The uppercase forms are accepted as well so that a capitalized word is
  // not treated differently from its lowercase spelling.
  switch (static_cast<uint8_t>(word[1])) {
    case 0xa1:  // á
    case 0xa9:  // é
    case 0xad:  // í
    case 0xb3:  // ó
    case 0xba:  // ú
    case 0xbc:  // ü
    case 0xb1:  // ñ
    case 0x81:  // Á
    case 0x89:  // É
    case 0x8d:  // Í
    case 0x93:  // Ó
    case 0x9a:  // Ú
    case 0x9c:  // Ü
    case 0x91:  // Ñ
      return true;
    default:
      return false;
  }
}
// Returns true if `w` is accepted by IsGermanUmlauts() or by
// IsSpanishDiacritic(). The German check runs first; the Spanish check is
// consulted only when the German one rejects `w`.
static bool IsSpecial(const std::string &w) {
  if (IsGermanUmlauts(w)) {
    return true;
  }

  return IsSpanishDiacritic(w);
}
static std::vector<std::string> MergeCharactersIntoWords( static std::vector<std::string> MergeCharactersIntoWords(
const std::vector<std::string> &words) { const std::vector<std::string> &words) {
std::vector<std::string> ans; std::vector<std::string> ans;
@@ -196,7 +223,7 @@ static std::vector<std::string> MergeCharactersIntoWords(
while (i < n) { while (i < n) {
const auto &w = words[i]; const auto &w = words[i];
if (w.size() >= 3 || (w.size() == 2 && !IsGermanUmlauts(w)) || if (w.size() >= 3 || (w.size() == 2 && !IsSpecial(w)) ||
(w.size() == 1 && (IsPunct(w[0]) || std::isspace(w[0])))) { (w.size() == 1 && (IsPunct(w[0]) || std::isspace(w[0])))) {
if (prev != -1) { if (prev != -1) {
std::string t; std::string t;
@@ -215,7 +242,7 @@ static std::vector<std::string> MergeCharactersIntoWords(
} }
// e.g., öffnen // e.g., öffnen
if (w.size() == 1 || (w.size() == 2 && IsGermanUmlauts(w))) { if (w.size() == 1 || (w.size() == 2 && IsSpecial(w))) {
if (prev == -1) { if (prev == -1) {
prev = i; prev = i;
} }