Support French in TTS (#397)

This commit is contained in:
Fangjun Kuang
2023-10-28 22:22:00 +08:00
committed by GitHub
parent 64ab1ea9f8
commit 157628b257
4 changed files with 96 additions and 4 deletions

View File

@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.13 FATAL_ERROR) cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
project(sherpa-onnx) project(sherpa-onnx)
set(SHERPA_ONNX_VERSION "1.8.6") set(SHERPA_ONNX_VERSION "1.8.7")
# Disable warning about # Disable warning about
# #

View File

@@ -133,6 +133,8 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds(
return ConvertTextToTokenIdsGerman(text); return ConvertTextToTokenIdsGerman(text);
case Language::kSpanish: case Language::kSpanish:
return ConvertTextToTokenIdsSpanish(text); return ConvertTextToTokenIdsSpanish(text);
case Language::kFrench:
return ConvertTextToTokenIdsFrench(text);
case Language::kChinese: case Language::kChinese:
return ConvertTextToTokenIdsChinese(text); return ConvertTextToTokenIdsChinese(text);
default: default:
@@ -254,6 +256,8 @@ void Lexicon::InitLanguage(const std::string &_lang) {
language_ = Language::kGerman; language_ = Language::kGerman;
} else if (lang == "spanish") { } else if (lang == "spanish") {
language_ = Language::kSpanish; language_ = Language::kSpanish;
} else if (lang == "french") {
language_ = Language::kFrench;
} else if (lang == "chinese") { } else if (lang == "chinese") {
language_ = Language::kChinese; language_ = Language::kChinese;
} else { } else {

View File

@@ -46,6 +46,11 @@ class Lexicon {
return ConvertTextToTokenIdsEnglish(text); return ConvertTextToTokenIdsEnglish(text);
} }
// Converts French text to token IDs by delegating to
// ConvertTextToTokenIdsEnglish(); no French-specific processing happens here.
std::vector<int64_t> ConvertTextToTokenIdsFrench(
const std::string &text) const {
return ConvertTextToTokenIdsEnglish(text);
}
std::vector<int64_t> ConvertTextToTokenIdsEnglish( std::vector<int64_t> ConvertTextToTokenIdsEnglish(
const std::string &text) const; const std::string &text) const;
@@ -62,6 +67,7 @@ class Lexicon {
kEnglish, kEnglish,
kGerman, kGerman,
kSpanish, kSpanish,
kFrench,
kChinese, kChinese,
kUnknown, kUnknown,
}; };

View File

@@ -164,7 +164,7 @@ template bool SplitStringToFloats(const std::string &full, const char *delim,
std::vector<double> *out); std::vector<double> *out);
static bool IsPunct(char c) { return c != '\'' && std::ispunct(c); } static bool IsPunct(char c) { return c != '\'' && std::ispunct(c); }
static bool IsGermanUmlauts(const std::string &word) { static bool IsGermanUmlaut(const std::string &word) {
// ä 0xC3 0xA4 // ä 0xC3 0xA4
// ö 0xC3 0xB6 // ö 0xC3 0xB6
// ü 0xC3 0xBC // ü 0xC3 0xBC
@@ -187,6 +187,7 @@ static bool IsGermanUmlauts(const std::string &word) {
} }
// see https://www.tandem.net/blog/spanish-accents // see https://www.tandem.net/blog/spanish-accents
// https://www.compart.com/en/unicode/U+00DC
static bool IsSpanishDiacritic(const std::string &word) { static bool IsSpanishDiacritic(const std::string &word) {
// á 0xC3 0xA1 // á 0xC3 0xA1
// é 0xC3 0xA9 // é 0xC3 0xA9
@@ -195,6 +196,16 @@ static bool IsSpanishDiacritic(const std::string &word) {
// ú 0xC3 0xBA // ú 0xC3 0xBA
// ü 0xC3 0xBC // ü 0xC3 0xBC
// ñ 0xC3 0xB1 // ñ 0xC3 0xB1
//
// uppercase
//
// Á 0xC3 0x81
// É 0xC3 0x89
// Í 0xC3 0x8D
// Ó 0xC3 0x93
// Ú 0xC3 0x9A
// Ü 0xC3 0x9C
// Ñ 0xC3 0x91
if (word.size() != 2 || static_cast<uint8_t>(word[0]) != 0xc3) { if (word.size() != 2 || static_cast<uint8_t>(word[0]) != 0xc3) {
return false; return false;
@@ -202,15 +213,86 @@ static bool IsSpanishDiacritic(const std::string &word) {
auto c = static_cast<uint8_t>(word[1]); auto c = static_cast<uint8_t>(word[1]);
if (c == 0xa1 || c == 0xa9 || c == 0xad || c == 0xb3 || c == 0xba || if (c == 0xa1 || c == 0xa9 || c == 0xad || c == 0xb3 || c == 0xba ||
c == 0xbc || c == 0xb1) { c == 0xbc || c == 0xb1 || c == 0x81 || c == 0x89 || c == 0x8d ||
c == 0x93 || c == 0x9a || c == 0x9c || c == 0x91) {
return true; return true;
} }
return false; return false;
} }
// French accent marks, see https://www.busuu.com/en/french/accent-marks
//
// Returns true if and only if `word` is exactly the two-byte UTF-8 encoding
// of one of the accented letters enumerated in the switch below. Every one
// of them starts with the lead byte 0xC3, so only the second byte has to be
// inspected once the length and lead byte have been verified.
static bool IsFrenchDiacritic(const std::string &word) {
  const bool is_two_byte_c3_sequence =
      word.size() == 2 && static_cast<uint8_t>(word[0]) == 0xc3;
  if (!is_two_byte_c3_sequence) {
    return false;
  }

  switch (static_cast<uint8_t>(word[1])) {
    // ---- lowercase ----
    case 0xa9:  // é  acute accent
    case 0xa0:  // à  grave accent
    case 0xa8:  // è  grave accent
    case 0xb9:  // ù  grave accent
    case 0xa7:  // ç  cedilla
    case 0xa2:  // â  circumflex
    case 0xaa:  // ê  circumflex
    case 0xae:  // î  circumflex
    case 0xb4:  // ô  circumflex
    case 0xbb:  // û  circumflex
    case 0xab:  // ë  trema
    case 0xaf:  // ï  trema
    case 0xbc:  // ü  trema
    // ---- uppercase ----
    case 0x89:  // É
    case 0x80:  // À
    case 0x88:  // È
    case 0x99:  // Ù
    case 0x87:  // Ç
    case 0x82:  // Â
    case 0x8a:  // Ê
    case 0x8e:  // Î
    case 0x94:  // Ô
    case 0x9b:  // Û
    case 0x8b:  // Ë
    case 0x8f:  // Ï
    case 0x9c:  // Ü
      return true;
    default:
      return false;
  }
}
static bool IsSpecial(const std::string &w) { static bool IsSpecial(const std::string &w) {
return IsGermanUmlauts(w) || IsSpanishDiacritic(w); bool ans = IsGermanUmlaut(w) || IsSpanishDiacritic(w) || IsFrenchDiacritic(w);
// for french d’impossible
// ’ (U+2019, right single quotation mark) is 0xE2 0x80 0x99 in UTF-8
bool ans2 = false;
if (w.size() == 3) {
auto c0 = static_cast<uint8_t>(w[0]);
auto c1 = static_cast<uint8_t>(w[1]);
auto c2 = static_cast<uint8_t>(w[2]);
if (c0 == 0xe2 && c1 == 0x80 && c2 == 0x99) {
ans2 = true;
}
}
return ans || ans2;
} }
static std::vector<std::string> MergeCharactersIntoWords( static std::vector<std::string> MergeCharactersIntoWords(