diff --git a/.github/workflows/test-build-wheel.yaml b/.github/workflows/test-build-wheel.yaml index b7e23ba3..ce18a3c5 100644 --- a/.github/workflows/test-build-wheel.yaml +++ b/.github/workflows/test-build-wheel.yaml @@ -35,18 +35,18 @@ jobs: matrix: # See https://github.com/actions/runner-images include: - - os: ubuntu-22.04 - python-version: "3.7" - - os: ubuntu-22.04 + - os: ubuntu-latest python-version: "3.8" - - os: ubuntu-22.04 + - os: ubuntu-latest python-version: "3.9" - - os: ubuntu-22.04 + - os: ubuntu-latest python-version: "3.10" - - os: ubuntu-22.04 + - os: ubuntu-latest python-version: "3.11" - - os: ubuntu-22.04 + - os: ubuntu-latest python-version: "3.12" + - os: ubuntu-latest + python-version: "3.13" - os: macos-13 python-version: "3.8" @@ -103,7 +103,7 @@ jobs: export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH" cmake --version - export SHERPA_ONNX_MAKE_ARGS="VERBOSE=1 -j" + export SHERPA_ONNX_MAKE_ARGS="VERBOSE=1 -j2" python3 setup.py bdist_wheel ls -lh dist diff --git a/README.md b/README.md index b98194ee..8136d51f 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,18 @@ ### Supported functions -|Speech recognition| Speech synthesis | Source separation | +|Speech recognition| [Speech synthesis][tts-url] | [Source separation][ss-url] | |------------------|------------------|-------------------| | ✔️ | ✔️ | ✔️ | -|Speaker identification| Speaker diarization | Speaker verification | +|Speaker identification| [Speaker diarization][sd-url] | Speaker verification | |----------------------|-------------------- |------------------------| | ✔️ | ✔️ | ✔️ | -| Spoken Language identification | Audio tagging | Voice activity detection | +| [Spoken Language identification][slid-url] | [Audio tagging][at-url] | [Voice activity detection][vad-url] | |--------------------------------|---------------|--------------------------| | ✔️ | ✔️ | ✔️ | -| Keyword spotting | Add punctuation | Speech enhancement | +| [Keyword spotting][kws-url] | [Add 
punctuation][punct-url] | [Speech enhancement][se-url] | |------------------|-----------------|--------------------| | ✔️ | ✔️ | ✔️ | @@ -501,3 +501,12 @@ It uses sherpa-onnx for speech-to-text and text-to-speech. [spleeter]: https://github.com/deezer/spleeter [UVR]: https://github.com/Anjok07/ultimatevocalremovergui [gtcrn]: https://github.com/Xiaobin-Rong/gtcrn +[tts-url]: https://k2-fsa.github.io/sherpa/onnx/tts/all-in-one.html +[ss-url]: https://k2-fsa.github.io/sherpa/onnx/source-separation/index.html +[sd-url]: https://k2-fsa.github.io/sherpa/onnx/speaker-diarization/index.html +[slid-url]: https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/index.html +[at-url]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/index.html +[vad-url]: https://k2-fsa.github.io/sherpa/onnx/vad/index.html +[kws-url]: https://k2-fsa.github.io/sherpa/onnx/kws/index.html +[punct-url]: https://k2-fsa.github.io/sherpa/onnx/punctuation/index.html +[se-url]: https://k2-fsa.github.io/sherpa/onnx/speech-enhancment/index.html diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart index f33891ee..71921a8d 100644 --- a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart +++ b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart @@ -201,6 +201,7 @@ final class SherpaOnnxOfflineTtsKokoroModelConfig extends Struct { external double lengthScale; external Pointer dictDir; external Pointer lexicon; + external Pointer lang; } final class SherpaOnnxOfflineTtsModelConfig extends Struct { diff --git a/flutter/sherpa_onnx/lib/src/tts.dart b/flutter/sherpa_onnx/lib/src/tts.dart index d3099944..c117eb91 100644 --- a/flutter/sherpa_onnx/lib/src/tts.dart +++ b/flutter/sherpa_onnx/lib/src/tts.dart @@ -117,6 +117,7 @@ class OfflineTtsKokoroModelConfig { this.lengthScale = 1.0, this.dictDir = '', this.lexicon = '', + this.lang = '', }); factory OfflineTtsKokoroModelConfig.fromJson(Map json) { @@ -128,12 +129,13 @@ class 
OfflineTtsKokoroModelConfig { lengthScale: (json['lengthScale'] as num?)?.toDouble() ?? 1.0, dictDir: json['dictDir'] as String? ?? '', lexicon: json['lexicon'] as String? ?? '', + lang: json['lang'] as String? ?? '', ); } @override String toString() { - return 'OfflineTtsKokoroModelConfig(model: $model, voices: $voices, tokens: $tokens, dataDir: $dataDir, lengthScale: $lengthScale, dictDir: $dictDir, lexicon: $lexicon)'; + return 'OfflineTtsKokoroModelConfig(model: $model, voices: $voices, tokens: $tokens, dataDir: $dataDir, lengthScale: $lengthScale, dictDir: $dictDir, lexicon: $lexicon, lang: $lang)'; } Map toJson() => { @@ -144,6 +146,7 @@ class OfflineTtsKokoroModelConfig { 'lengthScale': lengthScale, 'dictDir': dictDir, 'lexicon': lexicon, + 'lang': lang, }; final String model; @@ -153,6 +156,7 @@ class OfflineTtsKokoroModelConfig { final double lengthScale; final String dictDir; final String lexicon; + final String lang; } class OfflineTtsModelConfig { @@ -286,6 +290,7 @@ class OfflineTts { c.ref.model.kokoro.lengthScale = config.model.kokoro.lengthScale; c.ref.model.kokoro.dictDir = config.model.kokoro.dictDir.toNativeUtf8(); c.ref.model.kokoro.lexicon = config.model.kokoro.lexicon.toNativeUtf8(); + c.ref.model.kokoro.lang = config.model.kokoro.lang.toNativeUtf8(); c.ref.model.numThreads = config.model.numThreads; c.ref.model.debug = config.model.debug ? 
1 : 0; @@ -302,6 +307,7 @@ class OfflineTts { calloc.free(c.ref.ruleFsts); calloc.free(c.ref.model.provider); + calloc.free(c.ref.model.kokoro.lang); calloc.free(c.ref.model.kokoro.lexicon); calloc.free(c.ref.model.kokoro.dictDir); calloc.free(c.ref.model.kokoro.dataDir); diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-tts.cc b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-tts.cc index 62b0422d..5ceebaae 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-tts.cc +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-tts.cc @@ -70,6 +70,7 @@ static SherpaOnnxOfflineTtsKokoroModelConfig GetOfflineTtsKokoroModelConfig( SHERPA_ONNX_ASSIGN_ATTR_FLOAT(length_scale, lengthScale); SHERPA_ONNX_ASSIGN_ATTR_STR(dict_dir, dictDir); SHERPA_ONNX_ASSIGN_ATTR_STR(lexicon, lexicon); + SHERPA_ONNX_ASSIGN_ATTR_STR(lang, lang); return c; } @@ -177,6 +178,7 @@ static Napi::External CreateOfflineTtsWrapper( SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.data_dir); SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.dict_dir); SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.lexicon); + SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.lang); SHERPA_ONNX_DELETE_C_STR(c.model.provider); diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingTts.ets b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingTts.ets index 48e2e3a9..1c3c91b1 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingTts.ets +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingTts.ets @@ -36,6 +36,7 @@ export class OfflineTtsKokoroModelConfig { public lengthScale: number = 1.0; public dictDir: string = ''; public lexicon: string = ''; + public lang: string = ''; } export class OfflineTtsModelConfig { diff --git a/scripts/dotnet/OfflineTtsKokoroModelConfig.cs b/scripts/dotnet/OfflineTtsKokoroModelConfig.cs index 2ac40242..1a76288a 100644 --- 
a/scripts/dotnet/OfflineTtsKokoroModelConfig.cs +++ b/scripts/dotnet/OfflineTtsKokoroModelConfig.cs @@ -18,6 +18,7 @@ namespace SherpaOnnx DictDir = ""; Lexicon = ""; + Lang = ""; } [MarshalAs(UnmanagedType.LPStr)] public string Model; @@ -38,5 +39,8 @@ namespace SherpaOnnx [MarshalAs(UnmanagedType.LPStr)] public string Lexicon; + + [MarshalAs(UnmanagedType.LPStr)] + public string Lang; } } diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go index e1d50a81..01dcaab5 100644 --- a/scripts/go/sherpa_onnx.go +++ b/scripts/go/sherpa_onnx.go @@ -857,6 +857,7 @@ type OfflineTtsKokoroModelConfig struct { DataDir string // Path to espeak-ng-data directory DictDir string // Path to dict directory Lexicon string // Path to lexicon files + Lang string // Example: es for Spanish, fr-fr for French. Can be empty LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed } @@ -1006,6 +1007,9 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts { c.model.kokoro.lexicon = C.CString(config.Model.Kokoro.Lexicon) defer C.free(unsafe.Pointer(c.model.kokoro.lexicon)) + c.model.kokoro.lang = C.CString(config.Model.Kokoro.Lang) + defer C.free(unsafe.Pointer(c.model.kokoro.lang)) + c.model.kokoro.length_scale = C.float(config.Model.Kokoro.LengthScale) c.model.num_threads = C.int(config.Model.NumThreads) diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index 2cfa4ac1..d442ce38 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -1164,6 +1164,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( SHERPA_ONNX_OR(config->model.kokoro.dict_dir, ""); tts_config.model.kokoro.lexicon = SHERPA_ONNX_OR(config->model.kokoro.lexicon, ""); + tts_config.model.kokoro.lang = SHERPA_ONNX_OR(config->model.kokoro.lang, ""); tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1); tts_config.model.debug = config->model.debug; diff --git a/sherpa-onnx/c-api/c-api.h 
b/sherpa-onnx/c-api/c-api.h index 2134d2b3..920d0e9b 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -958,6 +958,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsKokoroModelConfig { float length_scale; // < 1, faster in speech speed; > 1, slower in speed const char *dict_dir; const char *lexicon; + const char *lang; } SherpaOnnxOfflineTtsKokoroModelConfig; SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig { diff --git a/sherpa-onnx/c-api/cxx-api.cc b/sherpa-onnx/c-api/cxx-api.cc index 9e84f3b4..e6278aa3 100644 --- a/sherpa-onnx/c-api/cxx-api.cc +++ b/sherpa-onnx/c-api/cxx-api.cc @@ -366,6 +366,7 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) { c.model.kokoro.length_scale = config.model.kokoro.length_scale; c.model.kokoro.dict_dir = config.model.kokoro.dict_dir.c_str(); c.model.kokoro.lexicon = config.model.kokoro.lexicon.c_str(); + c.model.kokoro.lang = config.model.kokoro.lang.c_str(); c.model.num_threads = config.model.num_threads; c.model.debug = config.model.debug; diff --git a/sherpa-onnx/c-api/cxx-api.h b/sherpa-onnx/c-api/cxx-api.h index 28ea4ee2..7fdf0b8e 100644 --- a/sherpa-onnx/c-api/cxx-api.h +++ b/sherpa-onnx/c-api/cxx-api.h @@ -367,6 +367,7 @@ struct OfflineTtsKokoroModelConfig { std::string data_dir; std::string dict_dir; std::string lexicon; + std::string lang; float length_scale = 1.0; // < 1, faster in speed; > 1, slower in speed }; diff --git a/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc b/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc index 707e68ac..fa4129c1 100644 --- a/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc +++ b/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc @@ -67,7 +67,8 @@ class KokoroMultiLangLexicon::Impl { InitEspeak(data_dir); // See ./piper-phonemize-lexicon.cc } - std::vector ConvertTextToTokenIds(const std::string &_text) const { + std::vector ConvertTextToTokenIds(const std::string &_text, + const std::string &voice) const { std::string text = 
ToLowerCase(_text); if (debug_) { SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str()); @@ -124,7 +125,7 @@ class KokoroMultiLangLexicon::Impl { SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str()); } - ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice); + ids_vec = ConvertNonChineseToTokenIDs(ms, voice); } for (const auto &ids : ids_vec) { @@ -255,8 +256,30 @@ class KokoroMultiLangLexicon::Impl { return ans; } - std::vector> ConvertEnglishToTokenIDs( + std::vector> ConvertTextToTokenIDsWithEspeak( const std::string &text, const std::string &voice) const { + auto temp = ConvertTextToTokenIdsKokoro( + phoneme2id_, meta_data_.max_token_len, text, voice); + std::vector> ans; + ans.reserve(temp.size()); + + for (const auto &i : temp) { + ans.emplace_back(i.tokens.begin(), i.tokens.end()); + } + + return ans; + } + + std::vector> ConvertNonChineseToTokenIDs( + const std::string &text, const std::string &voice) const { + if (!voice.empty()) { + return ConvertTextToTokenIDsWithEspeak(text, voice); + } + + // If voice is empty, we split the text into words and use the lexicon + // to lookup the pronunciation of each word, fallback to espeak if + // a word is not in the lexicon. 
+ std::vector words = SplitUtf8(text); if (debug_) { std::ostringstream os; @@ -317,7 +340,7 @@ class KokoroMultiLangLexicon::Impl { piper::eSpeakPhonemeConfig config; - config.voice = voice; + config.voice = meta_data_.voice; std::vector> phonemes; @@ -391,9 +414,28 @@ class KokoroMultiLangLexicon::Impl { void InitTokens(std::istream &is) { token2id_ = ReadTokens(is); // defined in ./symbol-table.cc + + std::wstring_convert, char32_t> conv; + std::u32string s; + for (const auto &p : token2id_) { + s = conv.from_bytes(p.first); + + if (s.size() != 1) { + SHERPA_ONNX_LOGE("Error for token %s with id %d", p.first.c_str(), + p.second); + SHERPA_ONNX_EXIT(-1); + } + + char32_t c = s[0]; + phoneme2id_.insert({c, p.second}); + } } void InitLexicon(const std::string &lexicon) { + if (lexicon.empty()) { + return; + } + std::vector files; SplitStringToVector(lexicon, ",", false, &files); for (const auto &f : files) { @@ -404,6 +446,10 @@ class KokoroMultiLangLexicon::Impl { template void InitLexicon(Manager *mgr, const std::string &lexicon) { + if (lexicon.empty()) { + return; + } + std::vector files; SplitStringToVector(lexicon, ",", false, &files); for (const auto &f : files) { @@ -445,7 +491,7 @@ class KokoroMultiLangLexicon::Impl { std::vector ids = ConvertTokensToIds(token2id_, token_list); - if (ids.empty()) { + if (ids.empty() && word != "呣") { SHERPA_ONNX_LOGE( "Invalid pronunciation for word '%s' at line %d:%s. 
Ignore it", word.c_str(), line_num, line.c_str()); @@ -465,6 +511,8 @@ class KokoroMultiLangLexicon::Impl { // tokens.txt is saved in token2id_ std::unordered_map token2id_; + std::unordered_map phoneme2id_; + std::unique_ptr jieba_; bool debug_ = false; }; @@ -487,8 +535,8 @@ KokoroMultiLangLexicon::KokoroMultiLangLexicon( meta_data, debug)) {} std::vector KokoroMultiLangLexicon::ConvertTextToTokenIds( - const std::string &text, const std::string & /*unused_voice = ""*/) const { - return impl_->ConvertTextToTokenIds(text); + const std::string &text, const std::string &voice /*= ""*/) const { + return impl_->ConvertTextToTokenIds(text, voice); } #if __ANDROID_API__ >= 9 diff --git a/sherpa-onnx/csrc/offline-speech-denoiser-model-config.h b/sherpa-onnx/csrc/offline-speech-denoiser-model-config.h index 0c15e660..16cbcbd6 100644 --- a/sherpa-onnx/csrc/offline-speech-denoiser-model-config.h +++ b/sherpa-onnx/csrc/offline-speech-denoiser-model-config.h @@ -20,9 +20,9 @@ struct OfflineSpeechDenoiserModelConfig { OfflineSpeechDenoiserModelConfig() = default; - OfflineSpeechDenoiserModelConfig(OfflineSpeechDenoiserGtcrnModelConfig gtcrn, - int32_t num_threads, bool debug, - const std::string &provider) + OfflineSpeechDenoiserModelConfig( + const OfflineSpeechDenoiserGtcrnModelConfig &gtcrn, int32_t num_threads, + bool debug, const std::string &provider) : gtcrn(gtcrn), num_threads(num_threads), debug(debug), diff --git a/sherpa-onnx/csrc/offline-tts-frontend.h b/sherpa-onnx/csrc/offline-tts-frontend.h index 43c4501c..218bfeac 100644 --- a/sherpa-onnx/csrc/offline-tts-frontend.h +++ b/sherpa-onnx/csrc/offline-tts-frontend.h @@ -6,6 +6,7 @@ #define SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_ #include #include +#include #include #include @@ -57,6 +58,12 @@ class OfflineTtsFrontend { // implementation is in ./piper-phonemize-lexicon.cc void InitEspeak(const std::string &data_dir); +// implementation in ./piper-phonemize-lexicon.cc +std::vector ConvertTextToTokenIdsKokoro( + const 
std::unordered_map &token2id, + int32_t max_token_len, const std::string &text, + const std::string &voice = ""); + } // namespace sherpa_onnx #endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_ diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-impl.h b/sherpa-onnx/csrc/offline-tts-kokoro-impl.h index 276e9423..f88a1739 100644 --- a/sherpa-onnx/csrc/offline-tts-kokoro-impl.h +++ b/sherpa-onnx/csrc/offline-tts-kokoro-impl.h @@ -220,8 +220,9 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl { } } - std::vector token_ids = - frontend_->ConvertTextToTokenIds(text, meta_data.voice); + std::vector token_ids = frontend_->ConvertTextToTokenIds( + text, config_.model.kokoro.lang.empty() ? meta_data.voice + : config_.model.kokoro.lang); if (token_ids.empty() || (token_ids.size() == 1 && token_ids[0].tokens.empty())) { @@ -335,12 +336,14 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl { if (meta_data.version >= 2) { // this is a multi-lingual model, we require that you pass lexicon // and dict_dir - if (config_.model.kokoro.lexicon.empty() || + if ((config_.model.kokoro.lexicon.empty() && + config_.model.kokoro.lang.empty()) || config_.model.kokoro.dict_dir.empty()) { SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version); SHERPA_ONNX_LOGE( "You are using a multi-lingual Kokoro model (e.g., Kokoro >= " - "v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir"); + "v1.0). 
Please pass --kokoro-lexicon and --kokoro-dict-dir or " + "provide --kokoro-lang and --kokoro-dict-dir"); SHERPA_ONNX_EXIT(-1); } @@ -362,7 +365,8 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl { if (meta_data.version >= 2) { // this is a multi-lingual model, we require that you pass lexicon // and dict_dir - if (config_.model.kokoro.lexicon.empty() || + if ((config_.model.kokoro.lexicon.empty() && + config_.model.kokoro.lang.empty()) || config_.model.kokoro.dict_dir.empty()) { SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version); SHERPA_ONNX_LOGE( diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc b/sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc index 59645060..95b9c4ff 100644 --- a/sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc +++ b/sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc @@ -18,6 +18,13 @@ void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) { "Path to voices.bin for Kokoro models"); po->Register("kokoro-tokens", &tokens, "Path to tokens.txt for Kokoro models"); + po->Register("kokoro-lang", &lang, + "Used only by kokoro >= 1.0. Example values: " + "en (English), " + "es (Spanish), fr (French), hi (Hindi), it (Italian), " + "pt-br (Brazilian Portuguese). " + "You can leave it empty, in which case you need to provide " + "--kokoro-lexicon."); po->Register( "kokoro-lexicon", &lexicon, "Path to lexicon.txt for Kokoro models. 
Used only for Kokoro >= v1.0" @@ -127,7 +134,8 @@ std::string OfflineTtsKokoroModelConfig::ToString() const { os << "lexicon=\"" << lexicon << "\", "; os << "data_dir=\"" << data_dir << "\", "; os << "dict_dir=\"" << dict_dir << "\", "; - os << "length_scale=" << length_scale << ")"; + os << "length_scale=" << length_scale << ", "; + os << "lang=\"" << lang << "\")"; return os.str(); } diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-model-config.h b/sherpa-onnx/csrc/offline-tts-kokoro-model-config.h index fae17927..573b283f 100644 --- a/sherpa-onnx/csrc/offline-tts-kokoro-model-config.h +++ b/sherpa-onnx/csrc/offline-tts-kokoro-model-config.h @@ -27,6 +27,13 @@ struct OfflineTtsKokoroModelConfig { // speed = 1 / length_scale float length_scale = 1.0; + // Used only for Kokoro >= 1.0. + // + // If it is not empty, meta_data.voice is ignored. + // Example values: es (Spanish), fr (French), pt (Portuguese) + // See https://hf-mirror.com/hexgrad/Kokoro-82M/blob/main/VOICES.md + std::string lang; + OfflineTtsKokoroModelConfig() = default; OfflineTtsKokoroModelConfig(const std::string &model, @@ -34,14 +41,16 @@ struct OfflineTtsKokoroModelConfig { const std::string &tokens, const std::string &lexicon, const std::string &data_dir, - const std::string &dict_dir, float length_scale) + const std::string &dict_dir, float length_scale, + const std::string &lang) : model(model), voices(voices), tokens(tokens), lexicon(lexicon), data_dir(data_dir), dict_dir(dict_dir), - length_scale(length_scale) {} + length_scale(length_scale), + lang(lang) {} void Register(ParseOptions *po); bool Validate() const; diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc index c7890a93..3624d998 100644 --- a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc @@ -351,7 +351,8 @@ std::vector PiperPhonemizeLexicon::ConvertTextToTokenIds( if (is_matcha_) { return ConvertTextToTokenIdsMatcha(text, voice); } 
else if (is_kokoro_) { - return ConvertTextToTokenIdsKokoro(text, voice); + return ConvertTextToTokenIdsKokoro( + token2id_, kokoro_meta_data_.max_token_len, text, voice); } else { return ConvertTextToTokenIdsVits(text, voice); } @@ -382,8 +383,10 @@ std::vector PiperPhonemizeLexicon::ConvertTextToTokenIdsMatcha( return ans; } -std::vector PiperPhonemizeLexicon::ConvertTextToTokenIdsKokoro( - const std::string &text, const std::string &voice /*= ""*/) const { +std::vector ConvertTextToTokenIdsKokoro( + const std::unordered_map &token2id, + int32_t max_token_len, const std::string &text, + const std::string &voice /*= ""*/) { piper::eSpeakPhonemeConfig config; // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices @@ -397,8 +400,7 @@ std::vector PiperPhonemizeLexicon::ConvertTextToTokenIdsKokoro( std::vector ans; for (const auto &p : phonemes) { - auto phoneme_ids = - PiperPhonemesToIdsKokoro(token2id_, p, kokoro_meta_data_.max_token_len); + auto phoneme_ids = PiperPhonemesToIdsKokoro(token2id, p, max_token_len); for (auto &ids : phoneme_ids) { ans.emplace_back(std::move(ids)); diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.h b/sherpa-onnx/csrc/piper-phonemize-lexicon.h index bb8c6e30..8738ac4b 100644 --- a/sherpa-onnx/csrc/piper-phonemize-lexicon.h +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.h @@ -52,9 +52,6 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend { std::vector ConvertTextToTokenIdsMatcha( const std::string &text, const std::string &voice = "") const; - std::vector ConvertTextToTokenIdsKokoro( - const std::string &text, const std::string &voice = "") const; - private: // map unicode codepoint to an integer ID std::unordered_map token2id_; diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsKokoroModelConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsKokoroModelConfig.java index 67fbf5ea..97363510 100644 --- 
a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsKokoroModelConfig.java +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsKokoroModelConfig.java @@ -6,6 +6,7 @@ public class OfflineTtsKokoroModelConfig { private final String voices; private final String tokens; private final String lexicon; + private final String lang; private final String dataDir; private final String dictDir; private final float lengthScale; @@ -15,6 +16,7 @@ public class OfflineTtsKokoroModelConfig { this.voices = builder.voices; this.tokens = builder.tokens; this.lexicon = builder.lexicon; + this.lang = builder.lang; this.dataDir = builder.dataDir; this.dictDir = builder.dictDir; this.lengthScale = builder.lengthScale; @@ -50,6 +52,7 @@ public class OfflineTtsKokoroModelConfig { private String voices = ""; private String tokens = ""; private String lexicon = ""; + private String lang = ""; private String dataDir = ""; private String dictDir = ""; private float lengthScale = 1.0f; @@ -78,6 +81,11 @@ public class OfflineTtsKokoroModelConfig { return this; } + public Builder setLang(String lang) { + this.lang = lang; + return this; + } + public Builder setDataDir(String dataDir) { this.dataDir = dataDir; return this; diff --git a/sherpa-onnx/jni/offline-tts.cc b/sherpa-onnx/jni/offline-tts.cc index be0a2634..0e35fd25 100644 --- a/sherpa-onnx/jni/offline-tts.cc +++ b/sherpa-onnx/jni/offline-tts.cc @@ -145,6 +145,12 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { ans.model.kokoro.lexicon = p; env->ReleaseStringUTFChars(s, p); + fid = env->GetFieldID(kokoro_cls, "lang", "Ljava/lang/String;"); + s = (jstring)env->GetObjectField(kokoro, fid); + p = env->GetStringUTFChars(s, nullptr); + ans.model.kokoro.lang = p; + env->ReleaseStringUTFChars(s, p); + fid = env->GetFieldID(kokoro_cls, "dataDir", "Ljava/lang/String;"); s = (jstring)env->GetObjectField(kokoro, fid); p = env->GetStringUTFChars(s, nullptr); diff --git a/sherpa-onnx/kotlin-api/Tts.kt 
b/sherpa-onnx/kotlin-api/Tts.kt index b4e07984..5f977a4e 100644 --- a/sherpa-onnx/kotlin-api/Tts.kt +++ b/sherpa-onnx/kotlin-api/Tts.kt @@ -31,6 +31,7 @@ data class OfflineTtsKokoroModelConfig( var tokens: String = "", var dataDir: String = "", var lexicon: String = "", + var lang: String = "", var dictDir: String = "", var lengthScale: Float = 1.0f, ) diff --git a/sherpa-onnx/pascal-api/sherpa_onnx.pas b/sherpa-onnx/pascal-api/sherpa_onnx.pas index ea4dcff0..652ea539 100644 --- a/sherpa-onnx/pascal-api/sherpa_onnx.pas +++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas @@ -84,6 +84,7 @@ type LengthScale: Single; DictDir: AnsiString; Lexicon: AnsiString; + Lang: AnsiString; function ToString: AnsiString; class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig); @@ -841,6 +842,7 @@ type LengthScale: cfloat; DictDir: PAnsiChar; Lexicon: PAnsiChar; + Lang: PAnsiChar; end; SherpaOnnxOfflineTtsModelConfig = record @@ -2096,10 +2098,11 @@ begin 'DataDir := %s, ' + 'LengthScale := %.2f, ' + 'DictDir := %s, ' + - 'Lexicon := %s' + + 'Lexicon := %s, ' + + 'Lang := %s' + ')', [Self.Model, Self.Voices, Self.Tokens, Self.DataDir, Self.LengthScale, - Self.DictDir, Self.Lexicon]); + Self.DictDir, Self.Lexicon, Self.Lang]); end; class operator TSherpaOnnxOfflineTtsKokoroModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig); @@ -2180,6 +2183,7 @@ begin C.Model.Kokoro.LengthScale := Config.Model.Kokoro.LengthScale; C.Model.Kokoro.DictDir := PAnsiChar(Config.Model.Kokoro.DictDir); C.Model.Kokoro.Lexicon := PAnsiChar(Config.Model.Kokoro.Lexicon); + C.Model.Kokoro.Lang := PAnsiChar(Config.Model.Kokoro.Lang); C.Model.NumThreads := Config.Model.NumThreads; C.Model.Provider := PAnsiChar(Config.Model.Provider); diff --git a/sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc b/sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc index d9a00ca4..620e82c4 100644 --- 
a/sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc +++ b/sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc @@ -17,10 +17,12 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) { .def(py::init<>()) .def(py::init(), + const std::string &, const std::string &, float, + const std::string &>(), py::arg("model"), py::arg("voices"), py::arg("tokens"), py::arg("lexicon") = "", py::arg("data_dir"), - py::arg("dict_dir") = "", py::arg("length_scale") = 1.0) + py::arg("dict_dir") = "", py::arg("length_scale") = 1.0, + py::arg("lang") = "") .def_readwrite("model", &PyClass::model) .def_readwrite("voices", &PyClass::voices) .def_readwrite("tokens", &PyClass::tokens) @@ -28,6 +30,7 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) { .def_readwrite("data_dir", &PyClass::data_dir) .def_readwrite("dict_dir", &PyClass::dict_dir) .def_readwrite("length_scale", &PyClass::length_scale) + .def_readwrite("lang", &PyClass::lang) .def("__str__", &PyClass::ToString) .def("validate", &PyClass::Validate); } diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift index b2cef56b..148876a8 100644 --- a/swift-api-examples/SherpaOnnx.swift +++ b/swift-api-examples/SherpaOnnx.swift @@ -806,7 +806,8 @@ func sherpaOnnxOfflineTtsKokoroModelConfig( dataDir: String = "", lengthScale: Float = 1.0, dictDir: String = "", - lexicon: String = "" + lexicon: String = "", + lang: String = "" ) -> SherpaOnnxOfflineTtsKokoroModelConfig { return SherpaOnnxOfflineTtsKokoroModelConfig( model: toCPointer(model), @@ -815,7 +816,8 @@ func sherpaOnnxOfflineTtsKokoroModelConfig( data_dir: toCPointer(dataDir), length_scale: lengthScale, dict_dir: toCPointer(dictDir), - lexicon: toCPointer(lexicon) + lexicon: toCPointer(lexicon), + lang: toCPointer(lang) ) } diff --git a/wasm/tts/sherpa-onnx-tts.js b/wasm/tts/sherpa-onnx-tts.js index 1532ae89..be7188f4 100644 --- a/wasm/tts/sherpa-onnx-tts.js +++ b/wasm/tts/sherpa-onnx-tts.js @@ -143,13 +143,14 @@ function 
initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) { const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1; const dictDirLen = Module.lengthBytesUTF8(config.dictDir || '') + 1; const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1; + const langLen = Module.lengthBytesUTF8(config.lang || '') + 1; - const n = - modelLen + voicesLen + tokensLen + dataDirLen + dictDirLen + lexiconLen; + const n = modelLen + voicesLen + tokensLen + dataDirLen + dictDirLen + + lexiconLen + langLen; const buffer = Module._malloc(n); - const len = 7 * 4; + const len = 8 * 4; const ptr = Module._malloc(len); let offset = 0; @@ -171,6 +172,9 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) { Module.stringToUTF8(config.lexicon || '', buffer + offset, lexiconLen); offset += lexiconLen; + Module.stringToUTF8(config.lang || '', buffer + offset, langLen); + offset += langLen; + offset = 0; Module.setValue(ptr, buffer + offset, 'i8*'); offset += modelLen; @@ -192,6 +196,9 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) { Module.setValue(ptr + 24, buffer + offset, 'i8*'); offset += lexiconLen; + Module.setValue(ptr + 28, buffer + offset, 'i8*'); + offset += langLen; + return { buffer: buffer, ptr: ptr, len: len, } @@ -233,6 +240,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { dataDir: '', dictDir: '', lexicon: '', + lang: '', }; } diff --git a/wasm/tts/sherpa-onnx-wasm-main-tts.cc b/wasm/tts/sherpa-onnx-wasm-main-tts.cc index 44c45842..88f75c80 100644 --- a/wasm/tts/sherpa-onnx-wasm-main-tts.cc +++ b/wasm/tts/sherpa-onnx-wasm-main-tts.cc @@ -15,7 +15,7 @@ extern "C" { static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, ""); -static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 7 * 4, ""); +static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 8 * 4, ""); 
static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) +