Add C++ and Python API for Kokoro 1.0 multilingual TTS model (#1795)

This commit is contained in:
Fangjun Kuang
2025-02-06 22:57:13 +08:00
committed by GitHub
parent 08cefe8488
commit c84a833863
20 changed files with 819 additions and 39 deletions

View File

@@ -153,6 +153,7 @@ if(SHERPA_ONNX_ENABLE_TTS)
list(APPEND sources
hifigan-vocoder.cc
jieba-lexicon.cc
kokoro-multi-lang-lexicon.cc
lexicon.cc
melo-tts-lexicon.cc
offline-tts-character-frontend.cc

View File

@@ -0,0 +1,522 @@
// sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
//
// Copyright (c) 2025 Xiaomi Corporation
#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h"
#include <codecvt>
#include <fstream>
#include <locale>
#include <regex> // NOLINT
#include <sstream>
#include <strstream>
#include <unordered_map>
#include <utility>
#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif
#if __OHOS__
#include "rawfile/raw_file_manager.h"
#endif
#include "cppjieba/Jieba.hpp"
#include "espeak-ng/speak_lib.h"
#include "phoneme_ids.hpp"
#include "phonemize.hpp"
#include "sherpa-onnx/csrc/file-utils.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
#include "sherpa-onnx/csrc/symbol-table.h"
#include "sherpa-onnx/csrc/text-utils.h"
namespace sherpa_onnx {
// Phonemizes `text` with espeak-ng and writes the result to `*phonemes`.
// Only declared here; the definition lives in ./piper-phonemize-lexicon.cc.
// In this file it is the fallback for English words missing from the lexicon.
void CallPhonemizeEspeak(const std::string &text,
piper::eSpeakPhonemeConfig &config, // NOLINT
std::vector<std::vector<piper::Phoneme>> *phonemes);
// Decodes the UTF-8 bytes in `s` into a std::wstring.
//
// The conversion is done with std::codecvt_utf8_utf16; for background see
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
static std::wstring ToWideString(const std::string &s) {
  using Utf8Facet = std::codecvt_utf8_utf16<wchar_t>;
  std::wstring_convert<Utf8Facet> utf8_decoder;
  std::wstring wide = utf8_decoder.from_bytes(s);
  return wide;
}
// Encodes the wide string `s` as UTF-8 bytes; the inverse of ToWideString().
//
// The conversion is done with std::codecvt_utf8_utf16; for background see
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
static std::string ToString(const std::wstring &s) {
  using Utf8Facet = std::codecvt_utf8_utf16<wchar_t>;
  std::wstring_convert<Utf8Facet> utf8_encoder;
  std::string bytes = utf8_encoder.to_bytes(s);
  return bytes;
}
class KokoroMultiLangLexicon::Impl {
public:
// Builds the frontend from files on the local filesystem.
//
// @param tokens    Path to tokens.txt.
// @param lexicon   One or more lexicon file paths, separated by ','.
// @param dict_dir  Directory holding the jieba dictionary files.
// @param data_dir  Data directory handed to InitEspeak().
// @param meta_data Model metadata; a copy is stored.
// @param debug     If true, intermediate results are logged.
Impl(const std::string &tokens, const std::string &lexicon,
const std::string &dict_dir, const std::string &data_dir,
const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
: meta_data_(meta_data), debug_(debug) {
// Tokens must be loaded first: InitLexicon() maps each pronunciation to IDs
// through token2id_.
InitTokens(tokens);
InitLexicon(lexicon);
InitJieba(dict_dir);
InitEspeak(data_dir); // See ./piper-phonemize-lexicon.cc
}
// Same as the filesystem constructor, except that tokens and lexicon files
// are read through `mgr` (ReadFile(mgr, ...)). This file instantiates it for
// AAssetManager (Android) and NativeResourceManager (OHOS).
//
// @param mgr       Platform resource manager used to read tokens/lexicon.
// @param tokens    Name of tokens.txt as understood by `mgr`.
// @param lexicon   One or more lexicon names, separated by ','.
// @param dict_dir  Filesystem directory holding the jieba dictionary files.
// @param data_dir  Filesystem data directory handed to InitEspeak().
// @param meta_data Model metadata; a copy is stored.
// @param debug     If true, intermediate results are logged.
template <typename Manager>
Impl(Manager *mgr, const std::string &tokens, const std::string &lexicon,
const std::string &dict_dir, const std::string &data_dir,
const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
: meta_data_(meta_data), debug_(debug) {
// Tokens must be loaded first: InitLexicon() maps each pronunciation to IDs
// through token2id_.
InitTokens(mgr, tokens);
InitLexicon(mgr, lexicon);
// we assume you have copied dict_dir and data_dir from assets to some path
InitJieba(dict_dir);
InitEspeak(data_dir); // See ./piper-phonemize-lexicon.cc
}
// Converts raw UTF-8 text into token-ID sequences.
//
// Steps:
//  1. Lowercase the text.
//  2. Map full-width CJK punctuation (and ASCII ':') to ASCII punctuation and
//     collapse every whitespace run into a single space.
//  3. Split the text with a wide-character regex into single punctuation
//     marks, runs of CJK ideographs (U+4E00..U+9FFF) and runs of ASCII.
//     Anything else is not matched and is therefore dropped.
//  4. Convert each segment with ConvertChineseToTokenIDs() or
//     ConvertEnglishToTokenIDs(), chosen by the first byte of the segment.
//  5. A sequence with more than 4 IDs becomes a new entry of the result; a
//     shorter one (e.g. a lone punctuation mark) is appended to the previous
//     entry so that it is not synthesized on its own.
//
// @param _text UTF-8 encoded input text.
// @return The token-ID sequences, in input order.
std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text) const {
  std::string text = ToLowerCase(_text);
  if (debug_) {
    SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str());
  }

  // {regex pattern, replacement}. The non-ASCII patterns are written as UTF-8
  // byte escapes so that they survive tools that mangle non-ASCII source
  // characters. An empty pattern must never appear here: it matches at every
  // position and would insert the replacement between all characters.
  const std::vector<std::pair<std::string, std::string>> replace_str_pairs = {
      {"\xEF\xBC\x8C", ","},  // U+FF0C fullwidth comma
      {":", ","},
      {"\xE3\x80\x81", ","},  // U+3001 ideographic comma
      {"\xEF\xBC\x9B", ";"},  // U+FF1B fullwidth semicolon
      {"\xEF\xBC\x9A", ":"},  // U+FF1A fullwidth colon
      {"\xE3\x80\x82", "."},  // U+3002 ideographic full stop
      {"\xEF\xBC\x9F", "?"},  // U+FF1F fullwidth question mark
      {"\xEF\xBC\x81", "!"},  // U+FF01 fullwidth exclamation mark
      {"\\s+", " "},
  };
  for (const auto &p : replace_str_pairs) {
    std::regex re(p.first);
    text = std::regex_replace(text, re, p.second);
  }
  if (debug_) {
    SHERPA_ONNX_LOGE("After replacing punctuations and merging spaces:\n%s",
                     text.c_str());
  }

  // https://en.cppreference.com/w/cpp/regex
  // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
  std::string expr =
      "([;:,.?!'\"\\(\\)“”])|([\\u4e00-\\u9fff]+)|([\\u0000-\\u007f]+)";
  auto ws = ToWideString(text);
  std::wstring wexpr = ToWideString(expr);
  std::wregex we(wexpr);
  auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we);
  auto end = std::wsregex_iterator();

  std::vector<TokenIDs> ans;
  for (std::wsregex_iterator i = begin; i != end; ++i) {
    std::wsmatch match = *i;
    std::wstring match_str = match.str();
    auto ms = ToString(match_str);

    // A leading byte below 0x80 means the segment starts with an ASCII
    // character; every other segment the regex can produce is CJK.
    uint8_t c = reinterpret_cast<const uint8_t *>(ms.data())[0];
    std::vector<std::vector<int32_t>> ids_vec;
    if (c < 0x80) {
      if (debug_) {
        SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
      }
      ids_vec = ConvertEnglishToTokenIDs(ms);
    } else {
      if (debug_) {
        SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
      }
      ids_vec = ConvertChineseToTokenIDs(ms);
    }

    for (const auto &ids : ids_vec) {
      if (ids.size() > 4 || ans.empty()) {
        ans.emplace_back(ids);
      } else if (ids.size() >= 2) {
        // Merge a short sequence into the previous one: overwrite the
        // previous trailing 0 with the first real ID and append the rest,
        // including this sequence's own trailing 0. The size check protects
        // the ids[1] access.
        auto &prev = ans.back().tokens;
        prev.back() = ids[1];
        prev.insert(prev.end(), ids.begin() + 2, ids.end());
      }
    }
  }

  if (debug_) {
    for (const auto &v : ans) {
      std::ostringstream os;
      os << "\n";
      std::string sep;
      for (auto i : v.tokens) {
        os << sep << i;
        sep = " ";
      }
      os << "\n";
      SHERPA_ONNX_LOGE("%s", os.str().c_str());
    }
  }
  return ans;
}
private:
// Returns true if `text` is exactly one of the punctuation marks that are
// looked up directly in the token table: ; : , . ! ? " ( ) plus the em dash,
// the horizontal ellipsis and the curly double quotes.
//
// The non-ASCII marks are written as UTF-8 byte escapes so that they survive
// tools that mangle non-ASCII source characters. Comparing against an empty
// string instead would never match a real word.
bool IsPunctuation(const std::string &text) const {
  if (text == ";" || text == ":" || text == "," || text == "." ||
      text == "!" || text == "?" ||
      text == "\xE2\x80\x94" ||  // U+2014 em dash
      text == "\xE2\x80\xA6" ||  // U+2026 horizontal ellipsis
      text == "\"" || text == "(" || text == ")" ||
      text == "\xE2\x80\x9C" ||  // U+201C left double quotation mark
      text == "\xE2\x80\x9D") {  // U+201D right double quotation mark
    return true;
  }
  return false;
}
// Looks up the token IDs of `w` in the lexicon.
//
// If `w` itself is a lexicon entry its IDs are returned. Otherwise `w` is
// split with SplitUtf8() and the IDs of every piece found in the lexicon are
// concatenated; pieces that are not found are logged and skipped.
std::vector<int32_t> ConvertWordToIds(const std::string &w) const {
  const auto whole = word2ids_.find(w);
  if (whole != word2ids_.end()) {
    return whole->second;
  }

  std::vector<int32_t> ans;
  for (const auto &piece : SplitUtf8(w)) {
    const auto entry = word2ids_.find(piece);
    if (entry == word2ids_.end()) {
      SHERPA_ONNX_LOGE("Skip OOV: '%s'", piece.c_str());
      continue;
    }
    ans.insert(ans.end(), entry->second.begin(), entry->second.end());
  }
  return ans;
}
// Converts a run of Chinese text into token-ID sentences.
//
// The text is segmented into words with jieba (HMM enabled); each word is
// mapped to IDs with ConvertWordToIds(). IDs are accumulated into a sentence
// that starts and ends with ID 0. When appending the next word would exceed
// the length limit derived from meta_data_.max_token_len, the current
// sentence is closed and a new one is started.
//
// @param text UTF-8 text made of CJK ideographs.
// @return The sentences; empty if no word produced any ID.
std::vector<std::vector<int32_t>> ConvertChineseToTokenIDs(
    const std::string &text) const {
  bool is_hmm = true;
  std::vector<std::string> words;
  jieba_->Cut(text, words, is_hmm);
  if (debug_) {
    std::ostringstream os;
    os << "After jieba processing:\n";
    std::string sep;
    for (const auto &w : words) {
      os << sep << w;
      sep = "_";
    }
    SHERPA_ONNX_LOGE("%s", os.str().c_str());
  }

  std::vector<std::vector<int32_t>> ans;
  std::vector<int32_t> this_sentence;
  int32_t max_len = meta_data_.max_token_len;
  this_sentence.push_back(0);
  for (const auto &w : words) {
    auto ids = ConvertWordToIds(w);
    // Only close the sentence if it already holds something besides the
    // leading 0; otherwise an overlong first word would emit an empty
    // "0 0" sentence.
    const bool has_content = this_sentence.size() > 1;
    if (has_content && this_sentence.size() + ids.size() > max_len - 2) {
      this_sentence.push_back(0);
      ans.push_back(std::move(this_sentence));
      // Make the state after the move explicit before starting a new sentence.
      this_sentence.clear();
      this_sentence.push_back(0);
    }
    this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
  }

  // Flush the last sentence unless it contains only the leading 0.
  if (this_sentence.size() > 1) {
    this_sentence.push_back(0);
    ans.push_back(std::move(this_sentence));
  }

  if (debug_) {
    for (const auto &v : ans) {
      std::ostringstream os;
      os << "\n";
      std::string sep;
      for (auto i : v) {
        os << sep << i;
        sep = " ";
      }
      os << "\n";
      SHERPA_ONNX_LOGE("%s", os.str().c_str());
    }
  }
  return ans;
}
// Converts a run of non-Chinese text into token-ID sentences.
//
// The text is split with SplitUtf8(). For every resulting item:
//  - punctuation is looked up directly in the token table; '.', '!', '?'
//    and ';' additionally close the current sentence;
//  - a word found in the lexicon contributes its IDs followed by the ID of
//    the space token;
//  - any other word is phonemized with espeak-ng (voice "en-us") and each
//    phoneme found in the token table contributes its ID, again followed by
//    the space ID.
// Every sentence starts and ends with ID 0. A sentence is also closed when it
// would exceed the length limit derived from meta_data_.max_token_len.
//
// @param text UTF-8 text (ASCII in practice, see the caller's regex).
// @return The sentences; empty if nothing produced any ID.
// @note token2id_.at() throws std::out_of_range if the space token or a
//       punctuation mark is missing from the token table.
std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs(
    const std::string &text) const {
  std::vector<std::string> words = SplitUtf8(text);
  if (debug_) {
    std::ostringstream os;
    os << "After splitting to words: ";
    std::string sep;
    for (const auto &w : words) {
      os << sep << w;
      sep = "_";
    }
    SHERPA_ONNX_LOGE("%s", os.str().c_str());
  }

  std::vector<std::vector<int32_t>> ans;
  int32_t max_len = meta_data_.max_token_len;
  std::vector<int32_t> this_sentence;
  int32_t space_id = token2id_.at(" ");
  this_sentence.push_back(0);

  // Closes the current sentence with a trailing 0, stores it and starts a
  // new sentence that holds only the leading 0.
  auto flush_sentence = [&ans, &this_sentence]() {
    this_sentence.push_back(0);
    ans.push_back(std::move(this_sentence));
    // Make the state after the move explicit before starting a new sentence.
    this_sentence.clear();
    this_sentence.push_back(0);
  };

  // Appends the IDs of one word plus a trailing space, starting a new
  // sentence first if the word does not fit. The extra 3 in the length check
  // is kept from the original code (presumably headroom -- TODO confirm).
  auto append_word = [&](const std::vector<int32_t> &ids) {
    // Do not flush a sentence that holds only the leading 0; that would
    // emit an empty "0 0" sentence for an overlong first word.
    const bool has_content = this_sentence.size() > 1;
    if (has_content && this_sentence.size() + ids.size() + 3 > max_len - 2) {
      flush_sentence();
    }
    this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
    this_sentence.push_back(space_id);
  };

  for (const auto &word : words) {
    if (IsPunctuation(word)) {
      this_sentence.push_back(token2id_.at(word));
      if (this_sentence.size() > max_len - 2) {
        // this sentence is too long, split it
        flush_sentence();
        continue;
      }
      if (word == "." || word == "!" || word == "?" || word == ";") {
        // Note: You can add more punctuations here to split the text
        // into sentences. We just use four here: .!?;
        flush_sentence();
      }
      continue;
    }

    const auto entry = word2ids_.find(word);
    if (entry != word2ids_.end()) {
      append_word(entry->second);
      continue;
    }

    SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'", word.c_str());
    piper::eSpeakPhonemeConfig config;
    config.voice = "en-us";
    std::vector<std::vector<piper::Phoneme>> phonemes;
    CallPhonemizeEspeak(word, config, &phonemes);

    // Note phonemes[i] contains a vector of unicode codepoints;
    // we need to convert them to utf8
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
    std::vector<int32_t> ids;
    for (const auto &v : phonemes) {
      for (const auto p : v) {
        auto token = conv.to_bytes(p);
        if (token2id_.count(token)) {
          ids.push_back(token2id_.at(token));
        } else {
          SHERPA_ONNX_LOGE("Skip OOV token '%s' from '%s'", token.c_str(),
                           word.c_str());
        }
      }
    }
    append_word(ids);
  }

  // Flush the last sentence unless it contains only the leading 0.
  if (this_sentence.size() > 1) {
    this_sentence.push_back(0);
    ans.push_back(std::move(this_sentence));
  }

  if (debug_) {
    for (const auto &v : ans) {
      std::ostringstream os;
      os << "\n";
      std::string sep;
      for (auto i : v) {
        os << sep << i;
        sep = " ";
      }
      os << "\n";
      SHERPA_ONNX_LOGE("%s", os.str().c_str());
    }
  }
  return ans;
}
void InitTokens(const std::string &tokens) {
std::ifstream is(tokens);
InitTokens(is);
}
template <typename Manager>
void InitTokens(Manager *mgr, const std::string &tokens) {
auto buf = ReadFile(mgr, tokens);
std::istrstream is(buf.data(), buf.size());
InitTokens(is);
}
void InitTokens(std::istream &is) {
token2id_ = ReadTokens(is); // defined in ./symbol-table.cc
}
void InitLexicon(const std::string &lexicon) {
std::vector<std::string> files;
SplitStringToVector(lexicon, ",", false, &files);
for (const auto &f : files) {
std::ifstream is(f);
InitLexicon(is);
}
}
template <typename Manager>
void InitLexicon(Manager *mgr, const std::string &lexicon) {
std::vector<std::string> files;
SplitStringToVector(lexicon, ",", false, &files);
for (const auto &f : files) {
auto buf = ReadFile(mgr, f);
std::istrstream is(buf.data(), buf.size());
InitLexicon(is);
}
}
// Parses lexicon entries from `is` and adds them to word2ids_.
//
// Each non-blank line has the form
//   word token1 token2 ...
// The word is lowercased and its tokens are mapped to IDs via token2id_.
// - Blank lines are skipped.
// - A word that is already present is ignored (the first entry wins); only
//   the first 9 such duplicates are logged.
// - A line whose tokens cannot be converted to IDs is logged and ignored.
void InitLexicon(std::istream &is) {
  std::string word;
  std::vector<std::string> token_list;
  std::string token;
  std::string line;
  int32_t line_num = 0;
  int32_t num_warn = 0;
  while (std::getline(is, line)) {
    ++line_num;
    std::istringstream iss(line);
    token_list.clear();

    // On a blank line the extraction fails and leaves `word` untouched,
    // i.e. holding a stale value from an earlier line. Skip such lines
    // instead of reporting a bogus duplicate or invalid pronunciation.
    if (!(iss >> word)) {
      continue;
    }
    ToLowerCase(&word);

    if (word2ids_.count(word)) {
      num_warn += 1;
      if (num_warn < 10) {
        SHERPA_ONNX_LOGE("Duplicated word: %s at line %d:%s. Ignore it.",
                         word.c_str(), line_num, line.c_str());
      }
      continue;
    }

    while (iss >> token) {
      token_list.push_back(std::move(token));
    }

    std::vector<int32_t> ids = ConvertTokensToIds(token2id_, token_list);
    if (ids.empty()) {
      SHERPA_ONNX_LOGE(
          "Invalid pronunciation for word '%s' at line %d:%s. Ignore it",
          word.c_str(), line_num, line.c_str());
      continue;
    }
    word2ids_.insert({std::move(word), std::move(ids)});
  }
}
void InitJieba(const std::string &dict_dir) {
std::string dict = dict_dir + "/jieba.dict.utf8";
std::string hmm = dict_dir + "/hmm_model.utf8";
std::string user_dict = dict_dir + "/user.dict.utf8";
std::string idf = dict_dir + "/idf.utf8";
std::string stop_word = dict_dir + "/stop_words.utf8";
AssertFileExists(dict);
AssertFileExists(hmm);
AssertFileExists(user_dict);
AssertFileExists(idf);
AssertFileExists(stop_word);
jieba_ =
std::make_unique<cppjieba::Jieba>(dict, hmm, user_dict, idf, stop_word);
}
private:
// Copy of the model metadata; max_token_len is used to limit sentence length.
OfflineTtsKokoroModelMetaData meta_data_;
// word to token IDs
// Filled by InitLexicon(); keys are lowercased words.
std::unordered_map<std::string, std::vector<int32_t>> word2ids_;
// tokens.txt is saved in token2id_
std::unordered_map<std::string, int32_t> token2id_;
// Chinese word segmenter, created by InitJieba().
std::unique_ptr<cppjieba::Jieba> jieba_;
// If true, intermediate results are logged.
bool debug_ = false;
};
// Defined here, where Impl is a complete type; the header only
// forward-declares Impl and std::unique_ptr<Impl> needs the full definition
// to destroy it.
KokoroMultiLangLexicon::~KokoroMultiLangLexicon() = default;
// Filesystem constructor: forwards every argument to Impl, which loads the
// tokens, the lexicon and the jieba dictionary and initializes espeak-ng.
KokoroMultiLangLexicon::KokoroMultiLangLexicon(
const std::string &tokens, const std::string &lexicon,
const std::string &dict_dir, const std::string &data_dir,
const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
: impl_(std::make_unique<Impl>(tokens, lexicon, dict_dir, data_dir,
meta_data, debug)) {}
// Resource-manager constructor: like the filesystem constructor, but tokens
// and lexicon are read through `mgr`. The template is defined in this .cc
// file only, so it is usable just for the Manager types that are explicitly
// instantiated at the end of this file.
template <typename Manager>
KokoroMultiLangLexicon::KokoroMultiLangLexicon(
Manager *mgr, const std::string &tokens, const std::string &lexicon,
const std::string &dict_dir, const std::string &data_dir,
const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
: impl_(std::make_unique<Impl>(mgr, tokens, lexicon, dict_dir, data_dir,
meta_data, debug)) {}
// Converts `text` to token-ID sequences by delegating to Impl. The voice
// argument of the OfflineTtsFrontend interface is ignored by this frontend.
std::vector<TokenIDs> KokoroMultiLangLexicon::ConvertTextToTokenIds(
const std::string &text, const std::string & /*unused_voice = ""*/) const {
return impl_->ConvertTextToTokenIds(text);
}
// Explicit instantiations of the resource-manager constructor. They are
// required because the template is defined in this translation unit only.
#if __ANDROID_API__ >= 9
// Android: assets are read through AAssetManager.
template KokoroMultiLangLexicon::KokoroMultiLangLexicon(
AAssetManager *mgr, const std::string &tokens, const std::string &lexicon,
const std::string &dict_dir, const std::string &data_dir,
const OfflineTtsKokoroModelMetaData &meta_data, bool debug);
#endif
#if __OHOS__
// OHOS: raw files are read through NativeResourceManager.
template KokoroMultiLangLexicon::KokoroMultiLangLexicon(
NativeResourceManager *mgr, const std::string &tokens,
const std::string &lexicon, const std::string &dict_dir,
const std::string &data_dir, const OfflineTtsKokoroModelMetaData &meta_data,
bool debug);
#endif
} // namespace sherpa_onnx

View File

@@ -0,0 +1,45 @@
// sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h
//
// Copyright (c) 2025 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_KOKORO_MULTI_LANG_LEXICON_H_
#define SHERPA_ONNX_CSRC_KOKORO_MULTI_LANG_LEXICON_H_
#include <memory>
#include <string>
#include <vector>
#include "sherpa-onnx/csrc/offline-tts-frontend.h"
#include "sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h"
namespace sherpa_onnx {
// Text frontend for multi-lingual Kokoro models.
//
// It converts text to token IDs using a lexicon, jieba for Chinese word
// segmentation and espeak-ng for English words missing from the lexicon.
// The implementation is hidden behind the Impl class (pimpl).
class KokoroMultiLangLexicon : public OfflineTtsFrontend {
public:
// Declared here and defined in the .cc file, where Impl is complete.
~KokoroMultiLangLexicon() override;
// Creates the frontend from files on the local filesystem.
//
// @param tokens    Path to tokens.txt.
// @param lexicon   One or more lexicon file paths, separated by ','.
// @param dict_dir  Directory containing the jieba dictionary files.
// @param data_dir  Data directory for espeak-ng.
// @param meta_data Metadata of the model.
// @param debug     If true, intermediate results are logged.
KokoroMultiLangLexicon(const std::string &tokens, const std::string &lexicon,
const std::string &dict_dir,
const std::string &data_dir,
const OfflineTtsKokoroModelMetaData &meta_data,
bool debug);
// Same as above, but tokens and lexicon are read through the platform
// resource manager `mgr`. Defined in the .cc file and explicitly
// instantiated there for the supported Manager types.
template <typename Manager>
KokoroMultiLangLexicon(Manager *mgr, const std::string &tokens,
const std::string &lexicon,
const std::string &dict_dir,
const std::string &data_dir,
const OfflineTtsKokoroModelMetaData &meta_data,
bool debug);
// Converts `text` to token-ID sequences. `voice` is not used by this
// frontend.
std::vector<TokenIDs> ConvertTextToTokenIds(
const std::string &text, const std::string &voice = "") const override;
private:
class Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_KOKORO_MULTI_LANG_LEXICON_H_

View File

@@ -6,7 +6,9 @@
#include <fstream>
#include <regex> // NOLINT
#include <sstream>
#include <strstream>
#include <unordered_map>
#include <utility>
#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"

View File

@@ -7,7 +7,6 @@
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "sherpa-onnx/csrc/offline-tts-frontend.h"

View File

@@ -19,6 +19,9 @@ struct TokenIDs {
/*implicit*/ TokenIDs(std::vector<int64_t> tokens) // NOLINT
: tokens{std::move(tokens)} {}
/*implicit*/ TokenIDs(const std::vector<int32_t> &tokens) // NOLINT
: tokens{tokens.begin(), tokens.end()} {}
TokenIDs(std::vector<int64_t> tokens, // NOLINT
std::vector<int64_t> tones) // NOLINT
: tokens{std::move(tokens)}, tones{std::move(tones)} {}
@@ -51,6 +54,9 @@ class OfflineTtsFrontend {
const std::string &text, const std::string &voice = "") const = 0;
};
// implementation is in ./piper-phonemize-lexicon.cc
void InitEspeak(const std::string &data_dir);
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_

View File

@@ -13,6 +13,7 @@
#include "fst/extensions/far/far.h"
#include "kaldifst/csrc/kaldi-fst-io.h"
#include "kaldifst/csrc/text-normalizer.h"
#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h"
#include "sherpa-onnx/csrc/lexicon.h"
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/offline-tts-frontend.h"
@@ -314,6 +315,27 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
template <typename Manager>
void InitFrontend(Manager *mgr) {
const auto &meta_data = model_->GetMetaData();
if (meta_data.version >= 2) {
// this is a multi-lingual model, we require that you pass lexicon
// and dict_dir
if (config_.model.kokoro.lexicon.empty() ||
config_.model.kokoro.dict_dir.empty()) {
SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
SHERPA_ONNX_LOGE(
"You are using a multi-lingual Kokoro model (e.g., Kokoro >= "
"v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir");
SHERPA_ONNX_EXIT(-1);
}
frontend_ = std::make_unique<KokoroMultiLangLexicon>(
mgr, config_.model.kokoro.tokens, config_.model.kokoro.lexicon,
config_.model.kokoro.dict_dir, config_.model.kokoro.data_dir,
meta_data, config_.model.debug);
return;
}
frontend_ = std::make_unique<PiperPhonemizeLexicon>(
mgr, config_.model.kokoro.tokens, config_.model.kokoro.data_dir,
meta_data);
@@ -321,7 +343,27 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
// Selects the text frontend that matches the model version:
// PiperPhonemizeLexicon for version < 2, KokoroMultiLangLexicon otherwise.
// The multi-lingual frontend requires both a lexicon and a jieba dict dir;
// the process exits if either is missing.
void InitFrontend() {
  const auto &meta_data = model_->GetMetaData();
  const auto &kokoro = config_.model.kokoro;

  if (meta_data.version < 2) {
    // this is for kokoro v0.19, which supports only English
    frontend_ = std::make_unique<PiperPhonemizeLexicon>(
        kokoro.tokens, kokoro.data_dir, meta_data);
    return;
  }

  // this is a multi-lingual model, we require that you pass lexicon
  // and dict_dir
  const bool missing_input = kokoro.lexicon.empty() || kokoro.dict_dir.empty();
  if (missing_input) {
    SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
    SHERPA_ONNX_LOGE(
        "You are using a multi-lingual Kokoro model (e.g., Kokoro >= "
        "v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir");
    SHERPA_ONNX_EXIT(-1);
  }

  frontend_ = std::make_unique<KokoroMultiLangLexicon>(
      kokoro.tokens, kokoro.lexicon, kokoro.dict_dir, kokoro.data_dir,
      meta_data, config_.model.debug);
}

View File

@@ -8,6 +8,7 @@
#include "sherpa-onnx/csrc/file-utils.h"
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/text-utils.h"
namespace sherpa_onnx {
@@ -17,8 +18,16 @@ void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) {
"Path to voices.bin for Kokoro models");
po->Register("kokoro-tokens", &tokens,
"Path to tokens.txt for Kokoro models");
// Adjacent string literals are concatenated by the compiler, so each piece
// must end with its own separator; otherwise the help text reads
// "v1.0You can pass".
po->Register(
    "kokoro-lexicon", &lexicon,
    "Path to lexicon.txt for Kokoro models. Used only for Kokoro >= v1.0. "
    "You can pass multiple files, separated by ','. Example: "
    "./lexicon-us-en.txt,./lexicon-zh.txt");
po->Register("kokoro-data-dir", &data_dir,
"Path to the directory containing dict for espeak-ng.");
po->Register("kokoro-dict-dir", &dict_dir,
"Path to the directory containing dict for jieba. "
"Used only for Kokoro >= v1.0");
po->Register("kokoro-length-scale", &length_scale,
"Speech speed. Larger->Slower; Smaller->faster.");
}
@@ -44,6 +53,19 @@ bool OfflineTtsKokoroModelConfig::Validate() const {
return false;
}
if (!lexicon.empty()) {
std::vector<std::string> files;
SplitStringToVector(lexicon, ",", false, &files);
for (const auto &f : files) {
if (!FileExists(f)) {
SHERPA_ONNX_LOGE(
"lexicon '%s' does not exist. Please re-check --kokoro-lexicon",
f.c_str());
return false;
}
}
}
if (data_dir.empty()) {
SHERPA_ONNX_LOGE("Please provide --kokoro-data-dir");
return false;
@@ -77,6 +99,21 @@ bool OfflineTtsKokoroModelConfig::Validate() const {
return false;
}
if (!dict_dir.empty()) {
std::vector<std::string> required_files = {
"jieba.dict.utf8", "hmm_model.utf8", "user.dict.utf8",
"idf.utf8", "stop_words.utf8",
};
for (const auto &f : required_files) {
if (!FileExists(dict_dir + "/" + f)) {
SHERPA_ONNX_LOGE("'%s/%s' does not exist. Please check kokoro-dict-dir",
dict_dir.c_str(), f.c_str());
return false;
}
}
}
return true;
}
@@ -87,7 +124,9 @@ std::string OfflineTtsKokoroModelConfig::ToString() const {
os << "model=\"" << model << "\", ";
os << "voices=\"" << voices << "\", ";
os << "tokens=\"" << tokens << "\", ";
os << "lexicon=\"" << lexicon << "\", ";
os << "data_dir=\"" << data_dir << "\", ";
os << "dict_dir=\"" << dict_dir << "\", ";
os << "length_scale=" << length_scale << ")";
return os.str();

View File

@@ -16,8 +16,14 @@ struct OfflineTtsKokoroModelConfig {
std::string voices;
std::string tokens;
// Note: You can pass multiple files, separated by ",", to lexicon
// Example: lexicon = "./lexicon-gb-en.txt,./lexicon-zh.txt";
std::string lexicon;
std::string data_dir;
std::string dict_dir;
// speed = 1 / length_scale
float length_scale = 1.0;
@@ -26,11 +32,15 @@ struct OfflineTtsKokoroModelConfig {
OfflineTtsKokoroModelConfig(const std::string &model,
const std::string &voices,
const std::string &tokens,
const std::string &data_dir, float length_scale)
const std::string &lexicon,
const std::string &data_dir,
const std::string &dict_dir, float length_scale)
: model(model),
voices(voices),
tokens(tokens),
lexicon(lexicon),
data_dir(data_dir),
dict_dir(dict_dir),
length_scale(length_scale) {}
void Register(ParseOptions *po);

View File

@@ -32,10 +32,9 @@
namespace sherpa_onnx {
static void CallPhonemizeEspeak(
const std::string &text,
piper::eSpeakPhonemeConfig &config, // NOLINT
std::vector<std::vector<piper::Phoneme>> *phonemes) {
void CallPhonemizeEspeak(const std::string &text,
piper::eSpeakPhonemeConfig &config, // NOLINT
std::vector<std::vector<piper::Phoneme>> *phonemes) {
static std::mutex espeak_mutex;
std::lock_guard<std::mutex> lock(espeak_mutex);
@@ -245,7 +244,7 @@ static std::vector<int64_t> CoquiPhonemesToIds(
return ans;
}
static void InitEspeak(const std::string &data_dir) {
void InitEspeak(const std::string &data_dir) {
static std::once_flag init_flag;
std::call_once(init_flag, [data_dir]() {
int32_t result =

View File

@@ -241,7 +241,6 @@ Java_com_k2fsa_sherpa_onnx_OfflineTts_generateImpl(JNIEnv *env, jobject /*obj*/,
jlong ptr, jstring text,
jint sid, jfloat speed) {
const char *p_text = env->GetStringUTFChars(text, nullptr);
SHERPA_ONNX_LOGE("string is: %s", p_text);
auto audio = reinterpret_cast<sherpa_onnx::OfflineTts *>(ptr)->Generate(
p_text, sid, speed);
@@ -267,7 +266,6 @@ Java_com_k2fsa_sherpa_onnx_OfflineTts_generateWithCallbackImpl(
JNIEnv *env, jobject /*obj*/, jlong ptr, jstring text, jint sid,
jfloat speed, jobject callback) {
const char *p_text = env->GetStringUTFChars(text, nullptr);
SHERPA_ONNX_LOGE("string is: %s", p_text);
std::function<int32_t(const float *, int32_t, float)> callback_wrapper =
[env, callback](const float *samples, int32_t n,

View File

@@ -16,13 +16,17 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) {
py::class_<PyClass>(*m, "OfflineTtsKokoroModelConfig")
.def(py::init<>())
.def(py::init<const std::string &, const std::string &,
const std::string &, const std::string &,
const std::string &, const std::string &, float>(),
py::arg("model"), py::arg("voices"), py::arg("tokens"),
py::arg("data_dir"), py::arg("length_scale") = 1.0)
py::arg("lexicon") = "", py::arg("data_dir"),
py::arg("dict_dir") = "", py::arg("length_scale") = 1.0)
.def_readwrite("model", &PyClass::model)
.def_readwrite("voices", &PyClass::voices)
.def_readwrite("tokens", &PyClass::tokens)
.def_readwrite("lexicon", &PyClass::lexicon)
.def_readwrite("data_dir", &PyClass::data_dir)
.def_readwrite("dict_dir", &PyClass::dict_dir)
.def_readwrite("length_scale", &PyClass::length_scale)
.def("__str__", &PyClass::ToString)
.def("validate", &PyClass::Validate);