Support extra languages in multi-lang kokoro tts (#2303)
This commit is contained in:
16
.github/workflows/test-build-wheel.yaml
vendored
16
.github/workflows/test-build-wheel.yaml
vendored
@@ -35,18 +35,18 @@ jobs:
|
||||
matrix:
|
||||
# See https://github.com/actions/runner-images
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
python-version: "3.7"
|
||||
- os: ubuntu-22.04
|
||||
- os: ubuntu-latest
|
||||
python-version: "3.8"
|
||||
- os: ubuntu-22.04
|
||||
- os: ubuntu-latest
|
||||
python-version: "3.9"
|
||||
- os: ubuntu-22.04
|
||||
- os: ubuntu-latest
|
||||
python-version: "3.10"
|
||||
- os: ubuntu-22.04
|
||||
- os: ubuntu-latest
|
||||
python-version: "3.11"
|
||||
- os: ubuntu-22.04
|
||||
- os: ubuntu-latest
|
||||
python-version: "3.12"
|
||||
- os: ubuntu-latest
|
||||
python-version: "3.13"
|
||||
|
||||
- os: macos-13
|
||||
python-version: "3.8"
|
||||
@@ -103,7 +103,7 @@ jobs:
|
||||
export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
|
||||
cmake --version
|
||||
|
||||
export SHERPA_ONNX_MAKE_ARGS="VERBOSE=1 -j"
|
||||
export SHERPA_ONNX_MAKE_ARGS="VERBOSE=1 -j2"
|
||||
|
||||
python3 setup.py bdist_wheel
|
||||
ls -lh dist
|
||||
|
||||
17
README.md
17
README.md
@@ -1,18 +1,18 @@
|
||||
### Supported functions
|
||||
|
||||
|Speech recognition| Speech synthesis | Source separation |
|
||||
|Speech recognition| [Speech synthesis][tts-url] | [Source separation][ss-url] |
|
||||
|------------------|------------------|-------------------|
|
||||
| ✔️ | ✔️ | ✔️ |
|
||||
|
||||
|Speaker identification| Speaker diarization | Speaker verification |
|
||||
|Speaker identification| [Speaker diarization][sd-url] | Speaker verification |
|
||||
|----------------------|-------------------- |------------------------|
|
||||
| ✔️ | ✔️ | ✔️ |
|
||||
|
||||
| Spoken Language identification | Audio tagging | Voice activity detection |
|
||||
| [Spoken Language identification][slid-url] | [Audio tagging][at-url] | [Voice activity detection][vad-url] |
|
||||
|--------------------------------|---------------|--------------------------|
|
||||
| ✔️ | ✔️ | ✔️ |
|
||||
|
||||
| Keyword spotting | Add punctuation | Speech enhancement |
|
||||
| [Keyword spotting][kws-url] | [Add punctuation][punct-url] | [Speech enhancement][se-url] |
|
||||
|------------------|-----------------|--------------------|
|
||||
| ✔️ | ✔️ | ✔️ |
|
||||
|
||||
@@ -501,3 +501,12 @@ It uses sherpa-onnx for speech-to-text and text-to-speech.
|
||||
[spleeter]: https://github.com/deezer/spleeter
|
||||
[UVR]: https://github.com/Anjok07/ultimatevocalremovergui
|
||||
[gtcrn]: https://github.com/Xiaobin-Rong/gtcrn
|
||||
[tts-url]: https://k2-fsa.github.io/sherpa/onnx/tts/all-in-one.html
|
||||
[ss-url]: https://k2-fsa.github.io/sherpa/onnx/source-separation/index.html
|
||||
[sd-url]: https://k2-fsa.github.io/sherpa/onnx/speaker-diarization/index.html
|
||||
[slid-url]: https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/index.html
|
||||
[at-url]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/index.html
|
||||
[vad-url]: https://k2-fsa.github.io/sherpa/onnx/vad/index.html
|
||||
[kws-url]: https://k2-fsa.github.io/sherpa/onnx/kws/index.html
|
||||
[punct-url]: https://k2-fsa.github.io/sherpa/onnx/punctuation/index.html
|
||||
[se-url]: https://k2-fsa.github.io/sherpa/onnx/speech-enhancment/index.html
|
||||
|
||||
@@ -201,6 +201,7 @@ final class SherpaOnnxOfflineTtsKokoroModelConfig extends Struct {
|
||||
external double lengthScale;
|
||||
external Pointer<Utf8> dictDir;
|
||||
external Pointer<Utf8> lexicon;
|
||||
external Pointer<Utf8> lang;
|
||||
}
|
||||
|
||||
final class SherpaOnnxOfflineTtsModelConfig extends Struct {
|
||||
|
||||
@@ -117,6 +117,7 @@ class OfflineTtsKokoroModelConfig {
|
||||
this.lengthScale = 1.0,
|
||||
this.dictDir = '',
|
||||
this.lexicon = '',
|
||||
this.lang = '',
|
||||
});
|
||||
|
||||
factory OfflineTtsKokoroModelConfig.fromJson(Map<String, dynamic> json) {
|
||||
@@ -128,12 +129,13 @@ class OfflineTtsKokoroModelConfig {
|
||||
lengthScale: (json['lengthScale'] as num?)?.toDouble() ?? 1.0,
|
||||
dictDir: json['dictDir'] as String? ?? '',
|
||||
lexicon: json['lexicon'] as String? ?? '',
|
||||
lang: json['lang'] as String? ?? '',
|
||||
);
|
||||
}
|
||||
|
||||
@override
|
||||
String toString() {
|
||||
return 'OfflineTtsKokoroModelConfig(model: $model, voices: $voices, tokens: $tokens, dataDir: $dataDir, lengthScale: $lengthScale, dictDir: $dictDir, lexicon: $lexicon)';
|
||||
return 'OfflineTtsKokoroModelConfig(model: $model, voices: $voices, tokens: $tokens, dataDir: $dataDir, lengthScale: $lengthScale, dictDir: $dictDir, lexicon: $lexicon, lang: $lang)';
|
||||
}
|
||||
|
||||
Map<String, dynamic> toJson() => {
|
||||
@@ -144,6 +146,7 @@ class OfflineTtsKokoroModelConfig {
|
||||
'lengthScale': lengthScale,
|
||||
'dictDir': dictDir,
|
||||
'lexicon': lexicon,
|
||||
'lang': lang,
|
||||
};
|
||||
|
||||
final String model;
|
||||
@@ -153,6 +156,7 @@ class OfflineTtsKokoroModelConfig {
|
||||
final double lengthScale;
|
||||
final String dictDir;
|
||||
final String lexicon;
|
||||
final String lang;
|
||||
}
|
||||
|
||||
class OfflineTtsModelConfig {
|
||||
@@ -286,6 +290,7 @@ class OfflineTts {
|
||||
c.ref.model.kokoro.lengthScale = config.model.kokoro.lengthScale;
|
||||
c.ref.model.kokoro.dictDir = config.model.kokoro.dictDir.toNativeUtf8();
|
||||
c.ref.model.kokoro.lexicon = config.model.kokoro.lexicon.toNativeUtf8();
|
||||
c.ref.model.kokoro.lang = config.model.kokoro.lang.toNativeUtf8();
|
||||
|
||||
c.ref.model.numThreads = config.model.numThreads;
|
||||
c.ref.model.debug = config.model.debug ? 1 : 0;
|
||||
@@ -302,6 +307,7 @@ class OfflineTts {
|
||||
calloc.free(c.ref.ruleFsts);
|
||||
calloc.free(c.ref.model.provider);
|
||||
|
||||
calloc.free(c.ref.model.kokoro.lang);
|
||||
calloc.free(c.ref.model.kokoro.lexicon);
|
||||
calloc.free(c.ref.model.kokoro.dictDir);
|
||||
calloc.free(c.ref.model.kokoro.dataDir);
|
||||
|
||||
@@ -70,6 +70,7 @@ static SherpaOnnxOfflineTtsKokoroModelConfig GetOfflineTtsKokoroModelConfig(
|
||||
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(length_scale, lengthScale);
|
||||
SHERPA_ONNX_ASSIGN_ATTR_STR(dict_dir, dictDir);
|
||||
SHERPA_ONNX_ASSIGN_ATTR_STR(lexicon, lexicon);
|
||||
SHERPA_ONNX_ASSIGN_ATTR_STR(lang, lang);
|
||||
|
||||
return c;
|
||||
}
|
||||
@@ -177,6 +178,7 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.data_dir);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.dict_dir);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.lexicon);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.lang);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model.provider);
|
||||
|
||||
|
||||
@@ -36,6 +36,7 @@ export class OfflineTtsKokoroModelConfig {
|
||||
public lengthScale: number = 1.0;
|
||||
public dictDir: string = '';
|
||||
public lexicon: string = '';
|
||||
public lang: string = '';
|
||||
}
|
||||
|
||||
export class OfflineTtsModelConfig {
|
||||
|
||||
@@ -18,6 +18,7 @@ namespace SherpaOnnx
|
||||
|
||||
DictDir = "";
|
||||
Lexicon = "";
|
||||
Lang = "";
|
||||
}
|
||||
[MarshalAs(UnmanagedType.LPStr)]
|
||||
public string Model;
|
||||
@@ -38,5 +39,8 @@ namespace SherpaOnnx
|
||||
|
||||
[MarshalAs(UnmanagedType.LPStr)]
|
||||
public string Lexicon;
|
||||
|
||||
[MarshalAs(UnmanagedType.LPStr)]
|
||||
public string Lang;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -857,6 +857,7 @@ type OfflineTtsKokoroModelConfig struct {
|
||||
DataDir string // Path to espeak-ng-data directory
|
||||
DictDir string // Path to dict directory
|
||||
Lexicon string // Path to lexicon files
|
||||
Lang string // Example: es for Spanish, fr-fr for French. Can be empty
|
||||
LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed
|
||||
}
|
||||
|
||||
@@ -1006,6 +1007,9 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
|
||||
c.model.kokoro.lexicon = C.CString(config.Model.Kokoro.Lexicon)
|
||||
defer C.free(unsafe.Pointer(c.model.kokoro.lexicon))
|
||||
|
||||
c.model.kokoro.lang = C.CString(config.Model.Kokoro.Lang)
|
||||
defer C.free(unsafe.Pointer(c.model.kokoro.lang))
|
||||
|
||||
c.model.kokoro.length_scale = C.float(config.Model.Kokoro.LengthScale)
|
||||
|
||||
c.model.num_threads = C.int(config.Model.NumThreads)
|
||||
|
||||
@@ -1164,6 +1164,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig(
|
||||
SHERPA_ONNX_OR(config->model.kokoro.dict_dir, "");
|
||||
tts_config.model.kokoro.lexicon =
|
||||
SHERPA_ONNX_OR(config->model.kokoro.lexicon, "");
|
||||
tts_config.model.kokoro.lang = SHERPA_ONNX_OR(config->model.kokoro.lang, "");
|
||||
|
||||
tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1);
|
||||
tts_config.model.debug = config->model.debug;
|
||||
|
||||
@@ -958,6 +958,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsKokoroModelConfig {
|
||||
float length_scale; // < 1, faster in speech speed; > 1, slower in speed
|
||||
const char *dict_dir;
|
||||
const char *lexicon;
|
||||
const char *lang;
|
||||
} SherpaOnnxOfflineTtsKokoroModelConfig;
|
||||
|
||||
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig {
|
||||
|
||||
@@ -366,6 +366,7 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) {
|
||||
c.model.kokoro.length_scale = config.model.kokoro.length_scale;
|
||||
c.model.kokoro.dict_dir = config.model.kokoro.dict_dir.c_str();
|
||||
c.model.kokoro.lexicon = config.model.kokoro.lexicon.c_str();
|
||||
c.model.kokoro.lang = config.model.kokoro.lang.c_str();
|
||||
|
||||
c.model.num_threads = config.model.num_threads;
|
||||
c.model.debug = config.model.debug;
|
||||
|
||||
@@ -367,6 +367,7 @@ struct OfflineTtsKokoroModelConfig {
|
||||
std::string data_dir;
|
||||
std::string dict_dir;
|
||||
std::string lexicon;
|
||||
std::string lang;
|
||||
|
||||
float length_scale = 1.0; // < 1, faster in speed; > 1, slower in speed
|
||||
};
|
||||
|
||||
@@ -67,7 +67,8 @@ class KokoroMultiLangLexicon::Impl {
|
||||
InitEspeak(data_dir); // See ./piper-phonemize-lexicon.cc
|
||||
}
|
||||
|
||||
std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text) const {
|
||||
std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text,
|
||||
const std::string &voice) const {
|
||||
std::string text = ToLowerCase(_text);
|
||||
if (debug_) {
|
||||
SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str());
|
||||
@@ -124,7 +125,7 @@ class KokoroMultiLangLexicon::Impl {
|
||||
SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
|
||||
}
|
||||
|
||||
ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
|
||||
ids_vec = ConvertNonChineseToTokenIDs(ms, voice);
|
||||
}
|
||||
|
||||
for (const auto &ids : ids_vec) {
|
||||
@@ -255,8 +256,30 @@ class KokoroMultiLangLexicon::Impl {
|
||||
return ans;
|
||||
}
|
||||
|
||||
std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs(
|
||||
std::vector<std::vector<int32_t>> ConvertTextToTokenIDsWithEspeak(
|
||||
const std::string &text, const std::string &voice) const {
|
||||
auto temp = ConvertTextToTokenIdsKokoro(
|
||||
phoneme2id_, meta_data_.max_token_len, text, voice);
|
||||
std::vector<std::vector<int32_t>> ans;
|
||||
ans.reserve(temp.size());
|
||||
|
||||
for (const auto &i : temp) {
|
||||
ans.emplace_back(i.tokens.begin(), i.tokens.end());
|
||||
}
|
||||
|
||||
return ans;
|
||||
}
|
||||
|
||||
std::vector<std::vector<int32_t>> ConvertNonChineseToTokenIDs(
|
||||
const std::string &text, const std::string &voice) const {
|
||||
if (!voice.empty()) {
|
||||
return ConvertTextToTokenIDsWithEspeak(text, voice);
|
||||
}
|
||||
|
||||
// If voice is empty, we split the text into words and use the lexicon
|
||||
// to look up the pronunciation of each word, falling back to espeak if
|
||||
// a word is not in the lexicon.
|
||||
|
||||
std::vector<std::string> words = SplitUtf8(text);
|
||||
if (debug_) {
|
||||
std::ostringstream os;
|
||||
@@ -317,7 +340,7 @@ class KokoroMultiLangLexicon::Impl {
|
||||
|
||||
piper::eSpeakPhonemeConfig config;
|
||||
|
||||
config.voice = voice;
|
||||
config.voice = meta_data_.voice;
|
||||
|
||||
std::vector<std::vector<piper::Phoneme>> phonemes;
|
||||
|
||||
@@ -391,9 +414,28 @@ class KokoroMultiLangLexicon::Impl {
|
||||
|
||||
void InitTokens(std::istream &is) {
|
||||
token2id_ = ReadTokens(is); // defined in ./symbol-table.cc
|
||||
|
||||
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
|
||||
std::u32string s;
|
||||
for (const auto &p : token2id_) {
|
||||
s = conv.from_bytes(p.first);
|
||||
|
||||
if (s.size() != 1) {
|
||||
SHERPA_ONNX_LOGE("Error for token %s with id %d", p.first.c_str(),
|
||||
p.second);
|
||||
SHERPA_ONNX_EXIT(-1);
|
||||
}
|
||||
|
||||
char32_t c = s[0];
|
||||
phoneme2id_.insert({c, p.second});
|
||||
}
|
||||
}
|
||||
|
||||
void InitLexicon(const std::string &lexicon) {
|
||||
if (lexicon.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<std::string> files;
|
||||
SplitStringToVector(lexicon, ",", false, &files);
|
||||
for (const auto &f : files) {
|
||||
@@ -404,6 +446,10 @@ class KokoroMultiLangLexicon::Impl {
|
||||
|
||||
template <typename Manager>
|
||||
void InitLexicon(Manager *mgr, const std::string &lexicon) {
|
||||
if (lexicon.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<std::string> files;
|
||||
SplitStringToVector(lexicon, ",", false, &files);
|
||||
for (const auto &f : files) {
|
||||
@@ -445,7 +491,7 @@ class KokoroMultiLangLexicon::Impl {
|
||||
|
||||
std::vector<int32_t> ids = ConvertTokensToIds(token2id_, token_list);
|
||||
|
||||
if (ids.empty()) {
|
||||
if (ids.empty() && word != "呣") {
|
||||
SHERPA_ONNX_LOGE(
|
||||
"Invalid pronunciation for word '%s' at line %d:%s. Ignore it",
|
||||
word.c_str(), line_num, line.c_str());
|
||||
@@ -465,6 +511,8 @@ class KokoroMultiLangLexicon::Impl {
|
||||
// tokens.txt is saved in token2id_
|
||||
std::unordered_map<std::string, int32_t> token2id_;
|
||||
|
||||
std::unordered_map<char32_t, int32_t> phoneme2id_;
|
||||
|
||||
std::unique_ptr<cppjieba::Jieba> jieba_;
|
||||
bool debug_ = false;
|
||||
};
|
||||
@@ -487,8 +535,8 @@ KokoroMultiLangLexicon::KokoroMultiLangLexicon(
|
||||
meta_data, debug)) {}
|
||||
|
||||
std::vector<TokenIDs> KokoroMultiLangLexicon::ConvertTextToTokenIds(
|
||||
const std::string &text, const std::string & /*unused_voice = ""*/) const {
|
||||
return impl_->ConvertTextToTokenIds(text);
|
||||
const std::string &text, const std::string &voice /*= ""*/) const {
|
||||
return impl_->ConvertTextToTokenIds(text, voice);
|
||||
}
|
||||
|
||||
#if __ANDROID_API__ >= 9
|
||||
|
||||
@@ -20,9 +20,9 @@ struct OfflineSpeechDenoiserModelConfig {
|
||||
|
||||
OfflineSpeechDenoiserModelConfig() = default;
|
||||
|
||||
OfflineSpeechDenoiserModelConfig(OfflineSpeechDenoiserGtcrnModelConfig gtcrn,
|
||||
int32_t num_threads, bool debug,
|
||||
const std::string &provider)
|
||||
OfflineSpeechDenoiserModelConfig(
|
||||
const OfflineSpeechDenoiserGtcrnModelConfig &gtcrn, int32_t num_threads,
|
||||
bool debug, const std::string &provider)
|
||||
: gtcrn(gtcrn),
|
||||
num_threads(num_threads),
|
||||
debug(debug),
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
#define SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
@@ -57,6 +58,12 @@ class OfflineTtsFrontend {
|
||||
// implementation is in ./piper-phonemize-lexicon.cc
|
||||
void InitEspeak(const std::string &data_dir);
|
||||
|
||||
// implementation in ./piper-phonemize-lexicon.cc
|
||||
std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
|
||||
const std::unordered_map<char32_t, int32_t> &token2id,
|
||||
int32_t max_token_len, const std::string &text,
|
||||
const std::string &voice = "");
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
|
||||
|
||||
@@ -220,8 +220,9 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<TokenIDs> token_ids =
|
||||
frontend_->ConvertTextToTokenIds(text, meta_data.voice);
|
||||
std::vector<TokenIDs> token_ids = frontend_->ConvertTextToTokenIds(
|
||||
text, config_.model.kokoro.lang.empty() ? meta_data.voice
|
||||
: config_.model.kokoro.lang);
|
||||
|
||||
if (token_ids.empty() ||
|
||||
(token_ids.size() == 1 && token_ids[0].tokens.empty())) {
|
||||
@@ -335,12 +336,14 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
|
||||
if (meta_data.version >= 2) {
|
||||
// this is a multi-lingual model; we require that you pass dict_dir
|
||||
// together with either lexicon or lang
|
||||
if (config_.model.kokoro.lexicon.empty() ||
|
||||
if ((config_.model.kokoro.lexicon.empty() &&
|
||||
config_.model.kokoro.lang.empty()) ||
|
||||
config_.model.kokoro.dict_dir.empty()) {
|
||||
SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
|
||||
SHERPA_ONNX_LOGE(
|
||||
"You are using a multi-lingual Kokoro model (e.g., Kokoro >= "
|
||||
"v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir");
|
||||
"v1.0). Please pass --kokoro-lexicon and --kokoro-dict-dir or "
|
||||
"provide --kokoro-lang and --kokoro-dict-dir");
|
||||
SHERPA_ONNX_EXIT(-1);
|
||||
}
|
||||
|
||||
@@ -362,7 +365,8 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
|
||||
if (meta_data.version >= 2) {
|
||||
// this is a multi-lingual model; we require that you pass dict_dir
|
||||
// together with either lexicon or lang
|
||||
if (config_.model.kokoro.lexicon.empty() ||
|
||||
if ((config_.model.kokoro.lexicon.empty() &&
|
||||
config_.model.kokoro.lang.empty()) ||
|
||||
config_.model.kokoro.dict_dir.empty()) {
|
||||
SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
|
||||
SHERPA_ONNX_LOGE(
|
||||
|
||||
@@ -18,6 +18,13 @@ void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) {
|
||||
"Path to voices.bin for Kokoro models");
|
||||
po->Register("kokoro-tokens", &tokens,
|
||||
"Path to tokens.txt for Kokoro models");
|
||||
po->Register("kokoro-lang", &lang,
|
||||
"Used only by kokoro >= 1.0. Example values: "
|
||||
"en (English), "
|
||||
"es (Spanish), fr (French), hi (Hindi), it (Italian), "
|
||||
"pt-br (Brazilian Portuguese). "
|
||||
"You can leave it empty, in which case you need to provide "
|
||||
"--kokoro-lexicon.");
|
||||
po->Register(
|
||||
"kokoro-lexicon", &lexicon,
|
||||
"Path to lexicon.txt for Kokoro models. Used only for Kokoro >= v1.0"
|
||||
@@ -127,7 +134,8 @@ std::string OfflineTtsKokoroModelConfig::ToString() const {
|
||||
os << "lexicon=\"" << lexicon << "\", ";
|
||||
os << "data_dir=\"" << data_dir << "\", ";
|
||||
os << "dict_dir=\"" << dict_dir << "\", ";
|
||||
os << "length_scale=" << length_scale << ")";
|
||||
os << "length_scale=" << length_scale << ", ";
|
||||
os << "lang=\"" << lang << "\")";
|
||||
|
||||
return os.str();
|
||||
}
|
||||
|
||||
@@ -27,6 +27,13 @@ struct OfflineTtsKokoroModelConfig {
|
||||
// speed = 1 / length_scale
|
||||
float length_scale = 1.0;
|
||||
|
||||
// Used only for Kokoro >= 1.0.
|
||||
//
|
||||
// If it is not empty, meta_data.voice is ignored.
|
||||
// Example values: es (Spanish), fr (French), pt (Portuguese)
|
||||
// See https://hf-mirror.com/hexgrad/Kokoro-82M/blob/main/VOICES.md
|
||||
std::string lang;
|
||||
|
||||
OfflineTtsKokoroModelConfig() = default;
|
||||
|
||||
OfflineTtsKokoroModelConfig(const std::string &model,
|
||||
@@ -34,14 +41,16 @@ struct OfflineTtsKokoroModelConfig {
|
||||
const std::string &tokens,
|
||||
const std::string &lexicon,
|
||||
const std::string &data_dir,
|
||||
const std::string &dict_dir, float length_scale)
|
||||
const std::string &dict_dir, float length_scale,
|
||||
const std::string &lang)
|
||||
: model(model),
|
||||
voices(voices),
|
||||
tokens(tokens),
|
||||
lexicon(lexicon),
|
||||
data_dir(data_dir),
|
||||
dict_dir(dict_dir),
|
||||
length_scale(length_scale) {}
|
||||
length_scale(length_scale),
|
||||
lang(lang) {}
|
||||
|
||||
void Register(ParseOptions *po);
|
||||
bool Validate() const;
|
||||
|
||||
@@ -351,7 +351,8 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds(
|
||||
if (is_matcha_) {
|
||||
return ConvertTextToTokenIdsMatcha(text, voice);
|
||||
} else if (is_kokoro_) {
|
||||
return ConvertTextToTokenIdsKokoro(text, voice);
|
||||
return ConvertTextToTokenIdsKokoro(
|
||||
token2id_, kokoro_meta_data_.max_token_len, text, voice);
|
||||
} else {
|
||||
return ConvertTextToTokenIdsVits(text, voice);
|
||||
}
|
||||
@@ -382,8 +383,10 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsMatcha(
|
||||
return ans;
|
||||
}
|
||||
|
||||
std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsKokoro(
|
||||
const std::string &text, const std::string &voice /*= ""*/) const {
|
||||
std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
|
||||
const std::unordered_map<char32_t, int32_t> &token2id,
|
||||
int32_t max_token_len, const std::string &text,
|
||||
const std::string &voice /*= ""*/) {
|
||||
piper::eSpeakPhonemeConfig config;
|
||||
|
||||
// ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices
|
||||
@@ -397,8 +400,7 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsKokoro(
|
||||
std::vector<TokenIDs> ans;
|
||||
|
||||
for (const auto &p : phonemes) {
|
||||
auto phoneme_ids =
|
||||
PiperPhonemesToIdsKokoro(token2id_, p, kokoro_meta_data_.max_token_len);
|
||||
auto phoneme_ids = PiperPhonemesToIdsKokoro(token2id, p, max_token_len);
|
||||
|
||||
for (auto &ids : phoneme_ids) {
|
||||
ans.emplace_back(std::move(ids));
|
||||
|
||||
@@ -52,9 +52,6 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend {
|
||||
std::vector<TokenIDs> ConvertTextToTokenIdsMatcha(
|
||||
const std::string &text, const std::string &voice = "") const;
|
||||
|
||||
std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
|
||||
const std::string &text, const std::string &voice = "") const;
|
||||
|
||||
private:
|
||||
// map unicode codepoint to an integer ID
|
||||
std::unordered_map<char32_t, int32_t> token2id_;
|
||||
|
||||
@@ -6,6 +6,7 @@ public class OfflineTtsKokoroModelConfig {
|
||||
private final String voices;
|
||||
private final String tokens;
|
||||
private final String lexicon;
|
||||
private final String lang;
|
||||
private final String dataDir;
|
||||
private final String dictDir;
|
||||
private final float lengthScale;
|
||||
@@ -15,6 +16,7 @@ public class OfflineTtsKokoroModelConfig {
|
||||
this.voices = builder.voices;
|
||||
this.tokens = builder.tokens;
|
||||
this.lexicon = builder.lexicon;
|
||||
this.lang = builder.lang;
|
||||
this.dataDir = builder.dataDir;
|
||||
this.dictDir = builder.dictDir;
|
||||
this.lengthScale = builder.lengthScale;
|
||||
@@ -50,6 +52,7 @@ public class OfflineTtsKokoroModelConfig {
|
||||
private String voices = "";
|
||||
private String tokens = "";
|
||||
private String lexicon = "";
|
||||
private String lang = "";
|
||||
private String dataDir = "";
|
||||
private String dictDir = "";
|
||||
private float lengthScale = 1.0f;
|
||||
@@ -78,6 +81,11 @@ public class OfflineTtsKokoroModelConfig {
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setLang(String lang) {
|
||||
this.lang = lang;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setDataDir(String dataDir) {
|
||||
this.dataDir = dataDir;
|
||||
return this;
|
||||
|
||||
@@ -145,6 +145,12 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
|
||||
ans.model.kokoro.lexicon = p;
|
||||
env->ReleaseStringUTFChars(s, p);
|
||||
|
||||
fid = env->GetFieldID(kokoro_cls, "lang", "Ljava/lang/String;");
|
||||
s = (jstring)env->GetObjectField(kokoro, fid);
|
||||
p = env->GetStringUTFChars(s, nullptr);
|
||||
ans.model.kokoro.lang = p;
|
||||
env->ReleaseStringUTFChars(s, p);
|
||||
|
||||
fid = env->GetFieldID(kokoro_cls, "dataDir", "Ljava/lang/String;");
|
||||
s = (jstring)env->GetObjectField(kokoro, fid);
|
||||
p = env->GetStringUTFChars(s, nullptr);
|
||||
|
||||
@@ -31,6 +31,7 @@ data class OfflineTtsKokoroModelConfig(
|
||||
var tokens: String = "",
|
||||
var dataDir: String = "",
|
||||
var lexicon: String = "",
|
||||
var lang: String = "",
|
||||
var dictDir: String = "",
|
||||
var lengthScale: Float = 1.0f,
|
||||
)
|
||||
|
||||
@@ -84,6 +84,7 @@ type
|
||||
LengthScale: Single;
|
||||
DictDir: AnsiString;
|
||||
Lexicon: AnsiString;
|
||||
Lang: AnsiString;
|
||||
|
||||
function ToString: AnsiString;
|
||||
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig);
|
||||
@@ -841,6 +842,7 @@ type
|
||||
LengthScale: cfloat;
|
||||
DictDir: PAnsiChar;
|
||||
Lexicon: PAnsiChar;
|
||||
Lang: PAnsiChar;
|
||||
end;
|
||||
|
||||
SherpaOnnxOfflineTtsModelConfig = record
|
||||
@@ -2096,10 +2098,11 @@ begin
|
||||
'DataDir := %s, ' +
|
||||
'LengthScale := %.2f, ' +
|
||||
'DictDir := %s, ' +
|
||||
'Lexicon := %s' +
|
||||
'Lexicon := %s, ' +
|
||||
'Lang := %s' +
|
||||
')',
|
||||
[Self.Model, Self.Voices, Self.Tokens, Self.DataDir, Self.LengthScale,
|
||||
Self.DictDir, Self.Lexicon]);
|
||||
Self.DictDir, Self.Lexicon, Self.Lang]);
|
||||
end;
|
||||
|
||||
class operator TSherpaOnnxOfflineTtsKokoroModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig);
|
||||
@@ -2180,6 +2183,7 @@ begin
|
||||
C.Model.Kokoro.LengthScale := Config.Model.Kokoro.LengthScale;
|
||||
C.Model.Kokoro.DictDir := PAnsiChar(Config.Model.Kokoro.DictDir);
|
||||
C.Model.Kokoro.Lexicon := PAnsiChar(Config.Model.Kokoro.Lexicon);
|
||||
C.Model.Kokoro.Lang := PAnsiChar(Config.Model.Kokoro.Lang);
|
||||
|
||||
C.Model.NumThreads := Config.Model.NumThreads;
|
||||
C.Model.Provider := PAnsiChar(Config.Model.Provider);
|
||||
|
||||
@@ -17,10 +17,12 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) {
|
||||
.def(py::init<>())
|
||||
.def(py::init<const std::string &, const std::string &,
|
||||
const std::string &, const std::string &,
|
||||
const std::string &, const std::string &, float>(),
|
||||
const std::string &, const std::string &, float,
|
||||
const std::string &>(),
|
||||
py::arg("model"), py::arg("voices"), py::arg("tokens"),
|
||||
py::arg("lexicon") = "", py::arg("data_dir"),
|
||||
py::arg("dict_dir") = "", py::arg("length_scale") = 1.0)
|
||||
py::arg("dict_dir") = "", py::arg("length_scale") = 1.0,
|
||||
py::arg("lang") = "")
|
||||
.def_readwrite("model", &PyClass::model)
|
||||
.def_readwrite("voices", &PyClass::voices)
|
||||
.def_readwrite("tokens", &PyClass::tokens)
|
||||
@@ -28,6 +30,7 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) {
|
||||
.def_readwrite("data_dir", &PyClass::data_dir)
|
||||
.def_readwrite("dict_dir", &PyClass::dict_dir)
|
||||
.def_readwrite("length_scale", &PyClass::length_scale)
|
||||
.def_readwrite("lang", &PyClass::lang)
|
||||
.def("__str__", &PyClass::ToString)
|
||||
.def("validate", &PyClass::Validate);
|
||||
}
|
||||
|
||||
@@ -806,7 +806,8 @@ func sherpaOnnxOfflineTtsKokoroModelConfig(
|
||||
dataDir: String = "",
|
||||
lengthScale: Float = 1.0,
|
||||
dictDir: String = "",
|
||||
lexicon: String = ""
|
||||
lexicon: String = "",
|
||||
lang: String = ""
|
||||
) -> SherpaOnnxOfflineTtsKokoroModelConfig {
|
||||
return SherpaOnnxOfflineTtsKokoroModelConfig(
|
||||
model: toCPointer(model),
|
||||
@@ -815,7 +816,8 @@ func sherpaOnnxOfflineTtsKokoroModelConfig(
|
||||
data_dir: toCPointer(dataDir),
|
||||
length_scale: lengthScale,
|
||||
dict_dir: toCPointer(dictDir),
|
||||
lexicon: toCPointer(lexicon)
|
||||
lexicon: toCPointer(lexicon),
|
||||
lang: toCPointer(lang)
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -143,13 +143,14 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
|
||||
const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1;
|
||||
const dictDirLen = Module.lengthBytesUTF8(config.dictDir || '') + 1;
|
||||
const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1;
|
||||
const langLen = Module.lengthBytesUTF8(config.lang || '') + 1;
|
||||
|
||||
const n =
|
||||
modelLen + voicesLen + tokensLen + dataDirLen + dictDirLen + lexiconLen;
|
||||
const n = modelLen + voicesLen + tokensLen + dataDirLen + dictDirLen +
|
||||
lexiconLen + langLen;
|
||||
|
||||
const buffer = Module._malloc(n);
|
||||
|
||||
const len = 7 * 4;
|
||||
const len = 8 * 4;
|
||||
const ptr = Module._malloc(len);
|
||||
|
||||
let offset = 0;
|
||||
@@ -171,6 +172,9 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
|
||||
Module.stringToUTF8(config.lexicon || '', buffer + offset, lexiconLen);
|
||||
offset += lexiconLen;
|
||||
|
||||
Module.stringToUTF8(config.lang || '', buffer + offset, langLen);
|
||||
offset += langLen;
|
||||
|
||||
offset = 0;
|
||||
Module.setValue(ptr, buffer + offset, 'i8*');
|
||||
offset += modelLen;
|
||||
@@ -192,6 +196,9 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
|
||||
Module.setValue(ptr + 24, buffer + offset, 'i8*');
|
||||
offset += lexiconLen;
|
||||
|
||||
Module.setValue(ptr + 28, buffer + offset, 'i8*');
|
||||
offset += langLen;
|
||||
|
||||
return {
|
||||
buffer: buffer, ptr: ptr, len: len,
|
||||
}
|
||||
@@ -233,6 +240,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
|
||||
dataDir: '',
|
||||
dictDir: '',
|
||||
lexicon: '',
|
||||
lang: '',
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ extern "C" {
|
||||
|
||||
static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, "");
|
||||
static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, "");
|
||||
static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 7 * 4, "");
|
||||
static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 8 * 4, "");
|
||||
static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
|
||||
sizeof(SherpaOnnxOfflineTtsVitsModelConfig) +
|
||||
sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) +
|
||||
|
||||
Reference in New Issue
Block a user