Add C++ and Python API for Kokoro 1.0 multilingual TTS model (#1795)
This commit is contained in:
24
.github/scripts/test-python.sh
vendored
24
.github/scripts/test-python.sh
vendored
@@ -267,6 +267,27 @@ log "Offline TTS test"
|
|||||||
# test waves are saved in ./tts
|
# test waves are saved in ./tts
|
||||||
mkdir ./tts
|
mkdir ./tts
|
||||||
|
|
||||||
|
log "kokoro-multi-lang-v1_0 test"
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
|
||||||
|
tar xf kokoro-multi-lang-v1_0.tar.bz2
|
||||||
|
rm kokoro-multi-lang-v1_0.tar.bz2
|
||||||
|
|
||||||
|
python3 ./python-api-examples/offline-tts.py \
|
||||||
|
--debug=1 \
|
||||||
|
--kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \
|
||||||
|
--kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \
|
||||||
|
--kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \
|
||||||
|
--kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \
|
||||||
|
--kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \
|
||||||
|
--kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \
|
||||||
|
--num-threads=2 \
|
||||||
|
--sid=18 \
|
||||||
|
--output-filename="./tts/kokoro-18-zh-en.wav" \
|
||||||
|
"中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?"
|
||||||
|
|
||||||
|
rm -rf kokoro-multi-lang-v1_0
|
||||||
|
|
||||||
log "kokoro-en-v0_19 test"
|
log "kokoro-en-v0_19 test"
|
||||||
|
|
||||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
|
||||||
@@ -580,13 +601,10 @@ if [[ x$OS != x'windows-latest' ]]; then
|
|||||||
repo=sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01
|
repo=sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01
|
||||||
log "Start testing ${repo}"
|
log "Start testing ${repo}"
|
||||||
|
|
||||||
pushd $dir
|
|
||||||
curl -LS -O https://github.com/pkufool/keyword-spotting-models/releases/download/v0.1/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz
|
curl -LS -O https://github.com/pkufool/keyword-spotting-models/releases/download/v0.1/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz
|
||||||
tar xf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz
|
tar xf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz
|
||||||
rm sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz
|
rm sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz
|
||||||
popd
|
|
||||||
|
|
||||||
repo=$dir/$repo
|
|
||||||
ls -lh $repo
|
ls -lh $repo
|
||||||
|
|
||||||
python3 ./python-api-examples/keyword-spotter.py
|
python3 ./python-api-examples/keyword-spotter.py
|
||||||
|
|||||||
16
.github/workflows/export-kokoro.yaml
vendored
16
.github/workflows/export-kokoro.yaml
vendored
@@ -4,7 +4,6 @@ on:
|
|||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- export-kokoro
|
- export-kokoro
|
||||||
- kokoro-1.0-2
|
|
||||||
|
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
@@ -76,6 +75,14 @@ jobs:
|
|||||||
if: matrix.version == '1.0'
|
if: matrix.version == '1.0'
|
||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
|
curl -SL -O https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2
|
||||||
|
tar xvf dict.tar.bz2
|
||||||
|
rm dict.tar.bz2
|
||||||
|
|
||||||
|
curl -SL -o date-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/date.fst
|
||||||
|
curl -SL -o number-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/number.fst
|
||||||
|
curl -SL -o phone-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/phone.fst
|
||||||
|
|
||||||
src=scripts/kokoro/v1.0
|
src=scripts/kokoro/v1.0
|
||||||
|
|
||||||
d=kokoro-multi-lang-v1_0
|
d=kokoro-multi-lang-v1_0
|
||||||
@@ -87,7 +94,12 @@ jobs:
|
|||||||
cp -v $src/tokens.txt $d/
|
cp -v $src/tokens.txt $d/
|
||||||
cp -v $src/lexicon*.txt $d/
|
cp -v $src/lexicon*.txt $d/
|
||||||
cp -v $src/README.md $d/README.md
|
cp -v $src/README.md $d/README.md
|
||||||
|
cp -av dict $d/
|
||||||
|
cp -v ./*.fst $d/
|
||||||
ls -lh $d/
|
ls -lh $d/
|
||||||
|
echo "---"
|
||||||
|
ls -lh $d/dict
|
||||||
|
|
||||||
tar cjfv $d.tar.bz2 $d
|
tar cjfv $d.tar.bz2 $d
|
||||||
rm -rf $d
|
rm -rf $d
|
||||||
|
|
||||||
@@ -180,6 +192,8 @@ jobs:
|
|||||||
cp -v ../scripts/kokoro/v1.0/lexicon*.txt .
|
cp -v ../scripts/kokoro/v1.0/lexicon*.txt .
|
||||||
cp -v ../scripts/kokoro/v1.0/README.md ./README.md
|
cp -v ../scripts/kokoro/v1.0/README.md ./README.md
|
||||||
cp -v ../LICENSE ./
|
cp -v ../LICENSE ./
|
||||||
|
cp -av ../dict ./
|
||||||
|
cp -v ../*.fst $d/
|
||||||
|
|
||||||
git lfs track "*.onnx"
|
git lfs track "*.onnx"
|
||||||
git add .
|
git add .
|
||||||
|
|||||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -132,3 +132,4 @@ kokoro-en-v0_19
|
|||||||
lexicon.txt
|
lexicon.txt
|
||||||
us_gold.json
|
us_gold.json
|
||||||
us_silver.json
|
us_silver.json
|
||||||
|
kokoro-multi-lang-v1_0
|
||||||
|
|||||||
@@ -25,27 +25,28 @@ int32_t main() {
|
|||||||
|
|
||||||
memset(&config, 0, sizeof(config));
|
memset(&config, 0, sizeof(config));
|
||||||
config.model_config.transducer.encoder =
|
config.model_config.transducer.encoder =
|
||||||
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
|
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
|
||||||
"encoder-epoch-12-avg-2-chunk-16-left-64.onnx";
|
"encoder-epoch-12-avg-2-chunk-16-left-64.onnx";
|
||||||
|
|
||||||
config.model_config.transducer.decoder =
|
config.model_config.transducer.decoder =
|
||||||
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
|
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
|
||||||
"decoder-epoch-12-avg-2-chunk-16-left-64.onnx";
|
"decoder-epoch-12-avg-2-chunk-16-left-64.onnx";
|
||||||
|
|
||||||
config.model_config.transducer.joiner =
|
config.model_config.transducer.joiner =
|
||||||
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
|
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
|
||||||
"joiner-epoch-12-avg-2-chunk-16-left-64.onnx";
|
"joiner-epoch-12-avg-2-chunk-16-left-64.onnx";
|
||||||
|
|
||||||
config.model_config.tokens =
|
config.model_config.tokens =
|
||||||
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt";
|
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
|
||||||
|
"tokens.txt";
|
||||||
|
|
||||||
config.model_config.provider = "cpu";
|
config.model_config.provider = "cpu";
|
||||||
config.model_config.num_threads = 1;
|
config.model_config.num_threads = 1;
|
||||||
config.model_config.debug = 1;
|
config.model_config.debug = 1;
|
||||||
|
|
||||||
config.keywords_file =
|
config.keywords_file =
|
||||||
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/"
|
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
|
||||||
"test_keywords.txt";
|
"test_wavs/test_keywords.txt";
|
||||||
|
|
||||||
const SherpaOnnxKeywordSpotter *kws = SherpaOnnxCreateKeywordSpotter(&config);
|
const SherpaOnnxKeywordSpotter *kws = SherpaOnnxCreateKeywordSpotter(&config);
|
||||||
if (!kws) {
|
if (!kws) {
|
||||||
|
|||||||
@@ -24,27 +24,28 @@ int32_t main() {
|
|||||||
|
|
||||||
KeywordSpotterConfig config;
|
KeywordSpotterConfig config;
|
||||||
config.model_config.transducer.encoder =
|
config.model_config.transducer.encoder =
|
||||||
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
|
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
|
||||||
"encoder-epoch-12-avg-2-chunk-16-left-64.onnx";
|
"encoder-epoch-12-avg-2-chunk-16-left-64.onnx";
|
||||||
|
|
||||||
config.model_config.transducer.decoder =
|
config.model_config.transducer.decoder =
|
||||||
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
|
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
|
||||||
"decoder-epoch-12-avg-2-chunk-16-left-64.onnx";
|
"decoder-epoch-12-avg-2-chunk-16-left-64.onnx";
|
||||||
|
|
||||||
config.model_config.transducer.joiner =
|
config.model_config.transducer.joiner =
|
||||||
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
|
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
|
||||||
"joiner-epoch-12-avg-2-chunk-16-left-64.onnx";
|
"joiner-epoch-12-avg-2-chunk-16-left-64.onnx";
|
||||||
|
|
||||||
config.model_config.tokens =
|
config.model_config.tokens =
|
||||||
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt";
|
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
|
||||||
|
"tokens.txt";
|
||||||
|
|
||||||
config.model_config.provider = "cpu";
|
config.model_config.provider = "cpu";
|
||||||
config.model_config.num_threads = 1;
|
config.model_config.num_threads = 1;
|
||||||
config.model_config.debug = 1;
|
config.model_config.debug = 1;
|
||||||
|
|
||||||
config.keywords_file =
|
config.keywords_file =
|
||||||
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/"
|
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
|
||||||
"test_keywords.txt";
|
"test_wavs/test_keywords.txt";
|
||||||
|
|
||||||
KeywordSpotter kws = KeywordSpotter::Create(config);
|
KeywordSpotter kws = KeywordSpotter::Create(config);
|
||||||
if (!kws.Get()) {
|
if (!kws.Get()) {
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ while the model is still generating.
|
|||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
|
|
||||||
Example (1/6)
|
Example (1/7)
|
||||||
|
|
||||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
|
||||||
tar xf vits-piper-en_US-amy-low.tar.bz2
|
tar xf vits-piper-en_US-amy-low.tar.bz2
|
||||||
@@ -23,7 +23,7 @@ python3 ./python-api-examples/offline-tts-play.py \
|
|||||||
--output-filename=./generated.wav \
|
--output-filename=./generated.wav \
|
||||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
||||||
|
|
||||||
Example (2/6)
|
Example (2/7)
|
||||||
|
|
||||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
|
||||||
tar xvf vits-zh-aishell3.tar.bz2
|
tar xvf vits-zh-aishell3.tar.bz2
|
||||||
@@ -37,7 +37,7 @@ python3 ./python-api-examples/offline-tts-play.py \
|
|||||||
--output-filename=./liubei-21.wav \
|
--output-filename=./liubei-21.wav \
|
||||||
"勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
|
"勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
|
||||||
|
|
||||||
Example (3/6)
|
Example (3/7)
|
||||||
|
|
||||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
|
||||||
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
|
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
|
||||||
@@ -53,7 +53,7 @@ python3 ./python-api-examples/offline-tts-play.py \
|
|||||||
--output-filename=./test-2.wav \
|
--output-filename=./test-2.wav \
|
||||||
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
|
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
|
||||||
|
|
||||||
Example (4/6)
|
Example (4/7)
|
||||||
|
|
||||||
curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
|
curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
|
||||||
tar xvf matcha-icefall-zh-baker.tar.bz2
|
tar xvf matcha-icefall-zh-baker.tar.bz2
|
||||||
@@ -71,7 +71,7 @@ python3 ./python-api-examples/offline-tts-play.py \
|
|||||||
--output-filename=./test-matcha.wav \
|
--output-filename=./test-matcha.wav \
|
||||||
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
|
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
|
||||||
|
|
||||||
Example (5/6)
|
Example (5/7)
|
||||||
|
|
||||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
|
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
@@ -88,7 +88,9 @@ python3 ./python-api-examples/offline-tts-play.py \
|
|||||||
--num-threads=2 \
|
--num-threads=2 \
|
||||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
||||||
|
|
||||||
Example (6/6)
|
Example (6/7)
|
||||||
|
|
||||||
|
(This version of kokoro supports only English)
|
||||||
|
|
||||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
|
||||||
tar xf kokoro-en-v0_19.tar.bz2
|
tar xf kokoro-en-v0_19.tar.bz2
|
||||||
@@ -105,6 +107,27 @@ python3 ./python-api-examples/offline-tts.py \
|
|||||||
--output-filename="./kokoro-10.wav" \
|
--output-filename="./kokoro-10.wav" \
|
||||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."
|
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."
|
||||||
|
|
||||||
|
Example (7/7)
|
||||||
|
|
||||||
|
(This version of kokoro supports English, Chinese, etc.)
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
|
||||||
|
tar xf kokoro-multi-lang-v1_0.tar.bz2
|
||||||
|
rm kokoro-multi-lang-v1_0.tar.bz2
|
||||||
|
|
||||||
|
python3 ./python-api-examples/offline-tts-play.py \
|
||||||
|
--debug=1 \
|
||||||
|
--kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \
|
||||||
|
--kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \
|
||||||
|
--kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \
|
||||||
|
--kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \
|
||||||
|
--kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \
|
||||||
|
--kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \
|
||||||
|
--num-threads=2 \
|
||||||
|
--sid=18 \
|
||||||
|
--output-filename="./kokoro-18-zh-en.wav" \
|
||||||
|
"中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?"
|
||||||
|
|
||||||
You can find more models at
|
You can find more models at
|
||||||
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
|
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
|
||||||
|
|
||||||
@@ -247,6 +270,20 @@ def add_kokoro_args(parser):
|
|||||||
help="Path to the dict directory of espeak-ng.",
|
help="Path to the dict directory of espeak-ng.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--kokoro-dict-dir",
|
||||||
|
type=str,
|
||||||
|
default="",
|
||||||
|
help="Path to the dict directory for models using jieba. Needed only by multilingual kokoro",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--kokoro-lexicon",
|
||||||
|
type=str,
|
||||||
|
default="",
|
||||||
|
help="Path to lexicon.txt for kokoro. Needed only by multilingual kokoro",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_args():
|
def get_args():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
@@ -459,6 +496,8 @@ def main():
|
|||||||
voices=args.kokoro_voices,
|
voices=args.kokoro_voices,
|
||||||
tokens=args.kokoro_tokens,
|
tokens=args.kokoro_tokens,
|
||||||
data_dir=args.kokoro_data_dir,
|
data_dir=args.kokoro_data_dir,
|
||||||
|
dict_dir=args.kokoro_dict_dir,
|
||||||
|
lexicon=args.kokoro_lexicon,
|
||||||
),
|
),
|
||||||
provider=args.provider,
|
provider=args.provider,
|
||||||
debug=args.debug,
|
debug=args.debug,
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ generated audio.
|
|||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
|
|
||||||
Example (1/6)
|
Example (1/7)
|
||||||
|
|
||||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
|
||||||
tar xf vits-piper-en_US-amy-low.tar.bz2
|
tar xf vits-piper-en_US-amy-low.tar.bz2
|
||||||
@@ -24,7 +24,7 @@ python3 ./python-api-examples/offline-tts.py \
|
|||||||
--output-filename=./generated.wav \
|
--output-filename=./generated.wav \
|
||||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
||||||
|
|
||||||
Example (2/6)
|
Example (2/7)
|
||||||
|
|
||||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
||||||
tar xvf vits-icefall-zh-aishell3.tar.bz2
|
tar xvf vits-icefall-zh-aishell3.tar.bz2
|
||||||
@@ -38,7 +38,7 @@ python3 ./python-api-examples/offline-tts.py \
|
|||||||
--output-filename=./liubei-21.wav \
|
--output-filename=./liubei-21.wav \
|
||||||
"勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
|
"勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
|
||||||
|
|
||||||
Example (3/6)
|
Example (3/7)
|
||||||
|
|
||||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
|
||||||
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
|
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
|
||||||
@@ -54,7 +54,7 @@ python3 ./python-api-examples/offline-tts.py \
|
|||||||
--output-filename=./test-2.wav \
|
--output-filename=./test-2.wav \
|
||||||
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
|
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
|
||||||
|
|
||||||
Example (4/6)
|
Example (4/7)
|
||||||
|
|
||||||
curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
|
curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
|
||||||
tar xvf matcha-icefall-zh-baker.tar.bz2
|
tar xvf matcha-icefall-zh-baker.tar.bz2
|
||||||
@@ -72,7 +72,7 @@ python3 ./python-api-examples/offline-tts.py \
|
|||||||
--output-filename=./test-matcha.wav \
|
--output-filename=./test-matcha.wav \
|
||||||
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
|
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
|
||||||
|
|
||||||
Example (5/6)
|
Example (5/7)
|
||||||
|
|
||||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
|
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
@@ -89,7 +89,9 @@ python3 ./python-api-examples/offline-tts.py \
|
|||||||
--num-threads=2 \
|
--num-threads=2 \
|
||||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
||||||
|
|
||||||
Example (6/6)
|
Example (6/7)
|
||||||
|
|
||||||
|
(This version of kokoro supports only English)
|
||||||
|
|
||||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
|
||||||
tar xf kokoro-en-v0_19.tar.bz2
|
tar xf kokoro-en-v0_19.tar.bz2
|
||||||
@@ -106,6 +108,27 @@ python3 ./python-api-examples/offline-tts.py \
|
|||||||
--output-filename="./kokoro-10.wav" \
|
--output-filename="./kokoro-10.wav" \
|
||||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."
|
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."
|
||||||
|
|
||||||
|
Example (7/7)
|
||||||
|
|
||||||
|
(This version of kokoro supports English, Chinese, etc.)
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
|
||||||
|
tar xf kokoro-multi-lang-v1_0.tar.bz2
|
||||||
|
rm kokoro-multi-lang-v1_0.tar.bz2
|
||||||
|
|
||||||
|
python3 ./python-api-examples/offline-tts.py \
|
||||||
|
--debug=1 \
|
||||||
|
--kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \
|
||||||
|
--kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \
|
||||||
|
--kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \
|
||||||
|
--kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \
|
||||||
|
--kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \
|
||||||
|
--kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \
|
||||||
|
--num-threads=2 \
|
||||||
|
--sid=18 \
|
||||||
|
--output-filename="./kokoro-18-zh-en.wav" \
|
||||||
|
"中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?"
|
||||||
|
|
||||||
You can find more models at
|
You can find more models at
|
||||||
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
|
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
|
||||||
|
|
||||||
@@ -234,6 +257,20 @@ def add_kokoro_args(parser):
|
|||||||
help="Path to the dict directory of espeak-ng.",
|
help="Path to the dict directory of espeak-ng.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--kokoro-dict-dir",
|
||||||
|
type=str,
|
||||||
|
default="",
|
||||||
|
help="Path to the dict directory for models using jieba. Needed only by multilingual kokoro",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--kokoro-lexicon",
|
||||||
|
type=str,
|
||||||
|
default="",
|
||||||
|
help="Path to lexicon.txt for kokoro. Needed only by multilingual kokoro",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_args():
|
def get_args():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
@@ -342,6 +379,8 @@ def main():
|
|||||||
voices=args.kokoro_voices,
|
voices=args.kokoro_voices,
|
||||||
tokens=args.kokoro_tokens,
|
tokens=args.kokoro_tokens,
|
||||||
data_dir=args.kokoro_data_dir,
|
data_dir=args.kokoro_data_dir,
|
||||||
|
dict_dir=args.kokoro_dict_dir,
|
||||||
|
lexicon=args.kokoro_lexicon,
|
||||||
),
|
),
|
||||||
provider=args.provider,
|
provider=args.provider,
|
||||||
debug=args.debug,
|
debug=args.debug,
|
||||||
|
|||||||
@@ -71,7 +71,7 @@ def main():
|
|||||||
with open("voices.bin", "wb") as f:
|
with open("voices.bin", "wb") as f:
|
||||||
for _, speaker in id2speaker.items():
|
for _, speaker in id2speaker.items():
|
||||||
m = torch.load(
|
m = torch.load(
|
||||||
f"{speaker}.pt",
|
f"voices/{speaker}.pt",
|
||||||
weights_only=True,
|
weights_only=True,
|
||||||
map_location="cpu",
|
map_location="cpu",
|
||||||
).numpy()
|
).numpy()
|
||||||
|
|||||||
@@ -153,6 +153,7 @@ if(SHERPA_ONNX_ENABLE_TTS)
|
|||||||
list(APPEND sources
|
list(APPEND sources
|
||||||
hifigan-vocoder.cc
|
hifigan-vocoder.cc
|
||||||
jieba-lexicon.cc
|
jieba-lexicon.cc
|
||||||
|
kokoro-multi-lang-lexicon.cc
|
||||||
lexicon.cc
|
lexicon.cc
|
||||||
melo-tts-lexicon.cc
|
melo-tts-lexicon.cc
|
||||||
offline-tts-character-frontend.cc
|
offline-tts-character-frontend.cc
|
||||||
|
|||||||
522
sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
Normal file
522
sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
Normal file
@@ -0,0 +1,522 @@
|
|||||||
|
// sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
|
||||||
|
//
|
||||||
|
// Copyright (c) 2025 Xiaomi Corporation
|
||||||
|
|
||||||
|
#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h"
|
||||||
|
|
||||||
|
#include <codecvt>
|
||||||
|
#include <fstream>
|
||||||
|
#include <locale>
|
||||||
|
#include <regex> // NOLINT
|
||||||
|
#include <sstream>
|
||||||
|
#include <strstream>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
|
#if __ANDROID_API__ >= 9
|
||||||
|
#include "android/asset_manager.h"
|
||||||
|
#include "android/asset_manager_jni.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if __OHOS__
|
||||||
|
#include "rawfile/raw_file_manager.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "cppjieba/Jieba.hpp"
|
||||||
|
#include "espeak-ng/speak_lib.h"
|
||||||
|
#include "phoneme_ids.hpp"
|
||||||
|
#include "phonemize.hpp"
|
||||||
|
#include "sherpa-onnx/csrc/file-utils.h"
|
||||||
|
#include "sherpa-onnx/csrc/onnx-utils.h"
|
||||||
|
#include "sherpa-onnx/csrc/symbol-table.h"
|
||||||
|
#include "sherpa-onnx/csrc/text-utils.h"
|
||||||
|
|
||||||
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
|
void CallPhonemizeEspeak(const std::string &text,
|
||||||
|
piper::eSpeakPhonemeConfig &config, // NOLINT
|
||||||
|
std::vector<std::vector<piper::Phoneme>> *phonemes);
|
||||||
|
|
||||||
|
// Decodes the UTF-8 byte string `s` into a std::wstring.
//
// The conversion is done with std::codecvt_utf8_utf16<wchar_t>, i.e. the
// returned string holds UTF-16 code units stored in wchar_t elements.
//
// Background:
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
static std::wstring ToWideString(const std::string &s) {
  using Utf8WideConverter =
      std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>>;

  Utf8WideConverter utf8_to_wide;
  std::wstring wide = utf8_to_wide.from_bytes(s);
  return wide;
}
|
||||||
|
|
||||||
|
// Encodes the wide string `s` as a UTF-8 byte string.
//
// This is the inverse of the UTF-8 -> wide conversion and uses the same
// std::codecvt_utf8_utf16<wchar_t> facet.
//
// Background:
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
static std::string ToString(const std::wstring &s) {
  using Utf8WideConverter =
      std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>>;

  Utf8WideConverter wide_to_utf8;
  std::string utf8 = wide_to_utf8.to_bytes(s);
  return utf8;
}
|
||||||
|
|
||||||
|
class KokoroMultiLangLexicon::Impl {
|
||||||
|
public:
|
||||||
|
Impl(const std::string &tokens, const std::string &lexicon,
|
||||||
|
const std::string &dict_dir, const std::string &data_dir,
|
||||||
|
const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
|
||||||
|
: meta_data_(meta_data), debug_(debug) {
|
||||||
|
InitTokens(tokens);
|
||||||
|
|
||||||
|
InitLexicon(lexicon);
|
||||||
|
|
||||||
|
InitJieba(dict_dir);
|
||||||
|
|
||||||
|
InitEspeak(data_dir); // See ./piper-phonemize-lexicon.cc
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Manager>
|
||||||
|
Impl(Manager *mgr, const std::string &tokens, const std::string &lexicon,
|
||||||
|
const std::string &dict_dir, const std::string &data_dir,
|
||||||
|
const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
|
||||||
|
: meta_data_(meta_data), debug_(debug) {
|
||||||
|
InitTokens(mgr, tokens);
|
||||||
|
|
||||||
|
InitLexicon(mgr, lexicon);
|
||||||
|
|
||||||
|
// we assume you have copied dict_dir and data_dir from assets to some path
|
||||||
|
InitJieba(dict_dir);
|
||||||
|
|
||||||
|
InitEspeak(data_dir); // See ./piper-phonemize-lexicon.cc
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text) const {
|
||||||
|
std::string text = ToLowerCase(_text);
|
||||||
|
if (debug_) {
|
||||||
|
SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::pair<std::string, std::string>> replace_str_pairs = {
|
||||||
|
{",", ","}, {":", ","}, {"、", ","}, {";", ";"}, {":", ":"},
|
||||||
|
{"。", "."}, {"?", "?"}, {"!", "!"}, {"\\s+", " "},
|
||||||
|
};
|
||||||
|
for (const auto &p : replace_str_pairs) {
|
||||||
|
std::regex re(p.first);
|
||||||
|
text = std::regex_replace(text, re, p.second);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (debug_) {
|
||||||
|
SHERPA_ONNX_LOGE("After replacing punctuations and merging spaces:\n%s",
|
||||||
|
text.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// https://en.cppreference.com/w/cpp/regex
|
||||||
|
// https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
|
||||||
|
std::string expr =
|
||||||
|
"([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([\\u0000-\\u007f]+)";
|
||||||
|
|
||||||
|
auto ws = ToWideString(text);
|
||||||
|
std::wstring wexpr = ToWideString(expr);
|
||||||
|
std::wregex we(wexpr);
|
||||||
|
|
||||||
|
auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we);
|
||||||
|
auto end = std::wsregex_iterator();
|
||||||
|
|
||||||
|
std::vector<TokenIDs> ans;
|
||||||
|
|
||||||
|
for (std::wsregex_iterator i = begin; i != end; ++i) {
|
||||||
|
std::wsmatch match = *i;
|
||||||
|
std::wstring match_str = match.str();
|
||||||
|
auto ms = ToString(match_str);
|
||||||
|
uint8_t c = reinterpret_cast<const uint8_t *>(ms.data())[0];
|
||||||
|
|
||||||
|
std::vector<std::vector<int32_t>> ids_vec;
|
||||||
|
|
||||||
|
if (c < 0x80) {
|
||||||
|
if (debug_) {
|
||||||
|
SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
|
||||||
|
}
|
||||||
|
ids_vec = ConvertEnglishToTokenIDs(ms);
|
||||||
|
} else {
|
||||||
|
if (debug_) {
|
||||||
|
SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
|
||||||
|
}
|
||||||
|
ids_vec = ConvertChineseToTokenIDs(ms);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const auto &ids : ids_vec) {
|
||||||
|
if (ids.size() > 4) {
|
||||||
|
ans.emplace_back(ids);
|
||||||
|
} else {
|
||||||
|
if (ans.empty()) {
|
||||||
|
ans.emplace_back(ids);
|
||||||
|
} else {
|
||||||
|
ans.back().tokens.back() = ids[1];
|
||||||
|
ans.back().tokens.insert(ans.back().tokens.end(), ids.begin() + 2,
|
||||||
|
ids.end());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (debug_) {
|
||||||
|
for (const auto &v : ans) {
|
||||||
|
std::ostringstream os;
|
||||||
|
os << "\n";
|
||||||
|
std::string sep;
|
||||||
|
for (auto i : v.tokens) {
|
||||||
|
os << sep << i;
|
||||||
|
sep = " ";
|
||||||
|
}
|
||||||
|
os << "\n";
|
||||||
|
SHERPA_ONNX_LOGE("%s", os.str().c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ans;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Returns true if `text` is exactly one of the punctuation marks that
// ConvertEnglishToTokenIDs() treats as a stand-alone token, and false
// for any other string (including the empty string).
bool IsPunctuation(const std::string &text) const {
  static const char *const kPunctuations[] = {
      ";", ":", ",", ".", "!", "?", "—", "…", "\"", "(", ")", "“", "”",
  };

  for (const char *p : kPunctuations) {
    if (text == p) {
      return true;
    }
  }

  return false;
}
|
||||||
|
|
||||||
|
std::vector<int32_t> ConvertWordToIds(const std::string &w) const {
|
||||||
|
std::vector<int32_t> ans;
|
||||||
|
if (word2ids_.count(w)) {
|
||||||
|
ans = word2ids_.at(w);
|
||||||
|
return ans;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> words = SplitUtf8(w);
|
||||||
|
for (const auto &word : words) {
|
||||||
|
if (word2ids_.count(word)) {
|
||||||
|
auto ids = ConvertWordToIds(word);
|
||||||
|
ans.insert(ans.end(), ids.begin(), ids.end());
|
||||||
|
} else {
|
||||||
|
SHERPA_ONNX_LOGE("Skip OOV: '%s'", word.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ans;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::vector<int32_t>> ConvertChineseToTokenIDs(
|
||||||
|
const std::string &text) const {
|
||||||
|
bool is_hmm = true;
|
||||||
|
|
||||||
|
std::vector<std::string> words;
|
||||||
|
jieba_->Cut(text, words, is_hmm);
|
||||||
|
if (debug_) {
|
||||||
|
std::ostringstream os;
|
||||||
|
os << "After jieba processing:\n";
|
||||||
|
|
||||||
|
std::string sep;
|
||||||
|
for (const auto &w : words) {
|
||||||
|
os << sep << w;
|
||||||
|
sep = "_";
|
||||||
|
}
|
||||||
|
SHERPA_ONNX_LOGE("%s", os.str().c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::vector<int32_t>> ans;
|
||||||
|
std::vector<int32_t> this_sentence;
|
||||||
|
int32_t max_len = meta_data_.max_token_len;
|
||||||
|
|
||||||
|
this_sentence.push_back(0);
|
||||||
|
for (const auto &w : words) {
|
||||||
|
auto ids = ConvertWordToIds(w);
|
||||||
|
if (this_sentence.size() + ids.size() > max_len - 2) {
|
||||||
|
this_sentence.push_back(0);
|
||||||
|
ans.push_back(std::move(this_sentence));
|
||||||
|
|
||||||
|
this_sentence.push_back(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this_sentence.size() > 1) {
|
||||||
|
this_sentence.push_back(0);
|
||||||
|
ans.push_back(std::move(this_sentence));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (debug_) {
|
||||||
|
for (const auto &v : ans) {
|
||||||
|
std::ostringstream os;
|
||||||
|
os << "\n";
|
||||||
|
std::string sep;
|
||||||
|
for (auto i : v) {
|
||||||
|
os << sep << i;
|
||||||
|
sep = " ";
|
||||||
|
}
|
||||||
|
os << "\n";
|
||||||
|
SHERPA_ONNX_LOGE("%s", os.str().c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ans;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs(
|
||||||
|
const std::string &text) const {
|
||||||
|
std::vector<std::string> words = SplitUtf8(text);
|
||||||
|
if (debug_) {
|
||||||
|
std::ostringstream os;
|
||||||
|
os << "After splitting to words: ";
|
||||||
|
std::string sep;
|
||||||
|
for (const auto &w : words) {
|
||||||
|
os << sep << w;
|
||||||
|
sep = "_";
|
||||||
|
}
|
||||||
|
SHERPA_ONNX_LOGE("%s", os.str().c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::vector<int32_t>> ans;
|
||||||
|
int32_t max_len = meta_data_.max_token_len;
|
||||||
|
std::vector<int32_t> this_sentence;
|
||||||
|
|
||||||
|
int32_t space_id = token2id_.at(" ");
|
||||||
|
|
||||||
|
this_sentence.push_back(0);
|
||||||
|
|
||||||
|
for (const auto &word : words) {
|
||||||
|
if (IsPunctuation(word)) {
|
||||||
|
this_sentence.push_back(token2id_.at(word));
|
||||||
|
|
||||||
|
if (this_sentence.size() > max_len - 2) {
|
||||||
|
// this sentence is too long, split it
|
||||||
|
this_sentence.push_back(0);
|
||||||
|
ans.push_back(std::move(this_sentence));
|
||||||
|
|
||||||
|
this_sentence.push_back(0);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (word == "." || word == "!" || word == "?" || word == ";") {
|
||||||
|
// Note: You can add more punctuations here to split the text
|
||||||
|
// into sentences. We just use four here: .!?;
|
||||||
|
this_sentence.push_back(0);
|
||||||
|
ans.push_back(std::move(this_sentence));
|
||||||
|
|
||||||
|
this_sentence.push_back(0);
|
||||||
|
}
|
||||||
|
} else if (word2ids_.count(word)) {
|
||||||
|
const auto &ids = word2ids_.at(word);
|
||||||
|
if (this_sentence.size() + ids.size() + 3 > max_len - 2) {
|
||||||
|
this_sentence.push_back(0);
|
||||||
|
ans.push_back(std::move(this_sentence));
|
||||||
|
|
||||||
|
this_sentence.push_back(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
|
||||||
|
this_sentence.push_back(space_id);
|
||||||
|
} else {
|
||||||
|
SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'", word.c_str());
|
||||||
|
|
||||||
|
piper::eSpeakPhonemeConfig config;
|
||||||
|
|
||||||
|
config.voice = "en-us";
|
||||||
|
|
||||||
|
std::vector<std::vector<piper::Phoneme>> phonemes;
|
||||||
|
|
||||||
|
CallPhonemizeEspeak(word, config, &phonemes);
|
||||||
|
// Note phonemes[i] contains a vector of unicode codepoints;
|
||||||
|
// we need to convert them to utf8
|
||||||
|
|
||||||
|
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
|
||||||
|
|
||||||
|
std::vector<int32_t> ids;
|
||||||
|
for (const auto &v : phonemes) {
|
||||||
|
for (const auto p : v) {
|
||||||
|
auto token = conv.to_bytes(p);
|
||||||
|
if (token2id_.count(token)) {
|
||||||
|
ids.push_back(token2id_.at(token));
|
||||||
|
} else {
|
||||||
|
SHERPA_ONNX_LOGE("Skip OOV token '%s' from '%s'", token.c_str(),
|
||||||
|
word.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this_sentence.size() + ids.size() + 3 > max_len - 2) {
|
||||||
|
this_sentence.push_back(0);
|
||||||
|
ans.push_back(std::move(this_sentence));
|
||||||
|
|
||||||
|
this_sentence.push_back(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
|
||||||
|
this_sentence.push_back(space_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this_sentence.size() > 1) {
|
||||||
|
this_sentence.push_back(0);
|
||||||
|
ans.push_back(std::move(this_sentence));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (debug_) {
|
||||||
|
for (const auto &v : ans) {
|
||||||
|
std::ostringstream os;
|
||||||
|
os << "\n";
|
||||||
|
std::string sep;
|
||||||
|
for (auto i : v) {
|
||||||
|
os << sep << i;
|
||||||
|
sep = " ";
|
||||||
|
}
|
||||||
|
os << "\n";
|
||||||
|
SHERPA_ONNX_LOGE("%s", os.str().c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ans;
|
||||||
|
}
|
||||||
|
|
||||||
|
void InitTokens(const std::string &tokens) {
|
||||||
|
std::ifstream is(tokens);
|
||||||
|
InitTokens(is);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Manager>
|
||||||
|
void InitTokens(Manager *mgr, const std::string &tokens) {
|
||||||
|
auto buf = ReadFile(mgr, tokens);
|
||||||
|
|
||||||
|
std::istrstream is(buf.data(), buf.size());
|
||||||
|
InitTokens(is);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fills token2id_ from the stream `is`.
// The parsing itself is done by ReadTokens(), defined in ./symbol-table.cc
void InitTokens(std::istream &is) {
  token2id_ = ReadTokens(is);
}
|
||||||
|
|
||||||
|
void InitLexicon(const std::string &lexicon) {
|
||||||
|
std::vector<std::string> files;
|
||||||
|
SplitStringToVector(lexicon, ",", false, &files);
|
||||||
|
for (const auto &f : files) {
|
||||||
|
std::ifstream is(f);
|
||||||
|
InitLexicon(is);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Manager>
|
||||||
|
void InitLexicon(Manager *mgr, const std::string &lexicon) {
|
||||||
|
std::vector<std::string> files;
|
||||||
|
SplitStringToVector(lexicon, ",", false, &files);
|
||||||
|
for (const auto &f : files) {
|
||||||
|
auto buf = ReadFile(mgr, f);
|
||||||
|
|
||||||
|
std::istrstream is(buf.data(), buf.size());
|
||||||
|
InitLexicon(is);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void InitLexicon(std::istream &is) {
|
||||||
|
std::string word;
|
||||||
|
std::vector<std::string> token_list;
|
||||||
|
std::string token;
|
||||||
|
|
||||||
|
std::string line;
|
||||||
|
int32_t line_num = 0;
|
||||||
|
int32_t num_warn = 0;
|
||||||
|
while (std::getline(is, line)) {
|
||||||
|
++line_num;
|
||||||
|
std::istringstream iss(line);
|
||||||
|
|
||||||
|
token_list.clear();
|
||||||
|
iss >> word;
|
||||||
|
ToLowerCase(&word);
|
||||||
|
|
||||||
|
if (word2ids_.count(word)) {
|
||||||
|
num_warn += 1;
|
||||||
|
if (num_warn < 10) {
|
||||||
|
SHERPA_ONNX_LOGE("Duplicated word: %s at line %d:%s. Ignore it.",
|
||||||
|
word.c_str(), line_num, line.c_str());
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (iss >> token) {
|
||||||
|
token_list.push_back(std::move(token));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<int32_t> ids = ConvertTokensToIds(token2id_, token_list);
|
||||||
|
|
||||||
|
if (ids.empty()) {
|
||||||
|
SHERPA_ONNX_LOGE(
|
||||||
|
"Invalid pronunciation for word '%s' at line %d:%s. Ignore it",
|
||||||
|
word.c_str(), line_num, line.c_str());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
word2ids_.insert({std::move(word), std::move(ids)});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void InitJieba(const std::string &dict_dir) {
|
||||||
|
std::string dict = dict_dir + "/jieba.dict.utf8";
|
||||||
|
std::string hmm = dict_dir + "/hmm_model.utf8";
|
||||||
|
std::string user_dict = dict_dir + "/user.dict.utf8";
|
||||||
|
std::string idf = dict_dir + "/idf.utf8";
|
||||||
|
std::string stop_word = dict_dir + "/stop_words.utf8";
|
||||||
|
|
||||||
|
AssertFileExists(dict);
|
||||||
|
AssertFileExists(hmm);
|
||||||
|
AssertFileExists(user_dict);
|
||||||
|
AssertFileExists(idf);
|
||||||
|
AssertFileExists(stop_word);
|
||||||
|
|
||||||
|
jieba_ =
|
||||||
|
std::make_unique<cppjieba::Jieba>(dict, hmm, user_dict, idf, stop_word);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
OfflineTtsKokoroModelMetaData meta_data_;
|
||||||
|
|
||||||
|
// word to token IDs
|
||||||
|
std::unordered_map<std::string, std::vector<int32_t>> word2ids_;
|
||||||
|
|
||||||
|
// tokens.txt is saved in token2id_
|
||||||
|
std::unordered_map<std::string, int32_t> token2id_;
|
||||||
|
|
||||||
|
std::unique_ptr<cppjieba::Jieba> jieba_;
|
||||||
|
bool debug_ = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Defaulted here rather than in the header: the header only forward
// declares Impl, and std::unique_ptr<Impl> needs the complete type to
// destroy it.
KokoroMultiLangLexicon::~KokoroMultiLangLexicon() = default;
|
||||||
|
|
||||||
|
// Builds the frontend from files on the local file system. All arguments
// are forwarded unchanged to the Impl constructor.
KokoroMultiLangLexicon::KokoroMultiLangLexicon(
    const std::string &tokens, const std::string &lexicon,
    const std::string &dict_dir, const std::string &data_dir,
    const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
    : impl_(std::make_unique<Impl>(tokens, lexicon, dict_dir, data_dir,
                                   meta_data, debug)) {
}
|
||||||
|
|
||||||
|
template <typename Manager>
|
||||||
|
KokoroMultiLangLexicon::KokoroMultiLangLexicon(
|
||||||
|
Manager *mgr, const std::string &tokens, const std::string &lexicon,
|
||||||
|
const std::string &dict_dir, const std::string &data_dir,
|
||||||
|
const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
|
||||||
|
: impl_(std::make_unique<Impl>(mgr, tokens, lexicon, dict_dir, data_dir,
|
||||||
|
meta_data, debug)) {}
|
||||||
|
|
||||||
|
std::vector<TokenIDs> KokoroMultiLangLexicon::ConvertTextToTokenIds(
|
||||||
|
const std::string &text, const std::string & /*unused_voice = ""*/) const {
|
||||||
|
return impl_->ConvertTextToTokenIds(text);
|
||||||
|
}
|
||||||
|
|
||||||
|
#if __ANDROID_API__ >= 9
// Explicit instantiation of the templated constructor for the Android
// asset manager.
template KokoroMultiLangLexicon::KokoroMultiLangLexicon(
    AAssetManager *mgr, const std::string &tokens, const std::string &lexicon,
    const std::string &dict_dir, const std::string &data_dir,
    const OfflineTtsKokoroModelMetaData &meta_data, bool debug);
#endif
|
||||||
|
|
||||||
|
#if __OHOS__
// Explicit instantiation of the templated constructor for the OHOS
// native resource manager.
template KokoroMultiLangLexicon::KokoroMultiLangLexicon(
    NativeResourceManager *mgr, const std::string &tokens,
    const std::string &lexicon, const std::string &dict_dir,
    const std::string &data_dir, const OfflineTtsKokoroModelMetaData &meta_data,
    bool debug);
#endif
|
||||||
|
|
||||||
|
} // namespace sherpa_onnx
|
||||||
45
sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h
Normal file
45
sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
// sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h
|
||||||
|
//
|
||||||
|
// Copyright (c) 2025 Xiaomi Corporation
|
||||||
|
|
||||||
|
#ifndef SHERPA_ONNX_CSRC_KOKORO_MULTI_LANG_LEXICON_H_
|
||||||
|
#define SHERPA_ONNX_CSRC_KOKORO_MULTI_LANG_LEXICON_H_
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "sherpa-onnx/csrc/offline-tts-frontend.h"
|
||||||
|
#include "sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h"
|
||||||
|
|
||||||
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
|
class KokoroMultiLangLexicon : public OfflineTtsFrontend {
|
||||||
|
public:
|
||||||
|
~KokoroMultiLangLexicon() override;
|
||||||
|
|
||||||
|
KokoroMultiLangLexicon(const std::string &tokens, const std::string &lexicon,
|
||||||
|
const std::string &dict_dir,
|
||||||
|
const std::string &data_dir,
|
||||||
|
const OfflineTtsKokoroModelMetaData &meta_data,
|
||||||
|
bool debug);
|
||||||
|
|
||||||
|
template <typename Manager>
|
||||||
|
KokoroMultiLangLexicon(Manager *mgr, const std::string &tokens,
|
||||||
|
const std::string &lexicon,
|
||||||
|
const std::string &dict_dir,
|
||||||
|
const std::string &data_dir,
|
||||||
|
const OfflineTtsKokoroModelMetaData &meta_data,
|
||||||
|
bool debug);
|
||||||
|
|
||||||
|
std::vector<TokenIDs> ConvertTextToTokenIds(
|
||||||
|
const std::string &text, const std::string &voice = "") const override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
class Impl;
|
||||||
|
std::unique_ptr<Impl> impl_;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace sherpa_onnx
|
||||||
|
|
||||||
|
#endif // SHERPA_ONNX_CSRC_KOKORO_MULTI_LANG_LEXICON_H_
|
||||||
@@ -6,7 +6,9 @@
|
|||||||
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <regex> // NOLINT
|
#include <regex> // NOLINT
|
||||||
|
#include <sstream>
|
||||||
#include <strstream>
|
#include <strstream>
|
||||||
|
#include <unordered_map>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#if __ANDROID_API__ >= 9
|
#if __ANDROID_API__ >= 9
|
||||||
#include "android/asset_manager.h"
|
#include "android/asset_manager.h"
|
||||||
|
|||||||
@@ -7,7 +7,6 @@
|
|||||||
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <unordered_map>
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "sherpa-onnx/csrc/offline-tts-frontend.h"
|
#include "sherpa-onnx/csrc/offline-tts-frontend.h"
|
||||||
|
|||||||
@@ -19,6 +19,9 @@ struct TokenIDs {
|
|||||||
/*implicit*/ TokenIDs(std::vector<int64_t> tokens) // NOLINT
|
/*implicit*/ TokenIDs(std::vector<int64_t> tokens) // NOLINT
|
||||||
: tokens{std::move(tokens)} {}
|
: tokens{std::move(tokens)} {}
|
||||||
|
|
||||||
|
/*implicit*/ TokenIDs(const std::vector<int32_t> &tokens) // NOLINT
|
||||||
|
: tokens{tokens.begin(), tokens.end()} {}
|
||||||
|
|
||||||
TokenIDs(std::vector<int64_t> tokens, // NOLINT
|
TokenIDs(std::vector<int64_t> tokens, // NOLINT
|
||||||
std::vector<int64_t> tones) // NOLINT
|
std::vector<int64_t> tones) // NOLINT
|
||||||
: tokens{std::move(tokens)}, tones{std::move(tones)} {}
|
: tokens{std::move(tokens)}, tones{std::move(tones)} {}
|
||||||
@@ -51,6 +54,9 @@ class OfflineTtsFrontend {
|
|||||||
const std::string &text, const std::string &voice = "") const = 0;
|
const std::string &text, const std::string &voice = "") const = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// implementation is in ./piper-phonemize-lexicon.cc
|
||||||
|
void InitEspeak(const std::string &data_dir);
|
||||||
|
|
||||||
} // namespace sherpa_onnx
|
} // namespace sherpa_onnx
|
||||||
|
|
||||||
#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
|
#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
|
||||||
|
|||||||
@@ -13,6 +13,7 @@
|
|||||||
#include "fst/extensions/far/far.h"
|
#include "fst/extensions/far/far.h"
|
||||||
#include "kaldifst/csrc/kaldi-fst-io.h"
|
#include "kaldifst/csrc/kaldi-fst-io.h"
|
||||||
#include "kaldifst/csrc/text-normalizer.h"
|
#include "kaldifst/csrc/text-normalizer.h"
|
||||||
|
#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h"
|
||||||
#include "sherpa-onnx/csrc/lexicon.h"
|
#include "sherpa-onnx/csrc/lexicon.h"
|
||||||
#include "sherpa-onnx/csrc/macros.h"
|
#include "sherpa-onnx/csrc/macros.h"
|
||||||
#include "sherpa-onnx/csrc/offline-tts-frontend.h"
|
#include "sherpa-onnx/csrc/offline-tts-frontend.h"
|
||||||
@@ -314,6 +315,27 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
|
|||||||
template <typename Manager>
|
template <typename Manager>
|
||||||
void InitFrontend(Manager *mgr) {
|
void InitFrontend(Manager *mgr) {
|
||||||
const auto &meta_data = model_->GetMetaData();
|
const auto &meta_data = model_->GetMetaData();
|
||||||
|
|
||||||
|
if (meta_data.version >= 2) {
|
||||||
|
// this is a multi-lingual model, we require that you pass lexicon
|
||||||
|
// and dict_dir
|
||||||
|
if (config_.model.kokoro.lexicon.empty() ||
|
||||||
|
config_.model.kokoro.dict_dir.empty()) {
|
||||||
|
SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
|
||||||
|
SHERPA_ONNX_LOGE(
|
||||||
|
"You are using a multi-lingual Kokoro model (e.g., Kokoro >= "
|
||||||
|
"v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir");
|
||||||
|
SHERPA_ONNX_EXIT(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
frontend_ = std::make_unique<KokoroMultiLangLexicon>(
|
||||||
|
mgr, config_.model.kokoro.tokens, config_.model.kokoro.lexicon,
|
||||||
|
config_.model.kokoro.dict_dir, config_.model.kokoro.data_dir,
|
||||||
|
meta_data, config_.model.debug);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
frontend_ = std::make_unique<PiperPhonemizeLexicon>(
|
frontend_ = std::make_unique<PiperPhonemizeLexicon>(
|
||||||
mgr, config_.model.kokoro.tokens, config_.model.kokoro.data_dir,
|
mgr, config_.model.kokoro.tokens, config_.model.kokoro.data_dir,
|
||||||
meta_data);
|
meta_data);
|
||||||
@@ -321,7 +343,27 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
|
|||||||
|
|
||||||
void InitFrontend() {
|
void InitFrontend() {
|
||||||
const auto &meta_data = model_->GetMetaData();
|
const auto &meta_data = model_->GetMetaData();
|
||||||
|
if (meta_data.version >= 2) {
|
||||||
|
// this is a multi-lingual model, we require that you pass lexicon
|
||||||
|
// and dict_dir
|
||||||
|
if (config_.model.kokoro.lexicon.empty() ||
|
||||||
|
config_.model.kokoro.dict_dir.empty()) {
|
||||||
|
SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
|
||||||
|
SHERPA_ONNX_LOGE(
|
||||||
|
"You are using a multi-lingual Kokoro model (e.g., Kokoro >= "
|
||||||
|
"v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir");
|
||||||
|
SHERPA_ONNX_EXIT(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
frontend_ = std::make_unique<KokoroMultiLangLexicon>(
|
||||||
|
config_.model.kokoro.tokens, config_.model.kokoro.lexicon,
|
||||||
|
config_.model.kokoro.dict_dir, config_.model.kokoro.data_dir,
|
||||||
|
meta_data, config_.model.debug);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// this is for kokoro v0.19, which supports only English
|
||||||
frontend_ = std::make_unique<PiperPhonemizeLexicon>(
|
frontend_ = std::make_unique<PiperPhonemizeLexicon>(
|
||||||
config_.model.kokoro.tokens, config_.model.kokoro.data_dir, meta_data);
|
config_.model.kokoro.tokens, config_.model.kokoro.data_dir, meta_data);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,6 +8,7 @@
|
|||||||
|
|
||||||
#include "sherpa-onnx/csrc/file-utils.h"
|
#include "sherpa-onnx/csrc/file-utils.h"
|
||||||
#include "sherpa-onnx/csrc/macros.h"
|
#include "sherpa-onnx/csrc/macros.h"
|
||||||
|
#include "sherpa-onnx/csrc/text-utils.h"
|
||||||
|
|
||||||
namespace sherpa_onnx {
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
@@ -17,8 +18,16 @@ void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) {
|
|||||||
"Path to voices.bin for Kokoro models");
|
"Path to voices.bin for Kokoro models");
|
||||||
po->Register("kokoro-tokens", &tokens,
|
po->Register("kokoro-tokens", &tokens,
|
||||||
"Path to tokens.txt for Kokoro models");
|
"Path to tokens.txt for Kokoro models");
|
||||||
|
// Adjacent string literals are concatenated, so each sentence of the help
// text has to carry its own trailing ". " separator.
po->Register(
    "kokoro-lexicon", &lexicon,
    "Path to lexicon.txt for Kokoro models. Used only for Kokoro >= v1.0. "
    "You can pass multiple files, separated by ','. Example: "
    "./lexicon-us-en.txt,./lexicon-zh.txt");
|
||||||
po->Register("kokoro-data-dir", &data_dir,
|
po->Register("kokoro-data-dir", &data_dir,
|
||||||
"Path to the directory containing dict for espeak-ng.");
|
"Path to the directory containing dict for espeak-ng.");
|
||||||
|
po->Register("kokoro-dict-dir", &dict_dir,
|
||||||
|
"Path to the directory containing dict for jieba. "
|
||||||
|
"Used only for Kokoro >= v1.0");
|
||||||
po->Register("kokoro-length-scale", &length_scale,
|
po->Register("kokoro-length-scale", &length_scale,
|
||||||
"Speech speed. Larger->Slower; Smaller->faster.");
|
"Speech speed. Larger->Slower; Smaller->faster.");
|
||||||
}
|
}
|
||||||
@@ -44,6 +53,19 @@ bool OfflineTtsKokoroModelConfig::Validate() const {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!lexicon.empty()) {
|
||||||
|
std::vector<std::string> files;
|
||||||
|
SplitStringToVector(lexicon, ",", false, &files);
|
||||||
|
for (const auto &f : files) {
|
||||||
|
if (!FileExists(f)) {
|
||||||
|
SHERPA_ONNX_LOGE(
|
||||||
|
"lexicon '%s' does not exist. Please re-check --kokoro-lexicon",
|
||||||
|
f.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (data_dir.empty()) {
|
if (data_dir.empty()) {
|
||||||
SHERPA_ONNX_LOGE("Please provide --kokoro-data-dir");
|
SHERPA_ONNX_LOGE("Please provide --kokoro-data-dir");
|
||||||
return false;
|
return false;
|
||||||
@@ -77,6 +99,21 @@ bool OfflineTtsKokoroModelConfig::Validate() const {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!dict_dir.empty()) {
|
||||||
|
std::vector<std::string> required_files = {
|
||||||
|
"jieba.dict.utf8", "hmm_model.utf8", "user.dict.utf8",
|
||||||
|
"idf.utf8", "stop_words.utf8",
|
||||||
|
};
|
||||||
|
|
||||||
|
for (const auto &f : required_files) {
|
||||||
|
if (!FileExists(dict_dir + "/" + f)) {
|
||||||
|
SHERPA_ONNX_LOGE("'%s/%s' does not exist. Please check kokoro-dict-dir",
|
||||||
|
dict_dir.c_str(), f.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -87,7 +124,9 @@ std::string OfflineTtsKokoroModelConfig::ToString() const {
|
|||||||
os << "model=\"" << model << "\", ";
|
os << "model=\"" << model << "\", ";
|
||||||
os << "voices=\"" << voices << "\", ";
|
os << "voices=\"" << voices << "\", ";
|
||||||
os << "tokens=\"" << tokens << "\", ";
|
os << "tokens=\"" << tokens << "\", ";
|
||||||
|
os << "lexicon=\"" << lexicon << "\", ";
|
||||||
os << "data_dir=\"" << data_dir << "\", ";
|
os << "data_dir=\"" << data_dir << "\", ";
|
||||||
|
os << "dict_dir=\"" << dict_dir << "\", ";
|
||||||
os << "length_scale=" << length_scale << ")";
|
os << "length_scale=" << length_scale << ")";
|
||||||
|
|
||||||
return os.str();
|
return os.str();
|
||||||
|
|||||||
@@ -16,8 +16,14 @@ struct OfflineTtsKokoroModelConfig {
|
|||||||
std::string voices;
|
std::string voices;
|
||||||
std::string tokens;
|
std::string tokens;
|
||||||
|
|
||||||
|
// Note: You can pass multiple files, separated by ",", to lexicon
|
||||||
|
// Example: lexicon = "./lexicon-gb-en.txt,./lexicon-zh.txt";
|
||||||
|
std::string lexicon;
|
||||||
|
|
||||||
std::string data_dir;
|
std::string data_dir;
|
||||||
|
|
||||||
|
std::string dict_dir;
|
||||||
|
|
||||||
// speed = 1 / length_scale
|
// speed = 1 / length_scale
|
||||||
float length_scale = 1.0;
|
float length_scale = 1.0;
|
||||||
|
|
||||||
@@ -26,11 +32,15 @@ struct OfflineTtsKokoroModelConfig {
|
|||||||
OfflineTtsKokoroModelConfig(const std::string &model,
|
OfflineTtsKokoroModelConfig(const std::string &model,
|
||||||
const std::string &voices,
|
const std::string &voices,
|
||||||
const std::string &tokens,
|
const std::string &tokens,
|
||||||
const std::string &data_dir, float length_scale)
|
const std::string &lexicon,
|
||||||
|
const std::string &data_dir,
|
||||||
|
const std::string &dict_dir, float length_scale)
|
||||||
: model(model),
|
: model(model),
|
||||||
voices(voices),
|
voices(voices),
|
||||||
tokens(tokens),
|
tokens(tokens),
|
||||||
|
lexicon(lexicon),
|
||||||
data_dir(data_dir),
|
data_dir(data_dir),
|
||||||
|
dict_dir(dict_dir),
|
||||||
length_scale(length_scale) {}
|
length_scale(length_scale) {}
|
||||||
|
|
||||||
void Register(ParseOptions *po);
|
void Register(ParseOptions *po);
|
||||||
|
|||||||
@@ -32,10 +32,9 @@
|
|||||||
|
|
||||||
namespace sherpa_onnx {
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
static void CallPhonemizeEspeak(
|
void CallPhonemizeEspeak(const std::string &text,
|
||||||
const std::string &text,
|
piper::eSpeakPhonemeConfig &config, // NOLINT
|
||||||
piper::eSpeakPhonemeConfig &config, // NOLINT
|
std::vector<std::vector<piper::Phoneme>> *phonemes) {
|
||||||
std::vector<std::vector<piper::Phoneme>> *phonemes) {
|
|
||||||
static std::mutex espeak_mutex;
|
static std::mutex espeak_mutex;
|
||||||
|
|
||||||
std::lock_guard<std::mutex> lock(espeak_mutex);
|
std::lock_guard<std::mutex> lock(espeak_mutex);
|
||||||
@@ -245,7 +244,7 @@ static std::vector<int64_t> CoquiPhonemesToIds(
|
|||||||
return ans;
|
return ans;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void InitEspeak(const std::string &data_dir) {
|
void InitEspeak(const std::string &data_dir) {
|
||||||
static std::once_flag init_flag;
|
static std::once_flag init_flag;
|
||||||
std::call_once(init_flag, [data_dir]() {
|
std::call_once(init_flag, [data_dir]() {
|
||||||
int32_t result =
|
int32_t result =
|
||||||
|
|||||||
@@ -241,7 +241,6 @@ Java_com_k2fsa_sherpa_onnx_OfflineTts_generateImpl(JNIEnv *env, jobject /*obj*/,
|
|||||||
jlong ptr, jstring text,
|
jlong ptr, jstring text,
|
||||||
jint sid, jfloat speed) {
|
jint sid, jfloat speed) {
|
||||||
const char *p_text = env->GetStringUTFChars(text, nullptr);
|
const char *p_text = env->GetStringUTFChars(text, nullptr);
|
||||||
SHERPA_ONNX_LOGE("string is: %s", p_text);
|
|
||||||
|
|
||||||
auto audio = reinterpret_cast<sherpa_onnx::OfflineTts *>(ptr)->Generate(
|
auto audio = reinterpret_cast<sherpa_onnx::OfflineTts *>(ptr)->Generate(
|
||||||
p_text, sid, speed);
|
p_text, sid, speed);
|
||||||
@@ -267,7 +266,6 @@ Java_com_k2fsa_sherpa_onnx_OfflineTts_generateWithCallbackImpl(
|
|||||||
JNIEnv *env, jobject /*obj*/, jlong ptr, jstring text, jint sid,
|
JNIEnv *env, jobject /*obj*/, jlong ptr, jstring text, jint sid,
|
||||||
jfloat speed, jobject callback) {
|
jfloat speed, jobject callback) {
|
||||||
const char *p_text = env->GetStringUTFChars(text, nullptr);
|
const char *p_text = env->GetStringUTFChars(text, nullptr);
|
||||||
SHERPA_ONNX_LOGE("string is: %s", p_text);
|
|
||||||
|
|
||||||
std::function<int32_t(const float *, int32_t, float)> callback_wrapper =
|
std::function<int32_t(const float *, int32_t, float)> callback_wrapper =
|
||||||
[env, callback](const float *samples, int32_t n,
|
[env, callback](const float *samples, int32_t n,
|
||||||
|
|||||||
@@ -16,13 +16,17 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) {
|
|||||||
py::class_<PyClass>(*m, "OfflineTtsKokoroModelConfig")
|
py::class_<PyClass>(*m, "OfflineTtsKokoroModelConfig")
|
||||||
.def(py::init<>())
|
.def(py::init<>())
|
||||||
.def(py::init<const std::string &, const std::string &,
|
.def(py::init<const std::string &, const std::string &,
|
||||||
|
const std::string &, const std::string &,
|
||||||
const std::string &, const std::string &, float>(),
|
const std::string &, const std::string &, float>(),
|
||||||
py::arg("model"), py::arg("voices"), py::arg("tokens"),
|
py::arg("model"), py::arg("voices"), py::arg("tokens"),
|
||||||
py::arg("data_dir"), py::arg("length_scale") = 1.0)
|
py::arg("lexicon") = "", py::arg("data_dir"),
|
||||||
|
py::arg("dict_dir") = "", py::arg("length_scale") = 1.0)
|
||||||
.def_readwrite("model", &PyClass::model)
|
.def_readwrite("model", &PyClass::model)
|
||||||
.def_readwrite("voices", &PyClass::voices)
|
.def_readwrite("voices", &PyClass::voices)
|
||||||
.def_readwrite("tokens", &PyClass::tokens)
|
.def_readwrite("tokens", &PyClass::tokens)
|
||||||
|
.def_readwrite("lexicon", &PyClass::lexicon)
|
||||||
.def_readwrite("data_dir", &PyClass::data_dir)
|
.def_readwrite("data_dir", &PyClass::data_dir)
|
||||||
|
.def_readwrite("dict_dir", &PyClass::dict_dir)
|
||||||
.def_readwrite("length_scale", &PyClass::length_scale)
|
.def_readwrite("length_scale", &PyClass::length_scale)
|
||||||
.def("__str__", &PyClass::ToString)
|
.def("__str__", &PyClass::ToString)
|
||||||
.def("validate", &PyClass::Validate);
|
.def("validate", &PyClass::Validate);
|
||||||
|
|||||||
Reference in New Issue
Block a user