Support heteronyms in Chinese TTS (#738)
This commit is contained in:
8
.github/scripts/test-nodejs-npm.sh
vendored
8
.github/scripts/test-nodejs-npm.sh
vendored
@@ -70,9 +70,9 @@ rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18
|
||||
curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
|
||||
tar xf vits-piper-en_US-amy-low.tar.bz2
|
||||
node ./test-offline-tts-en.js
|
||||
rm vits-piper-en_US-amy-low.tar.bz2
|
||||
# The glob matches both the tarball and the extracted directory; -r is needed for the directory.
rm -rf vits-piper-en_US-amy-low*
|
||||
|
||||
curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
|
||||
tar xvf vits-zh-aishell3.tar.bz2
|
||||
curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
||||
tar xvf vits-icefall-zh-aishell3.tar.bz2
|
||||
node ./test-offline-tts-zh.js
|
||||
rm vits-zh-aishell3.tar.bz2
|
||||
# The glob matches both the tarball and the extracted directory; -r is needed for the directory.
rm -rf vits-icefall-zh-aishell3*
|
||||
|
||||
1
.github/workflows/arm-linux-gnueabihf.yaml
vendored
1
.github/workflows/arm-linux-gnueabihf.yaml
vendored
@@ -173,6 +173,7 @@ jobs:
|
||||
rm -v $dst/lib/libasound.so
|
||||
rm -v $dst/lib/libonnxruntime.so
|
||||
rm -v $dst/lib/libsherpa-onnx-fst.so
|
||||
rm -v $dst/lib/libsherpa-onnx-fstfar.so
|
||||
fi
|
||||
|
||||
tree $dst
|
||||
|
||||
1
.github/workflows/riscv64-linux.yaml
vendored
1
.github/workflows/riscv64-linux.yaml
vendored
@@ -211,6 +211,7 @@ jobs:
|
||||
rm -fv $dst/lib/libasound.so
|
||||
rm -fv $dst/lib/libonnxruntime.so
|
||||
rm -fv $dst/lib/libsherpa-onnx-fst.so
|
||||
rm -fv $dst/lib/libsherpa-onnx-fstfar.so
|
||||
fi
|
||||
|
||||
tree $dst
|
||||
|
||||
6
.github/workflows/test-go.yaml
vendored
6
.github/workflows/test-go.yaml
vendored
@@ -111,9 +111,11 @@ jobs:
|
||||
rm -rf vits-vctk
|
||||
|
||||
echo "Test vits-zh-aishell3"
|
||||
git clone https://huggingface.co/csukuangfj/vits-zh-aishell3
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
||||
tar xvf vits-icefall-zh-aishell3.tar.bz2
|
||||
rm vits-icefall-zh-aishell3.tar.bz2
|
||||
./run-vits-zh-aishell3.sh
|
||||
rm -rf vits-zh-aishell3
|
||||
rm -rf vits-icefall-zh-aishell3
|
||||
|
||||
echo "Test vits-piper-en_US-lessac-medium"
|
||||
git clone https://huggingface.co/csukuangfj/vits-piper-en_US-lessac-medium
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -90,3 +90,4 @@ sherpa-onnx-paraformer-trilingual-zh-cantonese-en
|
||||
sr-data
|
||||
*xcworkspace/xcuserdata/*
|
||||
|
||||
vits-icefall-*
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
|
||||
project(sherpa-onnx)
|
||||
|
||||
set(SHERPA_ONNX_VERSION "1.9.16")
|
||||
set(SHERPA_ONNX_VERSION "1.9.17")
|
||||
|
||||
# Disable warning about
|
||||
#
|
||||
|
||||
@@ -155,6 +155,7 @@ class MainActivity : AppCompatActivity() {
|
||||
var modelDir: String?
|
||||
var modelName: String?
|
||||
var ruleFsts: String?
|
||||
var ruleFars: String?
|
||||
var lexicon: String?
|
||||
var dataDir: String?
|
||||
var assets: AssetManager? = application.assets
|
||||
@@ -165,6 +166,7 @@ class MainActivity : AppCompatActivity() {
|
||||
modelDir = null
|
||||
modelName = null
|
||||
ruleFsts = null
|
||||
ruleFars = null
|
||||
lexicon = null
|
||||
dataDir = null
|
||||
|
||||
@@ -181,9 +183,11 @@ class MainActivity : AppCompatActivity() {
|
||||
// dataDir = "vits-piper-en_US-amy-low/espeak-ng-data"
|
||||
|
||||
// Example 3:
|
||||
// modelDir = "vits-zh-aishell3"
|
||||
// modelName = "vits-aishell3.onnx"
|
||||
// ruleFsts = "vits-zh-aishell3/rule.fst"
|
||||
// https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
||||
// modelDir = "vits-icefall-zh-aishell3"
|
||||
// modelName = "model.onnx"
|
||||
// ruleFsts = "vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst,vits-icefall-zh-aishell3/new_heteronym.fst"
|
||||
// ruleFars = "vits-icefall-zh-aishell3/rule.far"
|
||||
// lexicon = "lexicon.txt"
|
||||
|
||||
// Example 4:
|
||||
@@ -202,7 +206,8 @@ class MainActivity : AppCompatActivity() {
|
||||
val config = getOfflineTtsConfig(
|
||||
modelDir = modelDir!!, modelName = modelName!!, lexicon = lexicon ?: "",
|
||||
dataDir = dataDir ?: "",
|
||||
ruleFsts = ruleFsts ?: ""
|
||||
ruleFsts = ruleFsts ?: "",
|
||||
ruleFars = ruleFars ?: "",
|
||||
)!!
|
||||
|
||||
tts = OfflineTts(assetManager = assets, config = config)
|
||||
|
||||
@@ -23,6 +23,7 @@ data class OfflineTtsModelConfig(
|
||||
data class OfflineTtsConfig(
|
||||
var model: OfflineTtsModelConfig,
|
||||
var ruleFsts: String = "",
|
||||
var ruleFars: String = "",
|
||||
var maxNumSentences: Int = 1,
|
||||
)
|
||||
|
||||
@@ -151,7 +152,8 @@ fun getOfflineTtsConfig(
|
||||
modelName: String,
|
||||
lexicon: String,
|
||||
dataDir: String,
|
||||
ruleFsts: String
|
||||
ruleFsts: String,
|
||||
ruleFars: String
|
||||
): OfflineTtsConfig? {
|
||||
return OfflineTtsConfig(
|
||||
model = OfflineTtsModelConfig(
|
||||
@@ -166,5 +168,6 @@ fun getOfflineTtsConfig(
|
||||
provider = "cpu",
|
||||
),
|
||||
ruleFsts = ruleFsts,
|
||||
ruleFars = ruleFars,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -39,6 +39,7 @@ object TtsEngine {
|
||||
private var modelDir: String? = null
|
||||
private var modelName: String? = null
|
||||
private var ruleFsts: String? = null
|
||||
private var ruleFars: String? = null
|
||||
private var lexicon: String? = null
|
||||
private var dataDir: String? = null
|
||||
private var assets: AssetManager? = null
|
||||
@@ -50,6 +51,7 @@ object TtsEngine {
|
||||
modelDir = null
|
||||
modelName = null
|
||||
ruleFsts = null
|
||||
ruleFars = null
|
||||
lexicon = null
|
||||
dataDir = null
|
||||
lang = null
|
||||
@@ -73,9 +75,10 @@ object TtsEngine {
|
||||
|
||||
// Example 3:
|
||||
// https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
||||
// modelDir = "vits-zh-aishell3"
|
||||
// modelName = "vits-aishell3.onnx"
|
||||
// ruleFsts = "vits-zh-aishell3/rule.fst"
|
||||
// modelDir = "vits-icefall-zh-aishell3"
|
||||
// modelName = "model.onnx"
|
||||
// ruleFsts = "vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst,vits-icefall-zh-aishell3/new_heteronym.fst"
|
||||
// ruleFars = "vits-icefall-zh-aishell3/rule.far"
|
||||
// lexicon = "lexicon.txt"
|
||||
// lang = "zho"
|
||||
|
||||
@@ -108,7 +111,8 @@ object TtsEngine {
|
||||
val config = getOfflineTtsConfig(
|
||||
modelDir = modelDir!!, modelName = modelName!!, lexicon = lexicon ?: "",
|
||||
dataDir = dataDir ?: "",
|
||||
ruleFsts = ruleFsts ?: ""
|
||||
ruleFsts = ruleFsts ?: "",
|
||||
ruleFars = ruleFars ?: ""
|
||||
)!!
|
||||
|
||||
tts = OfflineTts(assetManager = assets, config = config)
|
||||
|
||||
@@ -124,6 +124,7 @@ echo "Generate xcframework"
|
||||
|
||||
mkdir -p "build/simulator/lib"
|
||||
for f in libkaldi-native-fbank-core.a libsherpa-onnx-c-api.a libsherpa-onnx-core.a \
|
||||
libsherpa-onnx-fstfar.a \
|
||||
libsherpa-onnx-fst.a libsherpa-onnx-kaldifst-core.a libkaldi-decoder-core.a \
|
||||
libucd.a libpiper_phonemize.a libespeak-ng.a; do
|
||||
lipo -create build/simulator_arm64/lib/${f} \
|
||||
@@ -137,6 +138,7 @@ libtool -static -o build/simulator/sherpa-onnx.a \
|
||||
build/simulator/lib/libkaldi-native-fbank-core.a \
|
||||
build/simulator/lib/libsherpa-onnx-c-api.a \
|
||||
build/simulator/lib/libsherpa-onnx-core.a \
|
||||
build/simulator/lib/libsherpa-onnx-fstfar.a \
|
||||
build/simulator/lib/libsherpa-onnx-fst.a \
|
||||
build/simulator/lib/libsherpa-onnx-kaldifst-core.a \
|
||||
build/simulator/lib/libkaldi-decoder-core.a \
|
||||
@@ -148,6 +150,7 @@ libtool -static -o build/os64/sherpa-onnx.a \
|
||||
build/os64/lib/libkaldi-native-fbank-core.a \
|
||||
build/os64/lib/libsherpa-onnx-c-api.a \
|
||||
build/os64/lib/libsherpa-onnx-core.a \
|
||||
build/os64/lib/libsherpa-onnx-fstfar.a \
|
||||
build/os64/lib/libsherpa-onnx-fst.a \
|
||||
build/os64/lib/libsherpa-onnx-kaldifst-core.a \
|
||||
build/os64/lib/libkaldi-decoder-core.a \
|
||||
|
||||
@@ -27,6 +27,7 @@ libtool -static -o ./install/lib/libsherpa-onnx.a \
|
||||
./install/lib/libsherpa-onnx-c-api.a \
|
||||
./install/lib/libsherpa-onnx-core.a \
|
||||
./install/lib/libkaldi-native-fbank-core.a \
|
||||
./install/lib/libsherpa-onnx-fstfar.a \
|
||||
./install/lib/libsherpa-onnx-fst.a \
|
||||
./install/lib/libsherpa-onnx-kaldifst-core.a \
|
||||
./install/lib/libkaldi-decoder-core.a \
|
||||
|
||||
@@ -4,7 +4,7 @@ CUR_DIR :=$(shell pwd)
|
||||
CFLAGS := -I ../ -I ../build/_deps/cargs-src/include/
|
||||
LDFLAGS := -L ../build/lib
|
||||
LDFLAGS += -L ../build/_deps/onnxruntime-src/lib
|
||||
LDFLAGS += -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lcargs -lonnxruntime
|
||||
LDFLAGS += -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fstfar -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lcargs -lonnxruntime
|
||||
LDFLAGS += -framework Foundation
|
||||
LDFLAGS += -lc++
|
||||
LDFLAGS += -Wl,-rpath,${CUR_DIR}/../build/lib
|
||||
|
||||
@@ -78,6 +78,7 @@ def get_binaries():
|
||||
"piper_phonemize.dll",
|
||||
"sherpa-onnx-c-api.dll",
|
||||
"sherpa-onnx-core.dll",
|
||||
"sherpa-onnx-fstfar.lib",
|
||||
"sherpa-onnx-fst.lib",
|
||||
"sherpa-onnx-kaldifst-core.lib",
|
||||
"sherpa-onnx-portaudio.dll",
|
||||
|
||||
@@ -64,12 +64,22 @@ function(download_kaldi_decoder)
|
||||
kaldifst_core
|
||||
fst
|
||||
DESTINATION ..)
|
||||
if(SHERPA_ONNX_ENABLE_TTS)
|
||||
install(TARGETS
|
||||
fstfar
|
||||
DESTINATION ..)
|
||||
endif()
|
||||
else()
|
||||
install(TARGETS
|
||||
kaldi-decoder-core
|
||||
kaldifst_core
|
||||
fst
|
||||
DESTINATION lib)
|
||||
if(SHERPA_ONNX_ENABLE_TTS)
|
||||
install(TARGETS
|
||||
fstfar
|
||||
DESTINATION lib)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(WIN32 AND BUILD_SHARED_LIBS)
|
||||
@@ -78,6 +88,11 @@ function(download_kaldi_decoder)
|
||||
kaldifst_core
|
||||
fst
|
||||
DESTINATION bin)
|
||||
if(SHERPA_ONNX_ENABLE_TTS)
|
||||
install(TARGETS
|
||||
fstfar
|
||||
DESTINATION bin)
|
||||
endif()
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
|
||||
@@ -50,13 +50,7 @@ function(download_kaldifst)
|
||||
${kaldifst_SOURCE_DIR}/
|
||||
)
|
||||
|
||||
target_include_directories(fst
|
||||
PUBLIC
|
||||
${openfst_SOURCE_DIR}/src/include
|
||||
)
|
||||
|
||||
set_target_properties(kaldifst_core PROPERTIES OUTPUT_NAME "sherpa-onnx-kaldifst-core")
|
||||
set_target_properties(fst PROPERTIES OUTPUT_NAME "sherpa-onnx-fst")
|
||||
endfunction()
|
||||
|
||||
download_kaldifst()
|
||||
|
||||
@@ -4,7 +4,7 @@ function(download_openfst)
|
||||
include(FetchContent)
|
||||
|
||||
set(openfst_URL "https://github.com/kkm000/openfst/archive/refs/tags/win/1.6.5.1.tar.gz")
|
||||
set(openfst_URL2 "https://huggingface.co/csukuangfj/kaldi-hmm-gmm-cmake-deps/resolve/main/openfst-win-1.6.5.1.tar.gz")
|
||||
set(openfst_URL2 "https://hub.nuaa.cf/kkm000/openfst/archive/refs/tags/win/1.6.5.1.tar.gz")
|
||||
set(openfst_HASH "SHA256=02c49b559c3976a536876063369efc0e41ab374be1035918036474343877046e")
|
||||
|
||||
# If you don't have access to the Internet,
|
||||
@@ -31,7 +31,7 @@ function(download_openfst)
|
||||
set(HAVE_COMPACT OFF CACHE BOOL "" FORCE)
|
||||
set(HAVE_COMPRESS OFF CACHE BOOL "" FORCE)
|
||||
set(HAVE_CONST OFF CACHE BOOL "" FORCE)
|
||||
set(HAVE_FAR OFF CACHE BOOL "" FORCE)
|
||||
set(HAVE_FAR ON CACHE BOOL "" FORCE)
|
||||
set(HAVE_GRM OFF CACHE BOOL "" FORCE)
|
||||
set(HAVE_PDT OFF CACHE BOOL "" FORCE)
|
||||
set(HAVE_MPDT OFF CACHE BOOL "" FORCE)
|
||||
@@ -70,20 +70,21 @@ function(download_openfst)
|
||||
add_subdirectory(${openfst_SOURCE_DIR} ${openfst_BINARY_DIR} EXCLUDE_FROM_ALL)
|
||||
set(openfst_SOURCE_DIR ${openfst_SOURCE_DIR} PARENT_SCOPE)
|
||||
|
||||
# Rename libfst.so.6 to libkaldifst_fst.so.6 to avoid potential conflicts
|
||||
# when kaldifst is installed.
|
||||
set_target_properties(fst PROPERTIES OUTPUT_NAME "kaldifst_fst")
|
||||
# Rename libfst.so.6 to libsherpa-onnx-fst.so.6 to avoid potential conflicts
|
||||
# when sherpa-onnx is installed.
|
||||
set_target_properties(fst PROPERTIES OUTPUT_NAME "sherpa-onnx-fst")
|
||||
set_target_properties(fstfar PROPERTIES OUTPUT_NAME "sherpa-onnx-fstfar")
|
||||
|
||||
install(TARGETS fst
|
||||
DESTINATION lib
|
||||
target_include_directories(fst
|
||||
PUBLIC
|
||||
${openfst_SOURCE_DIR}/src/include
|
||||
)
|
||||
|
||||
if(KALDIFST_BUILD_PYTHON)
|
||||
set_target_properties(fstscript PROPERTIES OUTPUT_NAME "kaldifst_fstscript")
|
||||
install(TARGETS fstscript
|
||||
DESTINATION lib
|
||||
)
|
||||
endif()
|
||||
target_include_directories(fstfar
|
||||
PUBLIC
|
||||
${openfst_SOURCE_DIR}/src/include
|
||||
)
|
||||
# installed in ./kaldi-decoder.cmake
|
||||
endfunction()
|
||||
|
||||
download_openfst()
|
||||
|
||||
@@ -13,4 +13,4 @@ Cflags: -I"${includedir}"
|
||||
# Note: -lcargs is required only for the following file
|
||||
# https://github.com/k2-fsa/sherpa-onnx/blob/master/c-api-examples/decode-file-c-api.c
|
||||
# We add it here so that users don't need to specify -lcargs when compiling decode-file-c-api.c
|
||||
Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lcargs -lonnxruntime -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@
|
||||
Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fstfar -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lcargs -lonnxruntime -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@
|
||||
|
||||
@@ -20,6 +20,9 @@ class OfflineTtsDemo
|
||||
[Option("tts-rule-fsts", Required = false, Default = "", HelpText = "path to rule.fst")]
|
||||
public string RuleFsts { get; set; }
|
||||
|
||||
[Option("tts-rule-fars", Required = false, Default = "", HelpText = "path to rule.far")]
|
||||
public string RuleFars { get; set; }
|
||||
|
||||
[Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
|
||||
public string DataDir { get; set; }
|
||||
|
||||
@@ -72,14 +75,15 @@ class OfflineTtsDemo
|
||||
string usage = @"
|
||||
# vits-aishell3
|
||||
|
||||
wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
|
||||
tar xf vits-zh-aishell3.tar.bz2
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
||||
tar xvf vits-icefall-zh-aishell3.tar.bz2
|
||||
|
||||
dotnet run \
|
||||
--vits-model=./vits-zh-aishell3/vits-aishell3.onnx \
|
||||
--vits-tokens=./vits-zh-aishell3/tokens.txt \
|
||||
--vits-lexicon=./vits-zh-aishell3/lexicon.txt \
|
||||
--tts-rule-fsts=./vits-zh-aishell3/rule.fst \
|
||||
--vits-model=./vits-icefall-zh-aishell3/model.onnx \
|
||||
--vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
|
||||
--vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
|
||||
--tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
|
||||
--tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \
|
||||
--sid=66 \
|
||||
--debug=1 \
|
||||
--output-filename=./aishell3-66.wav \
|
||||
@@ -127,6 +131,7 @@ to download more models.
|
||||
config.Model.Debug = options.Debug;
|
||||
config.Model.Provider = "cpu";
|
||||
config.RuleFsts = options.RuleFsts;
|
||||
config.RuleFars = options.RuleFars;
|
||||
config.MaxNumSentences = options.MaxNumSentences;
|
||||
|
||||
OfflineTts tts = new OfflineTts(config);
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
#!/usr/bin/env bash
|
||||
set -ex
|
||||
# Guard on the model file that is actually downloaded and used below.
if [ ! -f ./vits-icefall-zh-aishell3/model.onnx ]; then
|
||||
# wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
|
||||
curl -OL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
|
||||
tar xf vits-zh-aishell3.tar.bz2
|
||||
rm vits-zh-aishell3.tar.bz2
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
||||
tar xvf vits-icefall-zh-aishell3.tar.bz2
|
||||
rm vits-icefall-zh-aishell3.tar.bz2
|
||||
fi
|
||||
|
||||
dotnet run \
|
||||
--vits-model=./vits-zh-aishell3/vits-aishell3.onnx \
|
||||
--vits-tokens=./vits-zh-aishell3/tokens.txt \
|
||||
--vits-lexicon=./vits-zh-aishell3/lexicon.txt \
|
||||
--tts-rule-fsts=./vits-zh-aishell3/rule.fst \
|
||||
--vits-model=./vits-icefall-zh-aishell3/model.onnx \
|
||||
--vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
|
||||
--vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
|
||||
--tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
|
||||
--tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \
|
||||
--sid=66 \
|
||||
--debug=1 \
|
||||
--output-filename=./aishell3-66.wav \
|
||||
--text="这是一个语音合成测试, 写于公元 2024 年 1 月 28 号, 23点27分,星期天。"
|
||||
--text="这是一个语音合成测试, 写于公元 2024 年 1 月 28 号, 23点27分,星期天。长沙长大,去过长白山和长安街。行行出状元。行行,银行行长,行业。"
|
||||
|
||||
@@ -26,6 +26,7 @@ func main() {
|
||||
flag.IntVar(&config.Model.Debug, "debug", 0, "Whether to show debug message")
|
||||
flag.StringVar(&config.Model.Provider, "provider", "cpu", "Provider to use")
|
||||
flag.StringVar(&config.RuleFsts, "tts-rule-fsts", "", "Path to rule.fst")
|
||||
flag.StringVar(&config.RuleFars, "tts-rule-fars", "", "Path to rule.far")
|
||||
flag.IntVar(&config.MaxNumSentences, "tts-max-num-sentences", 1, "Batch size")
|
||||
|
||||
flag.IntVar(&sid, "sid", 0, "Speaker ID. Used only for multi-speaker models")
|
||||
|
||||
@@ -6,21 +6,32 @@
|
||||
|
||||
for sid in 10 33 99; do
|
||||
./non-streaming-tts \
|
||||
--vits-model=./vits-zh-aishell3/vits-aishell3.onnx \
|
||||
--vits-lexicon=./vits-zh-aishell3/lexicon.txt \
|
||||
--vits-tokens=./vits-zh-aishell3/tokens.txt \
|
||||
--vits-model=./vits-icefall-zh-aishell3/model.onnx \
|
||||
--vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
|
||||
--vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
|
||||
--sid=$sid \
|
||||
--debug=1 \
|
||||
--output-filename=./liliana-$sid.wav \
|
||||
"林美丽最美丽、最漂亮、最可爱!"
|
||||
|
||||
./non-streaming-tts \
|
||||
--vits-model=./vits-zh-aishell3/vits-aishell3.onnx \
|
||||
--vits-lexicon=./vits-zh-aishell3/lexicon.txt \
|
||||
--vits-tokens=./vits-zh-aishell3/tokens.txt \
|
||||
--tts-rule-fsts=./vits-zh-aishell3/rule.fst \
|
||||
--vits-model=./vits-icefall-zh-aishell3/model.onnx \
|
||||
--vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
|
||||
--vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
|
||||
--tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
|
||||
--sid=$sid \
|
||||
--debug=1 \
|
||||
--output-filename=./numbers-$sid.wav \
|
||||
"数字12345.6789怎么念"
|
||||
|
||||
./non-streaming-tts \
|
||||
--vits-model=./vits-icefall-zh-aishell3/model.onnx \
|
||||
--vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
|
||||
--vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
|
||||
--tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
|
||||
--tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \
|
||||
--sid=$sid \
|
||||
--debug=1 \
|
||||
--output-filename=./heteronym-$sid.wav \
|
||||
"万古长存长沙长大长白山长孙长安街"
|
||||
done
|
||||
|
||||
@@ -7,10 +7,9 @@
|
||||
|
||||
import Foundation
|
||||
|
||||
|
||||
// used to get the path to espeak-ng-data
|
||||
func resourceURL(to path: String) -> String {
|
||||
return URL(string: path, relativeTo: Bundle.main.resourceURL)!.path
|
||||
return URL(string: path, relativeTo: Bundle.main.resourceURL)!.path
|
||||
}
|
||||
|
||||
func getResource(_ forResource: String, _ ofType: String) -> String {
|
||||
@@ -50,8 +49,7 @@ func getTtsForAishell3() -> SherpaOnnxOfflineTtsWrapper {
|
||||
// See the following link
|
||||
// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-model-aishell3
|
||||
|
||||
// vits-vctk.onnx
|
||||
let model = getResource("vits-aishell3", "onnx")
|
||||
let model = getResource("model", "onnx")
|
||||
|
||||
// lexicon.txt
|
||||
let lexicon = getResource("lexicon", "txt")
|
||||
@@ -59,9 +57,19 @@ func getTtsForAishell3() -> SherpaOnnxOfflineTtsWrapper {
|
||||
// tokens.txt
|
||||
let tokens = getResource("tokens", "txt")
|
||||
|
||||
// rule.fst
|
||||
let ruleFsts = getResource("rule", "fst")
|
||||
|
||||
// rule.far
|
||||
let ruleFars = getResource("rule", "far")
|
||||
|
||||
let vits = sherpaOnnxOfflineTtsVitsModelConfig(model: model, lexicon: lexicon, tokens: tokens)
|
||||
let modelConfig = sherpaOnnxOfflineTtsModelConfig(vits: vits)
|
||||
var config = sherpaOnnxOfflineTtsConfig(model: modelConfig)
|
||||
var config = sherpaOnnxOfflineTtsConfig(
|
||||
model: modelConfig,
|
||||
ruleFsts: ruleFsts,
|
||||
ruleFars: ruleFars
|
||||
)
|
||||
return SherpaOnnxOfflineTtsWrapper(config: &config)
|
||||
}
|
||||
|
||||
@@ -69,7 +77,6 @@ func getTtsForAishell3() -> SherpaOnnxOfflineTtsWrapper {
|
||||
func getTtsFor_en_US_amy_low() -> SherpaOnnxOfflineTtsWrapper {
|
||||
// please see https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
|
||||
|
||||
// vits-vctk.onnx
|
||||
let model = getResource("en_US-amy-low", "onnx")
|
||||
|
||||
// tokens.txt
|
||||
@@ -78,7 +85,8 @@ func getTtsFor_en_US_amy_low() -> SherpaOnnxOfflineTtsWrapper {
|
||||
// in this case, we don't need lexicon.txt
|
||||
let dataDir = resourceURL(to: "espeak-ng-data")
|
||||
|
||||
let vits = sherpaOnnxOfflineTtsVitsModelConfig(model: model, lexicon: "", tokens: tokens, dataDir: dataDir)
|
||||
let vits = sherpaOnnxOfflineTtsVitsModelConfig(
|
||||
model: model, lexicon: "", tokens: tokens, dataDir: dataDir)
|
||||
let modelConfig = sherpaOnnxOfflineTtsModelConfig(vits: vits)
|
||||
var config = sherpaOnnxOfflineTtsConfig(model: modelConfig)
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
sherpa-onnx-core.lib;
|
||||
kaldi-decoder-core.lib;
|
||||
sherpa-onnx-kaldifst-core.lib;
|
||||
sherpa-onnx-fstfar.lib;
|
||||
sherpa-onnx-fst.lib;
|
||||
kaldi-native-fbank-core.lib;
|
||||
onnxruntime.lib;
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
sherpa-onnx-core.lib;
|
||||
kaldi-decoder-core.lib;
|
||||
sherpa-onnx-kaldifst-core.lib;
|
||||
sherpa-onnx-fstfar.lib;
|
||||
sherpa-onnx-fst.lib;
|
||||
kaldi-native-fbank-core.lib;
|
||||
onnxruntime.lib;
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
sherpa-onnx-core.lib;
|
||||
kaldi-decoder-core.lib;
|
||||
sherpa-onnx-kaldifst-core.lib;
|
||||
sherpa-onnx-fstfar.lib;
|
||||
sherpa-onnx-fst.lib;
|
||||
kaldi-native-fbank-core.lib;
|
||||
onnxruntime.lib;
|
||||
|
||||
@@ -43,8 +43,8 @@ for text-to-speech.
|
||||
You can use the following command to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
|
||||
tar xvf vits-zh-aishell3.tar.bz2
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
||||
tar xvf vits-icefall-zh-aishell3.tar.bz2
|
||||
node ./test-offline-tts-zh.js
|
||||
```
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ function createOfflineTts() {
|
||||
let offlineTtsConfig = {
|
||||
offlineTtsModelConfig: offlineTtsModelConfig,
|
||||
ruleFsts: '',
|
||||
ruleFars: '',
|
||||
maxNumSentences: 1,
|
||||
};
|
||||
|
||||
|
||||
@@ -4,9 +4,9 @@ const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
function createOfflineTts() {
|
||||
let offlineTtsVitsModelConfig = {
|
||||
model: './vits-zh-aishell3/vits-aishell3.onnx',
|
||||
lexicon: './vits-zh-aishell3/lexicon.txt',
|
||||
tokens: './vits-zh-aishell3/tokens.txt',
|
||||
model: './vits-icefall-zh-aishell3/vits-aishell3.onnx',
|
||||
lexicon: './vits-icefall-zh-aishell3/lexicon.txt',
|
||||
tokens: './vits-icefall-zh-aishell3/tokens.txt',
|
||||
dataDir: '',
|
||||
noiseScale: 0.667,
|
||||
noiseScaleW: 0.8,
|
||||
@@ -21,7 +21,9 @@ function createOfflineTts() {
|
||||
|
||||
let offlineTtsConfig = {
|
||||
offlineTtsModelConfig: offlineTtsModelConfig,
|
||||
ruleFsts: './vits-zh-aishell3/rule.fst',
|
||||
ruleFsts:
|
||||
'./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst,./vits-icefall-zh-aishell3/new_heteronym.fst',
|
||||
ruleFars: './vits-icefall-zh-aishell3/rule.far',
|
||||
maxNumSentences: 1,
|
||||
};
|
||||
|
||||
|
||||
@@ -56,6 +56,11 @@ sed -i.bak s/"lang = null"/"lang = \"$lang_iso_639_3\""/ ./TtsEngine.kt
|
||||
sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./TtsEngine.kt
|
||||
{% endif %}
|
||||
|
||||
{% if tts_model.rule_fars %}
|
||||
rule_fars={{ tts_model.rule_fars }}
|
||||
# Replace the ruleFars placeholder (not ruleFsts) so that rule_fars actually takes effect.
sed -i.bak s%"ruleFars = null"%"ruleFars = \"$rule_fars\""% ./TtsEngine.kt
|
||||
{% endif %}
|
||||
|
||||
{% if tts_model.data_dir %}
|
||||
data_dir={{ tts_model.data_dir }}
|
||||
sed -i.bak s%"dataDir = null"%"dataDir = \"$data_dir\""% ./TtsEngine.kt
|
||||
|
||||
@@ -54,6 +54,11 @@ sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./MainActivity.kt
|
||||
sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./MainActivity.kt
|
||||
{% endif %}
|
||||
|
||||
{% if tts_model.rule_fars %}
|
||||
rule_fars={{ tts_model.rule_fars }}
|
||||
# Replace the ruleFars placeholder (not ruleFsts) so that rule_fars actually takes effect.
sed -i.bak s%"ruleFars = null"%"ruleFars = \"$rule_fars\""% ./MainActivity.kt
|
||||
{% endif %}
|
||||
|
||||
{% if tts_model.data_dir %}
|
||||
data_dir={{ tts_model.data_dir }}
|
||||
sed -i.bak s%"dataDir = null"%"dataDir = \"$data_dir\""% ./MainActivity.kt
|
||||
|
||||
@@ -33,6 +33,7 @@ class TtsModel:
|
||||
model_name: str = ""
|
||||
lang: str = "" # en, zh, fr, de, etc.
|
||||
rule_fsts: Optional[List[str]] = None
|
||||
rule_fars: Optional[List[str]] = None
|
||||
data_dir: Optional[str] = None
|
||||
is_char: bool = False
|
||||
lang_iso_639_3: str = ""
|
||||
@@ -241,98 +242,94 @@ def get_mimic3_models() -> List[TtsModel]:
|
||||
|
||||
|
||||
def get_vits_models() -> List[TtsModel]:
|
||||
return [
|
||||
chinese_models = [
|
||||
# Chinese
|
||||
TtsModel(
|
||||
model_dir="vits-icefall-zh-aishell3",
|
||||
model_name="model.onnx",
|
||||
lang="zh",
|
||||
rule_fsts="vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/rule.fst",
|
||||
rule_fsts="vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst,vits-icefall-zh-aishell3/new_heteronym.fst",
|
||||
rule_fars="vits-icefall-zh-aishell3/rule.far",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-zh-aishell3",
|
||||
model_name="vits-aishell3.onnx",
|
||||
lang="zh",
|
||||
rule_fsts="vits-zh-aishell3/rule.fst",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-zh-hf-doom",
|
||||
model_name="doom.onnx",
|
||||
lang="zh",
|
||||
rule_fsts="vits-zh-hf-doom/rule.fst",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-zh-hf-echo",
|
||||
model_name="echo.onnx",
|
||||
lang="zh",
|
||||
rule_fsts="vits-zh-hf-echo/rule.fst",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-zh-hf-zenyatta",
|
||||
model_name="zenyatta.onnx",
|
||||
lang="zh",
|
||||
rule_fsts="vits-zh-hf-zenyatta/rule.fst",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-zh-hf-abyssinvoker",
|
||||
model_name="abyssinvoker.onnx",
|
||||
lang="zh",
|
||||
rule_fsts="vits-zh-hf-abyssinvoker/rule.fst",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-zh-hf-keqing",
|
||||
model_name="keqing.onnx",
|
||||
lang="zh",
|
||||
rule_fsts="vits-zh-hf-keqing/rule.fst",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-zh-hf-eula",
|
||||
model_name="eula.onnx",
|
||||
lang="zh",
|
||||
rule_fsts="vits-zh-hf-eula/rule.fst",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-zh-hf-bronya",
|
||||
model_name="bronya.onnx",
|
||||
lang="zh",
|
||||
rule_fsts="vits-zh-hf-bronya/rule.fst",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-zh-hf-theresa",
|
||||
model_name="theresa.onnx",
|
||||
lang="zh",
|
||||
rule_fsts="vits-zh-hf-theresa/rule.fst",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-zh-hf-fanchen-wnj",
|
||||
model_name="vits-zh-hf-fanchen-wnj.onnx",
|
||||
lang="zh",
|
||||
rule_fsts="vits-zh-hf-fanchen-wnj/rule.fst",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-zh-hf-fanchen-C",
|
||||
model_name="vits-zh-hf-fanchen-C.onnx",
|
||||
lang="zh",
|
||||
rule_fsts="vits-zh-hf-fanchen-C/rule.fst",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-zh-hf-fanchen-ZhiHuiLaoZhe",
|
||||
model_name="vits-zh-hf-fanchen-ZhiHuiLaoZhe.onnx",
|
||||
lang="zh",
|
||||
rule_fsts="vits-zh-hf-fanchen-ZhiHuiLaoZhe/rule.fst",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-zh-hf-fanchen-ZhiHuiLaoZhe_new",
|
||||
model_name="vits-zh-hf-fanchen-ZhiHuiLaoZhe_new.onnx",
|
||||
lang="zh",
|
||||
rule_fsts="vits-zh-hf-fanchen-ZhiHuiLaoZhe_new/rule.fst",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-zh-hf-fanchen-unity",
|
||||
model_name="vits-zh-hf-fanchen-unity.onnx",
|
||||
lang="zh",
|
||||
rule_fsts="vits-zh-hf-fanchen-unity/rule.fst",
|
||||
),
|
||||
]
|
||||
|
||||
rule_fsts = ["phone.fst", "date.fst", "number.fst", "new_heteronym.fst"]
|
||||
for m in chinese_models:
|
||||
s = [f"{m.model_dir}/{r}" for r in rule_fsts]
|
||||
m.rule_fsts = ",".join(s)
|
||||
m.rule_fars = f"{m.model_dir}/rule.far"
|
||||
|
||||
all_models = chinese_models + [
|
||||
TtsModel(
|
||||
model_dir="vits-cantonese-hf-xiaomaiiwn",
|
||||
model_name="vits-cantonese-hf-xiaomaiiwn.onnx",
|
||||
@@ -346,6 +343,8 @@ def get_vits_models() -> List[TtsModel]:
|
||||
# fmt: on
|
||||
]
|
||||
|
||||
return all_models
|
||||
|
||||
|
||||
def main():
|
||||
args = get_args()
|
||||
|
||||
@@ -40,6 +40,7 @@ def process_linux(s):
|
||||
"libpiper_phonemize.so.1",
|
||||
"libsherpa-onnx-c-api.so",
|
||||
"libsherpa-onnx-core.so",
|
||||
"libsherpa-onnx-fstfar.so.7",
|
||||
"libsherpa-onnx-fst.so.6",
|
||||
"libsherpa-onnx-kaldifst-core.so",
|
||||
"libucd.so",
|
||||
@@ -68,6 +69,7 @@ def process_macos(s):
|
||||
"libpiper_phonemize.1.dylib",
|
||||
"libsherpa-onnx-c-api.dylib",
|
||||
"libsherpa-onnx-core.dylib",
|
||||
"libsherpa-onnx-fstfar.7.dylib",
|
||||
"libsherpa-onnx-fst.6.dylib",
|
||||
"libsherpa-onnx-kaldifst-core.dylib",
|
||||
"libucd.dylib",
|
||||
@@ -96,6 +98,7 @@ def process_windows(s, rid):
|
||||
"piper_phonemize.dll",
|
||||
"sherpa-onnx-c-api.dll",
|
||||
"sherpa-onnx-core.dll",
|
||||
"sherpa-onnx-fstfar.lib",
|
||||
"sherpa-onnx-fst.lib",
|
||||
"sherpa-onnx-kaldifst-core.lib",
|
||||
"ucd.dll",
|
||||
|
||||
@@ -67,6 +67,7 @@ namespace SherpaOnnx
|
||||
Model = new OfflineTtsModelConfig();
|
||||
RuleFsts = "";
|
||||
MaxNumSentences = 1;
|
||||
RuleFars = "";
|
||||
}
|
||||
public OfflineTtsModelConfig Model;
|
||||
|
||||
@@ -74,6 +75,9 @@ namespace SherpaOnnx
|
||||
public string RuleFsts;
|
||||
|
||||
public int MaxNumSentences;
|
||||
|
||||
[MarshalAs(UnmanagedType.LPStr)]
|
||||
public string RuleFars;
|
||||
}
|
||||
|
||||
public class OfflineTtsGeneratedAudio
|
||||
|
||||
@@ -41,6 +41,7 @@ if [ ! -f /tmp/linux/libsherpa-onnx-core.so ]; then
|
||||
cd ..
|
||||
rm -v libpiper_phonemize.so libpiper_phonemize.so.1.2.0
|
||||
rm -v libsherpa-onnx-fst.so
|
||||
rm -v libsherpa-onnx-fstfar.so
|
||||
rm -v libonnxruntime.so
|
||||
rm -v libcargs.so
|
||||
rm -rf wheel
|
||||
@@ -67,6 +68,7 @@ if [ ! -f /tmp/macos/libsherpa-onnx-core.dylib ]; then
|
||||
rm -v libonnxruntime.dylib
|
||||
rm -v libpiper_phonemize.1.2.0.dylib libpiper_phonemize.dylib
|
||||
rm -v libsherpa-onnx-fst.dylib
|
||||
rm -v libsherpa-onnx-fstfar.dylib
|
||||
rm -rf wheel
|
||||
ls -lh
|
||||
cd ..
|
||||
|
||||
@@ -2,5 +2,5 @@
|
||||
|
||||
package sherpa_onnx
|
||||
|
||||
// #cgo LDFLAGS: -L ${SRCDIR}/lib/x86_64-apple-darwin -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-native-fbank-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lpiper_phonemize -lespeak-ng -lucd -lonnxruntime -Wl,-rpath,${SRCDIR}/lib/x86_64-apple-darwin
|
||||
// #cgo LDFLAGS: -L ${SRCDIR}/lib/x86_64-apple-darwin -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-native-fbank-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fstfar -lsherpa-onnx-fst -lpiper_phonemize -lespeak-ng -lucd -lonnxruntime -Wl,-rpath,${SRCDIR}/lib/x86_64-apple-darwin
|
||||
import "C"
|
||||
|
||||
@@ -554,6 +554,7 @@ type OfflineTtsModelConfig struct {
|
||||
// OfflineTtsConfig holds the options that NewOfflineTts copies into the
// C-side SherpaOnnxOfflineTtsConfig before creating the TTS engine.
type OfflineTtsConfig struct {
|
||||
// Model is the TTS model configuration (e.g. Model.Vits.Model is passed
// to the C field model.vits.model).
Model OfflineTtsModelConfig
|
||||
// RuleFsts is passed to the C field rule_fsts as a C string.
// NOTE(review): presumably a comma-separated list of rule FST files
// applied from left to right -- confirm against the C++ option help text.
RuleFsts string
|
||||
// RuleFars is passed to the C field rule_fars as a C string.
// NOTE(review): presumably a comma-separated list of FST archive (*.far)
// files applied from left to right -- confirm against the C++ option help text.
RuleFars string
|
||||
// MaxNumSentences is passed to the C field max_num_sentences.
// NOTE(review): appears to bound how many sentences are processed per
// batch -- confirm the meaning of 0 and -1 against the C API.
MaxNumSentences int
|
||||
}
|
||||
|
||||
@@ -583,6 +584,9 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
|
||||
c.rule_fsts = C.CString(config.RuleFsts)
|
||||
defer C.free(unsafe.Pointer(c.rule_fsts))
|
||||
|
||||
c.rule_fars = C.CString(config.RuleFars)
|
||||
defer C.free(unsafe.Pointer(c.rule_fars))
|
||||
|
||||
c.max_num_sentences = C.int(config.MaxNumSentences)
|
||||
|
||||
c.model.vits.model = C.CString(config.Model.Vits.Model)
|
||||
|
||||
@@ -818,6 +818,7 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
|
||||
tts_config.model.debug = config->model.debug;
|
||||
tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu");
|
||||
tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, "");
|
||||
tts_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, "");
|
||||
tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 2);
|
||||
|
||||
if (tts_config.model.debug) {
|
||||
|
||||
@@ -783,6 +783,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig {
|
||||
SherpaOnnxOfflineTtsModelConfig model;
|
||||
const char *rule_fsts;
|
||||
int32_t max_num_sentences;
|
||||
const char *rule_fars;
|
||||
} SherpaOnnxOfflineTtsConfig;
|
||||
|
||||
SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio {
|
||||
|
||||
@@ -164,6 +164,7 @@ endif()
|
||||
|
||||
if(SHERPA_ONNX_ENABLE_TTS)
|
||||
target_link_libraries(sherpa-onnx-core piper_phonemize)
|
||||
target_link_libraries(sherpa-onnx-core fstfar fst)
|
||||
endif()
|
||||
|
||||
if(SHERPA_ONNX_ENABLE_CHECK)
|
||||
|
||||
@@ -18,7 +18,6 @@
|
||||
#endif
|
||||
|
||||
#include <memory>
|
||||
#include <regex> // NOLINT
|
||||
|
||||
#include "sherpa-onnx/csrc/macros.h"
|
||||
#include "sherpa-onnx/csrc/onnx-utils.h"
|
||||
@@ -26,6 +25,55 @@
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// Merge the characters of each marked phrase into a single word.
//
// The input is a list of single UTF-8 characters. A phrase is delimited by
// the three-character start marker "#", "$", "|" and the three-character end
// marker "|", "$", "#". All characters between a start marker and the next
// end marker are concatenated into one output word; the marker characters
// themselves ("|", "$", "#") are never copied into a phrase. Characters
// outside of any phrase are copied to the output unchanged, one per entry.
//
// Edge cases:
//  - A start marker seen while already inside a phrase is ignored.
//  - An end marker seen while not inside a phrase is skipped.
//  - An empty phrase (start marker immediately followed by an end marker)
//    produces no output word instead of an empty string.
//  - A phrase that is never closed does not swallow the rest of the input:
//    its characters are emitted one by one (minus marker characters), as if
//    they were not inside a phrase.
//
// @param words  Input characters, one UTF-8 character per element.
// @return The words with every complete phrase merged into a single entry.
static std::vector<std::string> ProcessHeteronyms(
    const std::vector<std::string> &words) {
  std::vector<std::string> ans;
  ans.reserve(words.size());

  auto is_marker_char = [](const std::string &w) {
    return w == "|" || w == "$" || w == "#";
  };

  int32_t num_words = static_cast<int32_t>(words.size());
  int32_t i = 0;

  // Index of the first character of the currently open phrase,
  // or -1 if we are not inside a phrase.
  int32_t prev = -1;

  while (i < num_words) {
    // start of a phrase #$|
    if ((i + 2 < num_words) && words[i] == "#" && words[i + 1] == "$" &&
        words[i + 2] == "|") {
      if (prev == -1) {
        prev = i + 3;
      }
      i += 3;
      continue;
    }

    // end of a phrase |$#
    if ((i + 2 < num_words) && words[i] == "|" && words[i + 1] == "$" &&
        words[i + 2] == "#") {
      if (prev != -1) {
        std::string phrase;
        for (int32_t k = prev; k < i; ++k) {
          if (!is_marker_char(words[k])) {
            phrase += words[k];
          }
        }

        // An empty phrase carries no text; do not emit an empty word.
        if (!phrase.empty()) {
          ans.push_back(std::move(phrase));
        }

        prev = -1;
      }

      i += 3;
      continue;
    }

    if (prev == -1) {
      // not inside a phrase
      ans.push_back(words[i]);
    }

    ++i;
  }

  if (prev != -1) {
    // The last phrase was opened but never closed. Keep its text rather than
    // silently dropping it: emit the characters individually.
    for (int32_t k = prev; k < num_words; ++k) {
      if (!is_marker_char(words[k])) {
        ans.push_back(words[k]);
      }
    }
  }

  return ans;
}
|
||||
|
||||
static void ToLowerCase(std::string *in_out) {
|
||||
std::transform(in_out->begin(), in_out->end(), in_out->begin(),
|
||||
[](unsigned char c) { return std::tolower(c); });
|
||||
@@ -148,36 +196,9 @@ std::vector<std::vector<int64_t>> Lexicon::ConvertTextToTokenIdsChinese(
|
||||
const std::string &_text) const {
|
||||
std::string text(_text);
|
||||
ToLowerCase(&text);
|
||||
std::vector<std::string> words;
|
||||
if (pattern_) {
|
||||
// Handle polyphones
|
||||
size_t pos = 0;
|
||||
auto begin = std::sregex_iterator(text.begin(), text.end(), *pattern_);
|
||||
auto end = std::sregex_iterator();
|
||||
for (std::sregex_iterator i = begin; i != end; ++i) {
|
||||
std::smatch match = *i;
|
||||
if (pos < match.position()) {
|
||||
auto this_segment = text.substr(pos, match.position() - pos);
|
||||
auto this_segment_words = SplitUtf8(this_segment);
|
||||
words.insert(words.end(), this_segment_words.begin(),
|
||||
this_segment_words.end());
|
||||
pos = match.position() + match.length();
|
||||
} else if (pos == match.position()) {
|
||||
pos = match.position() + match.length();
|
||||
}
|
||||
|
||||
words.push_back(match.str());
|
||||
}
|
||||
|
||||
if (pos < text.size()) {
|
||||
auto this_segment = text.substr(pos, text.size() - pos);
|
||||
auto this_segment_words = SplitUtf8(this_segment);
|
||||
words.insert(words.end(), this_segment_words.begin(),
|
||||
this_segment_words.end());
|
||||
}
|
||||
} else {
|
||||
words = SplitUtf8(text);
|
||||
}
|
||||
std::vector<std::string> words = SplitUtf8(text);
|
||||
words = ProcessHeteronyms(words);
|
||||
|
||||
if (debug_) {
|
||||
fprintf(stderr, "Input text in string: %s\n", text.c_str());
|
||||
@@ -357,9 +378,6 @@ void Lexicon::InitLexicon(std::istream &is) {
|
||||
std::string line;
|
||||
std::string phone;
|
||||
|
||||
std::ostringstream os;
|
||||
std::string sep;
|
||||
|
||||
while (std::getline(is, line)) {
|
||||
std::istringstream iss(line);
|
||||
|
||||
@@ -381,18 +399,9 @@ void Lexicon::InitLexicon(std::istream &is) {
|
||||
if (ids.empty()) {
|
||||
continue;
|
||||
}
|
||||
if (language_ == Language::kChinese && word.size() > 3) {
|
||||
// this is not a single word;
|
||||
os << sep << word;
|
||||
sep = "|";
|
||||
}
|
||||
|
||||
word2ids_.insert({std::move(word), std::move(ids)});
|
||||
}
|
||||
|
||||
if (!sep.empty()) {
|
||||
pattern_ = std::make_unique<std::regex>(os.str());
|
||||
}
|
||||
}
|
||||
|
||||
void Lexicon::InitPunctuations(const std::string &punctuations) {
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <regex> // NOLINT
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
@@ -65,9 +64,6 @@ class Lexicon : public OfflineTtsFrontend {
|
||||
std::unordered_map<std::string, int32_t> token2id_;
|
||||
Language language_;
|
||||
bool debug_;
|
||||
|
||||
// for Chinese polyphones
|
||||
std::unique_ptr<std::regex> pattern_;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
@@ -15,6 +15,9 @@
|
||||
#include "android/asset_manager.h"
|
||||
#include "android/asset_manager_jni.h"
|
||||
#endif
|
||||
|
||||
#include "fst/extensions/far/far.h"
|
||||
#include "kaldifst/csrc/kaldi-fst-io.h"
|
||||
#include "kaldifst/csrc/text-normalizer.h"
|
||||
#include "sherpa-onnx/csrc/lexicon.h"
|
||||
#include "sherpa-onnx/csrc/macros.h"
|
||||
@@ -46,6 +49,32 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
||||
tn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(f));
|
||||
}
|
||||
}
|
||||
|
||||
if (!config.rule_fars.empty()) {
|
||||
if (config.model.debug) {
|
||||
SHERPA_ONNX_LOGE("Loading FST archives");
|
||||
}
|
||||
std::vector<std::string> files;
|
||||
SplitStringToVector(config.rule_fars, ",", false, &files);
|
||||
for (const auto &f : files) {
|
||||
if (config.model.debug) {
|
||||
SHERPA_ONNX_LOGE("rule far: %s", f.c_str());
|
||||
}
|
||||
std::unique_ptr<fst::FarReader<fst::StdArc>> reader(
|
||||
fst::FarReader<fst::StdArc>::Open(f));
|
||||
for (; !reader->Done(); reader->Next()) {
|
||||
std::unique_ptr<fst::StdConstFst> r(
|
||||
fst::CastOrConvertToConstFst(reader->GetFst()->Copy()));
|
||||
|
||||
tn_list_.push_back(
|
||||
std::make_unique<kaldifst::TextNormalizer>(std::move(r)));
|
||||
}
|
||||
}
|
||||
|
||||
if (config.model.debug) {
|
||||
SHERPA_ONNX_LOGE("FST archives loaded!");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if __ANDROID_API__ >= 9
|
||||
|
||||
@@ -20,7 +20,14 @@ void OfflineTtsConfig::Register(ParseOptions *po) {
|
||||
"It not empty, it contains a list of rule FST filenames."
|
||||
"Multiple filenames are separated by a comma and they are "
|
||||
"applied from left to right. An example value: "
|
||||
"rule1.fst,rule2,fst,rule3.fst");
|
||||
"rule1.fst,rule2.fst,rule3.fst");
|
||||
|
||||
po->Register("tts-rule-fars", &rule_fars,
|
||||
"It not empty, it contains a list of rule FST archive filenames."
|
||||
"Multiple filenames are separated by a comma and they are "
|
||||
"applied from left to right. An example value: "
|
||||
"rule1.far,rule2.far,rule3.far. Note that an *.far can contain "
|
||||
"multiple *.fst files");
|
||||
|
||||
po->Register(
|
||||
"tts-max-num-sentences", &max_num_sentences,
|
||||
@@ -41,6 +48,17 @@ bool OfflineTtsConfig::Validate() const {
|
||||
}
|
||||
}
|
||||
|
||||
if (!rule_fars.empty()) {
|
||||
std::vector<std::string> files;
|
||||
SplitStringToVector(rule_fars, ",", false, &files);
|
||||
for (const auto &f : files) {
|
||||
if (!FileExists(f)) {
|
||||
SHERPA_ONNX_LOGE("Rule far %s does not exist. ", f.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return model.Validate();
|
||||
}
|
||||
|
||||
@@ -50,6 +68,7 @@ std::string OfflineTtsConfig::ToString() const {
|
||||
os << "OfflineTtsConfig(";
|
||||
os << "model=" << model.ToString() << ", ";
|
||||
os << "rule_fsts=\"" << rule_fsts << "\", ";
|
||||
os << "rule_fars=\"" << rule_fars << "\", ";
|
||||
os << "max_num_sentences=" << max_num_sentences << ")";
|
||||
|
||||
return os.str();
|
||||
|
||||
@@ -29,6 +29,9 @@ struct OfflineTtsConfig {
|
||||
// If there are multiple rules, they are applied from left to right.
|
||||
std::string rule_fsts;
|
||||
|
||||
// If there are multiple FST archives, they are applied from left to right.
|
||||
std::string rule_fars;
|
||||
|
||||
// Maximum number of sentences that we process at a time.
|
||||
// This is to avoid OOM for very long input text.
|
||||
// If you set it to -1, then we process all sentences in a single batch.
|
||||
@@ -36,9 +39,11 @@ struct OfflineTtsConfig {
|
||||
|
||||
OfflineTtsConfig() = default;
|
||||
OfflineTtsConfig(const OfflineTtsModelConfig &model,
|
||||
const std::string &rule_fsts, int32_t max_num_sentences)
|
||||
const std::string &rule_fsts, const std::string &rule_fars,
|
||||
int32_t max_num_sentences)
|
||||
: model(model),
|
||||
rule_fsts(rule_fsts),
|
||||
rule_fars(rule_fars),
|
||||
max_num_sentences(max_num_sentences) {}
|
||||
|
||||
void Register(ParseOptions *po);
|
||||
|
||||
@@ -878,6 +878,13 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
|
||||
ans.rule_fsts = p;
|
||||
env->ReleaseStringUTFChars(s, p);
|
||||
|
||||
// for ruleFars
|
||||
fid = env->GetFieldID(cls, "ruleFars", "Ljava/lang/String;");
|
||||
s = (jstring)env->GetObjectField(config, fid);
|
||||
p = env->GetStringUTFChars(s, nullptr);
|
||||
ans.rule_fars = p;
|
||||
env->ReleaseStringUTFChars(s, p);
|
||||
|
||||
fid = env->GetFieldID(cls, "maxNumSentences", "I");
|
||||
ans.max_num_sentences = env->GetIntField(config, fid);
|
||||
|
||||
|
||||
@@ -32,11 +32,12 @@ static void PybindOfflineTtsConfig(py::module *m) {
|
||||
py::class_<PyClass>(*m, "OfflineTtsConfig")
|
||||
.def(py::init<>())
|
||||
.def(py::init<const OfflineTtsModelConfig &, const std::string &,
|
||||
int32_t>(),
|
||||
const std::string &, int32_t>(),
|
||||
py::arg("model"), py::arg("rule_fsts") = "",
|
||||
py::arg("max_num_sentences") = 2)
|
||||
py::arg("rule_fars") = "", py::arg("max_num_sentences") = 2)
|
||||
.def_readwrite("model", &PyClass::model)
|
||||
.def_readwrite("rule_fsts", &PyClass::rule_fsts)
|
||||
.def_readwrite("rule_fars", &PyClass::rule_fars)
|
||||
.def_readwrite("max_num_sentences", &PyClass::max_num_sentences)
|
||||
.def("validate", &PyClass::Validate)
|
||||
.def("__str__", &PyClass::ToString);
|
||||
|
||||
@@ -652,12 +652,14 @@ func sherpaOnnxOfflineTtsModelConfig(
|
||||
func sherpaOnnxOfflineTtsConfig(
|
||||
model: SherpaOnnxOfflineTtsModelConfig,
|
||||
ruleFsts: String = "",
|
||||
ruleFars: String = "",
|
||||
maxNumSenetences: Int = 2
|
||||
) -> SherpaOnnxOfflineTtsConfig {
|
||||
return SherpaOnnxOfflineTtsConfig(
|
||||
model: model,
|
||||
rule_fsts: toCPointer(ruleFsts),
|
||||
max_num_sentences: Int32(maxNumSenetences)
|
||||
max_num_sentences: Int32(maxNumSenetences),
|
||||
rule_fars: toCPointer(ruleFars)
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -90,7 +90,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
|
||||
function initSherpaOnnxOfflineTtsConfig(config, Module) {
|
||||
const modelConfig =
|
||||
initSherpaOnnxOfflineTtsModelConfig(config.offlineTtsModelConfig, Module);
|
||||
const len = modelConfig.len + 2 * 4;
|
||||
const len = modelConfig.len + 3 * 4;
|
||||
const ptr = Module._malloc(len);
|
||||
|
||||
let offset = 0;
|
||||
@@ -98,12 +98,19 @@ function initSherpaOnnxOfflineTtsConfig(config, Module) {
|
||||
offset += modelConfig.len;
|
||||
|
||||
const ruleFstsLen = Module.lengthBytesUTF8(config.ruleFsts) + 1;
|
||||
const buffer = Module._malloc(ruleFstsLen);
|
||||
const ruleFarsLen = Module.lengthBytesUTF8(config.ruleFars) + 1;
|
||||
|
||||
const buffer = Module._malloc(ruleFstsLen + ruleFarsLen);
|
||||
Module.stringToUTF8(config.ruleFsts, buffer, ruleFstsLen);
|
||||
Module.stringToUTF8(config.ruleFars, buffer + ruleFstsLen, ruleFarsLen);
|
||||
|
||||
Module.setValue(ptr + offset, buffer, 'i8*');
|
||||
offset += 4;
|
||||
|
||||
Module.setValue(ptr + offset, config.maxNumSentences, 'i32');
|
||||
offset += 4;
|
||||
|
||||
Module.setValue(ptr + offset, buffer + ruleFstsLen, 'i8*');
|
||||
|
||||
return {
|
||||
buffer: buffer, ptr: ptr, len: len, config: modelConfig,
|
||||
@@ -190,6 +197,7 @@ function createOfflineTts(Module, myConfig) {
|
||||
let offlineTtsConfig = {
|
||||
offlineTtsModelConfig: offlineTtsModelConfig,
|
||||
ruleFsts: '',
|
||||
ruleFars: '',
|
||||
maxNumSentences: 1,
|
||||
}
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
|
||||
sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + 3 * 4,
|
||||
"");
|
||||
static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
|
||||
sizeof(SherpaOnnxOfflineTtsModelConfig) + 2 * 4,
|
||||
sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4,
|
||||
"");
|
||||
|
||||
void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
|
||||
@@ -40,6 +40,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
|
||||
|
||||
fprintf(stdout, "----------tts config----------\n");
|
||||
fprintf(stdout, "rule_fsts: %s\n", tts_config->rule_fsts);
|
||||
fprintf(stdout, "rule_fars: %s\n", tts_config->rule_fars);
|
||||
fprintf(stdout, "max num sentences: %d\n", tts_config->max_num_sentences);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user