Support heteronyms in Chinese TTS (#738)

This commit is contained in:
Fangjun Kuang
2024-04-08 11:01:30 +08:00
committed by GitHub
parent c1c0f5bafd
commit a5f8fbc83f
49 changed files with 308 additions and 143 deletions

View File

@@ -70,9 +70,9 @@ rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18
curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2 tar xf vits-piper-en_US-amy-low.tar.bz2
node ./test-offline-tts-en.js node ./test-offline-tts-en.js
rm vits-piper-en_US-amy-low.tar.bz2 rm -rf vits-piper-en_US-amy-low*
curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
tar xvf vits-zh-aishell3.tar.bz2 tar xvf vits-icefall-zh-aishell3.tar.bz2
node ./test-offline-tts-zh.js node ./test-offline-tts-zh.js
rm vits-zh-aishell3.tar.bz2 rm -rf vits-icefall-zh-aishell3*

View File

@@ -173,6 +173,7 @@ jobs:
rm -v $dst/lib/libasound.so rm -v $dst/lib/libasound.so
rm -v $dst/lib/libonnxruntime.so rm -v $dst/lib/libonnxruntime.so
rm -v $dst/lib/libsherpa-onnx-fst.so rm -v $dst/lib/libsherpa-onnx-fst.so
rm -v $dst/lib/libsherpa-onnx-fstfar.so
fi fi
tree $dst tree $dst

View File

@@ -211,6 +211,7 @@ jobs:
rm -fv $dst/lib/libasound.so rm -fv $dst/lib/libasound.so
rm -fv $dst/lib/libonnxruntime.so rm -fv $dst/lib/libonnxruntime.so
rm -fv $dst/lib/libsherpa-onnx-fst.so rm -fv $dst/lib/libsherpa-onnx-fst.so
rm -fv $dst/lib/libsherpa-onnx-fstfar.so
fi fi
tree $dst tree $dst

View File

@@ -111,9 +111,11 @@ jobs:
rm -rf vits-vctk rm -rf vits-vctk
echo "Test vits-zh-aishell3" echo "Test vits-zh-aishell3"
git clone https://huggingface.co/csukuangfj/vits-zh-aishell3 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
tar xvf vits-icefall-zh-aishell3.tar.bz2
rm vits-icefall-zh-aishell3.tar.bz2
./run-vits-zh-aishell3.sh ./run-vits-zh-aishell3.sh
rm -rf vits-zh-aishell3 rm -rf vits-icefall-zh-aishell3
echo "Test vits-piper-en_US-lessac-medium" echo "Test vits-piper-en_US-lessac-medium"
git clone https://huggingface.co/csukuangfj/vits-piper-en_US-lessac-medium git clone https://huggingface.co/csukuangfj/vits-piper-en_US-lessac-medium

1
.gitignore vendored
View File

@@ -90,3 +90,4 @@ sherpa-onnx-paraformer-trilingual-zh-cantonese-en
sr-data sr-data
*xcworkspace/xcuserdata/* *xcworkspace/xcuserdata/*
vits-icefall-*

View File

@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.13 FATAL_ERROR) cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
project(sherpa-onnx) project(sherpa-onnx)
set(SHERPA_ONNX_VERSION "1.9.16") set(SHERPA_ONNX_VERSION "1.9.17")
# Disable warning about # Disable warning about
# #

View File

@@ -155,6 +155,7 @@ class MainActivity : AppCompatActivity() {
var modelDir: String? var modelDir: String?
var modelName: String? var modelName: String?
var ruleFsts: String? var ruleFsts: String?
var ruleFars: String?
var lexicon: String? var lexicon: String?
var dataDir: String? var dataDir: String?
var assets: AssetManager? = application.assets var assets: AssetManager? = application.assets
@@ -165,6 +166,7 @@ class MainActivity : AppCompatActivity() {
modelDir = null modelDir = null
modelName = null modelName = null
ruleFsts = null ruleFsts = null
ruleFars = null
lexicon = null lexicon = null
dataDir = null dataDir = null
@@ -181,9 +183,11 @@ class MainActivity : AppCompatActivity() {
// dataDir = "vits-piper-en_US-amy-low/espeak-ng-data" // dataDir = "vits-piper-en_US-amy-low/espeak-ng-data"
// Example 3: // Example 3:
// modelDir = "vits-zh-aishell3" // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
// modelName = "vits-aishell3.onnx" // modelDir = "vits-icefall-zh-aishell3"
// ruleFsts = "vits-zh-aishell3/rule.fst" // modelName = "model.onnx"
// ruleFsts = "vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst"
// ruleFars = "vits-icefall-zh-aishell3/rule.far"
// lexicon = "lexicon.txt" // lexicon = "lexicon.txt"
// Example 4: // Example 4:
@@ -202,7 +206,8 @@ class MainActivity : AppCompatActivity() {
val config = getOfflineTtsConfig( val config = getOfflineTtsConfig(
modelDir = modelDir!!, modelName = modelName!!, lexicon = lexicon ?: "", modelDir = modelDir!!, modelName = modelName!!, lexicon = lexicon ?: "",
dataDir = dataDir ?: "", dataDir = dataDir ?: "",
ruleFsts = ruleFsts ?: "" ruleFsts = ruleFsts ?: "",
ruleFars = ruleFars ?: "",
)!! )!!
tts = OfflineTts(assetManager = assets, config = config) tts = OfflineTts(assetManager = assets, config = config)

View File

@@ -23,6 +23,7 @@ data class OfflineTtsModelConfig(
data class OfflineTtsConfig( data class OfflineTtsConfig(
var model: OfflineTtsModelConfig, var model: OfflineTtsModelConfig,
var ruleFsts: String = "", var ruleFsts: String = "",
var ruleFars: String = "",
var maxNumSentences: Int = 1, var maxNumSentences: Int = 1,
) )
@@ -151,7 +152,8 @@ fun getOfflineTtsConfig(
modelName: String, modelName: String,
lexicon: String, lexicon: String,
dataDir: String, dataDir: String,
ruleFsts: String ruleFsts: String,
ruleFars: String
): OfflineTtsConfig? { ): OfflineTtsConfig? {
return OfflineTtsConfig( return OfflineTtsConfig(
model = OfflineTtsModelConfig( model = OfflineTtsModelConfig(
@@ -166,5 +168,6 @@ fun getOfflineTtsConfig(
provider = "cpu", provider = "cpu",
), ),
ruleFsts = ruleFsts, ruleFsts = ruleFsts,
ruleFars = ruleFars,
) )
} }

View File

@@ -39,6 +39,7 @@ object TtsEngine {
private var modelDir: String? = null private var modelDir: String? = null
private var modelName: String? = null private var modelName: String? = null
private var ruleFsts: String? = null private var ruleFsts: String? = null
private var ruleFars: String? = null
private var lexicon: String? = null private var lexicon: String? = null
private var dataDir: String? = null private var dataDir: String? = null
private var assets: AssetManager? = null private var assets: AssetManager? = null
@@ -50,6 +51,7 @@ object TtsEngine {
modelDir = null modelDir = null
modelName = null modelName = null
ruleFsts = null ruleFsts = null
ruleFars = null
lexicon = null lexicon = null
dataDir = null dataDir = null
lang = null lang = null
@@ -73,9 +75,10 @@ object TtsEngine {
// Example 3: // Example 3:
// https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
// modelDir = "vits-zh-aishell3" // modelDir = "vits-icefall-zh-aishell3"
// modelName = "vits-aishell3.onnx" // modelName = "model.onnx"
// ruleFsts = "vits-zh-aishell3/rule.fst" // ruleFsts = "vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst,vits-icefall-zh-aishell3/new_heteronym.fst"
// ruleFars = "vits-icefall-zh-aishell3/rule.far"
// lexicon = "lexicon.txt" // lexicon = "lexicon.txt"
// lang = "zho" // lang = "zho"
@@ -108,7 +111,8 @@ object TtsEngine {
val config = getOfflineTtsConfig( val config = getOfflineTtsConfig(
modelDir = modelDir!!, modelName = modelName!!, lexicon = lexicon ?: "", modelDir = modelDir!!, modelName = modelName!!, lexicon = lexicon ?: "",
dataDir = dataDir ?: "", dataDir = dataDir ?: "",
ruleFsts = ruleFsts ?: "" ruleFsts = ruleFsts ?: "",
ruleFars = ruleFars ?: ""
)!! )!!
tts = OfflineTts(assetManager = assets, config = config) tts = OfflineTts(assetManager = assets, config = config)

View File

@@ -124,6 +124,7 @@ echo "Generate xcframework"
mkdir -p "build/simulator/lib" mkdir -p "build/simulator/lib"
for f in libkaldi-native-fbank-core.a libsherpa-onnx-c-api.a libsherpa-onnx-core.a \ for f in libkaldi-native-fbank-core.a libsherpa-onnx-c-api.a libsherpa-onnx-core.a \
libsherpa-onnx-fstfar.a \
libsherpa-onnx-fst.a libsherpa-onnx-kaldifst-core.a libkaldi-decoder-core.a \ libsherpa-onnx-fst.a libsherpa-onnx-kaldifst-core.a libkaldi-decoder-core.a \
libucd.a libpiper_phonemize.a libespeak-ng.a; do libucd.a libpiper_phonemize.a libespeak-ng.a; do
lipo -create build/simulator_arm64/lib/${f} \ lipo -create build/simulator_arm64/lib/${f} \
@@ -137,6 +138,7 @@ libtool -static -o build/simulator/sherpa-onnx.a \
build/simulator/lib/libkaldi-native-fbank-core.a \ build/simulator/lib/libkaldi-native-fbank-core.a \
build/simulator/lib/libsherpa-onnx-c-api.a \ build/simulator/lib/libsherpa-onnx-c-api.a \
build/simulator/lib/libsherpa-onnx-core.a \ build/simulator/lib/libsherpa-onnx-core.a \
build/simulator/lib/libsherpa-onnx-fstfar.a \
build/simulator/lib/libsherpa-onnx-fst.a \ build/simulator/lib/libsherpa-onnx-fst.a \
build/simulator/lib/libsherpa-onnx-kaldifst-core.a \ build/simulator/lib/libsherpa-onnx-kaldifst-core.a \
build/simulator/lib/libkaldi-decoder-core.a \ build/simulator/lib/libkaldi-decoder-core.a \
@@ -148,6 +150,7 @@ libtool -static -o build/os64/sherpa-onnx.a \
build/os64/lib/libkaldi-native-fbank-core.a \ build/os64/lib/libkaldi-native-fbank-core.a \
build/os64/lib/libsherpa-onnx-c-api.a \ build/os64/lib/libsherpa-onnx-c-api.a \
build/os64/lib/libsherpa-onnx-core.a \ build/os64/lib/libsherpa-onnx-core.a \
build/os64/lib/libsherpa-onnx-fstfar.a \
build/os64/lib/libsherpa-onnx-fst.a \ build/os64/lib/libsherpa-onnx-fst.a \
build/os64/lib/libsherpa-onnx-kaldifst-core.a \ build/os64/lib/libsherpa-onnx-kaldifst-core.a \
build/os64/lib/libkaldi-decoder-core.a \ build/os64/lib/libkaldi-decoder-core.a \

View File

@@ -27,6 +27,7 @@ libtool -static -o ./install/lib/libsherpa-onnx.a \
./install/lib/libsherpa-onnx-c-api.a \ ./install/lib/libsherpa-onnx-c-api.a \
./install/lib/libsherpa-onnx-core.a \ ./install/lib/libsherpa-onnx-core.a \
./install/lib/libkaldi-native-fbank-core.a \ ./install/lib/libkaldi-native-fbank-core.a \
./install/lib/libsherpa-onnx-fstfar.a \
./install/lib/libsherpa-onnx-fst.a \ ./install/lib/libsherpa-onnx-fst.a \
./install/lib/libsherpa-onnx-kaldifst-core.a \ ./install/lib/libsherpa-onnx-kaldifst-core.a \
./install/lib/libkaldi-decoder-core.a \ ./install/lib/libkaldi-decoder-core.a \

View File

@@ -4,7 +4,7 @@ CUR_DIR :=$(shell pwd)
CFLAGS := -I ../ -I ../build/_deps/cargs-src/include/ CFLAGS := -I ../ -I ../build/_deps/cargs-src/include/
LDFLAGS := -L ../build/lib LDFLAGS := -L ../build/lib
LDFLAGS += -L ../build/_deps/onnxruntime-src/lib LDFLAGS += -L ../build/_deps/onnxruntime-src/lib
LDFLAGS += -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lcargs -lonnxruntime LDFLAGS += -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fstfar -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lcargs -lonnxruntime
LDFLAGS += -framework Foundation LDFLAGS += -framework Foundation
LDFLAGS += -lc++ LDFLAGS += -lc++
LDFLAGS += -Wl,-rpath,${CUR_DIR}/../build/lib LDFLAGS += -Wl,-rpath,${CUR_DIR}/../build/lib

View File

@@ -78,6 +78,7 @@ def get_binaries():
"piper_phonemize.dll", "piper_phonemize.dll",
"sherpa-onnx-c-api.dll", "sherpa-onnx-c-api.dll",
"sherpa-onnx-core.dll", "sherpa-onnx-core.dll",
"sherpa-onnx-fstfar.lib",
"sherpa-onnx-fst.lib", "sherpa-onnx-fst.lib",
"sherpa-onnx-kaldifst-core.lib", "sherpa-onnx-kaldifst-core.lib",
"sherpa-onnx-portaudio.dll", "sherpa-onnx-portaudio.dll",

View File

@@ -64,12 +64,22 @@ function(download_kaldi_decoder)
kaldifst_core kaldifst_core
fst fst
DESTINATION ..) DESTINATION ..)
if(SHERPA_ONNX_ENABLE_TTS)
install(TARGETS
fstfar
DESTINATION ..)
endif()
else() else()
install(TARGETS install(TARGETS
kaldi-decoder-core kaldi-decoder-core
kaldifst_core kaldifst_core
fst fst
DESTINATION lib) DESTINATION lib)
if(SHERPA_ONNX_ENABLE_TTS)
install(TARGETS
fstfar
DESTINATION lib)
endif()
endif() endif()
if(WIN32 AND BUILD_SHARED_LIBS) if(WIN32 AND BUILD_SHARED_LIBS)
@@ -78,6 +88,11 @@ function(download_kaldi_decoder)
kaldifst_core kaldifst_core
fst fst
DESTINATION bin) DESTINATION bin)
if(SHERPA_ONNX_ENABLE_TTS)
install(TARGETS
fstfar
DESTINATION bin)
endif()
endif() endif()
endfunction() endfunction()

View File

@@ -50,13 +50,7 @@ function(download_kaldifst)
${kaldifst_SOURCE_DIR}/ ${kaldifst_SOURCE_DIR}/
) )
target_include_directories(fst
PUBLIC
${openfst_SOURCE_DIR}/src/include
)
set_target_properties(kaldifst_core PROPERTIES OUTPUT_NAME "sherpa-onnx-kaldifst-core") set_target_properties(kaldifst_core PROPERTIES OUTPUT_NAME "sherpa-onnx-kaldifst-core")
set_target_properties(fst PROPERTIES OUTPUT_NAME "sherpa-onnx-fst")
endfunction() endfunction()
download_kaldifst() download_kaldifst()

View File

@@ -4,7 +4,7 @@ function(download_openfst)
include(FetchContent) include(FetchContent)
set(openfst_URL "https://github.com/kkm000/openfst/archive/refs/tags/win/1.6.5.1.tar.gz") set(openfst_URL "https://github.com/kkm000/openfst/archive/refs/tags/win/1.6.5.1.tar.gz")
set(openfst_URL2 "https://huggingface.co/csukuangfj/kaldi-hmm-gmm-cmake-deps/resolve/main/openfst-win-1.6.5.1.tar.gz") set(openfst_URL2 "https://hub.nuaa.cf/kkm000/openfst/archive/refs/tags/win/1.6.5.1.tar.gz")
set(openfst_HASH "SHA256=02c49b559c3976a536876063369efc0e41ab374be1035918036474343877046e") set(openfst_HASH "SHA256=02c49b559c3976a536876063369efc0e41ab374be1035918036474343877046e")
# If you don't have access to the Internet, # If you don't have access to the Internet,
@@ -31,7 +31,7 @@ function(download_openfst)
set(HAVE_COMPACT OFF CACHE BOOL "" FORCE) set(HAVE_COMPACT OFF CACHE BOOL "" FORCE)
set(HAVE_COMPRESS OFF CACHE BOOL "" FORCE) set(HAVE_COMPRESS OFF CACHE BOOL "" FORCE)
set(HAVE_CONST OFF CACHE BOOL "" FORCE) set(HAVE_CONST OFF CACHE BOOL "" FORCE)
set(HAVE_FAR OFF CACHE BOOL "" FORCE) set(HAVE_FAR ON CACHE BOOL "" FORCE)
set(HAVE_GRM OFF CACHE BOOL "" FORCE) set(HAVE_GRM OFF CACHE BOOL "" FORCE)
set(HAVE_PDT OFF CACHE BOOL "" FORCE) set(HAVE_PDT OFF CACHE BOOL "" FORCE)
set(HAVE_MPDT OFF CACHE BOOL "" FORCE) set(HAVE_MPDT OFF CACHE BOOL "" FORCE)
@@ -70,20 +70,21 @@ function(download_openfst)
add_subdirectory(${openfst_SOURCE_DIR} ${openfst_BINARY_DIR} EXCLUDE_FROM_ALL) add_subdirectory(${openfst_SOURCE_DIR} ${openfst_BINARY_DIR} EXCLUDE_FROM_ALL)
set(openfst_SOURCE_DIR ${openfst_SOURCE_DIR} PARENT_SCOPE) set(openfst_SOURCE_DIR ${openfst_SOURCE_DIR} PARENT_SCOPE)
# Rename libfst.so.6 to libkaldifst_fst.so.6 to avoid potential conflicts # Rename libfst.so.6 to libsherpa-onnx-fst.so.6 to avoid potential conflicts
# when kaldifst is installed. # when sherpa-onnx is installed.
set_target_properties(fst PROPERTIES OUTPUT_NAME "kaldifst_fst") set_target_properties(fst PROPERTIES OUTPUT_NAME "sherpa-onnx-fst")
set_target_properties(fstfar PROPERTIES OUTPUT_NAME "sherpa-onnx-fstfar")
install(TARGETS fst target_include_directories(fst
DESTINATION lib PUBLIC
${openfst_SOURCE_DIR}/src/include
) )
if(KALDIFST_BUILD_PYTHON) target_include_directories(fstfar
set_target_properties(fstscript PROPERTIES OUTPUT_NAME "kaldifst_fstscript") PUBLIC
install(TARGETS fstscript ${openfst_SOURCE_DIR}/src/include
DESTINATION lib )
) # installed in ./kaldi-decoder.cmake
endif()
endfunction() endfunction()
download_openfst() download_openfst()

View File

@@ -13,4 +13,4 @@ Cflags: -I"${includedir}"
# Note: -lcargs is required only for the following file # Note: -lcargs is required only for the following file
# https://github.com/k2-fsa/sherpa-onnx/blob/master/c-api-examples/decode-file-c-api.c # https://github.com/k2-fsa/sherpa-onnx/blob/master/c-api-examples/decode-file-c-api.c
# We add it here so that users don't need to specify -lcargs when compiling decode-file-c-api.c # We add it here so that users don't need to specify -lcargs when compiling decode-file-c-api.c
Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lcargs -lonnxruntime -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@ Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fstfar -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lcargs -lonnxruntime -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@

View File

@@ -20,6 +20,9 @@ class OfflineTtsDemo
[Option("tts-rule-fsts", Required = false, Default = "", HelpText = "path to rule.fst")] [Option("tts-rule-fsts", Required = false, Default = "", HelpText = "path to rule.fst")]
public string RuleFsts { get; set; } public string RuleFsts { get; set; }
[Option("tts-rule-fars", Required = false, Default = "", HelpText = "path to rule.far")]
public string RuleFars { get; set; }
[Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")] [Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
public string DataDir { get; set; } public string DataDir { get; set; }
@@ -72,14 +75,15 @@ class OfflineTtsDemo
string usage = @" string usage = @"
# vits-aishell3 # vits-aishell3
wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
tar xf vits-zh-aishell3.tar.bz2 tar xvf vits-icefall-zh-aishell3.tar.bz2
dotnet run \ dotnet run \
--vits-model=./vits-zh-aishell3/vits-aishell3.onnx \ --vits-model=./vits-icefall-zh-aishell3/model.onnx \
--vits-tokens=./vits-zh-aishell3/tokens.txt \ --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
--vits-lexicon=./vits-zh-aishell3/lexicon.txt \ --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
--tts-rule-fsts=./vits-zh-aishell3/rule.fst \ --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
--tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \
--sid=66 \ --sid=66 \
--debug=1 \ --debug=1 \
--output-filename=./aishell3-66.wav \ --output-filename=./aishell3-66.wav \
@@ -127,6 +131,7 @@ to download more models.
config.Model.Debug = options.Debug; config.Model.Debug = options.Debug;
config.Model.Provider = "cpu"; config.Model.Provider = "cpu";
config.RuleFsts = options.RuleFsts; config.RuleFsts = options.RuleFsts;
config.RuleFars = options.RuleFars;
config.MaxNumSentences = options.MaxNumSentences; config.MaxNumSentences = options.MaxNumSentences;
OfflineTts tts = new OfflineTts(config); OfflineTts tts = new OfflineTts(config);

View File

@@ -1,18 +1,18 @@
#!/usr/bin/env bash #!/usr/bin/env bash
set -ex set -ex
if [ ! -f ./vits-zh-aishell3/vits-aishell3.onnx ]; then if [ ! -f ./vits-icefall-zh-aishell3/model.onnx ]; then
# wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
curl -OL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 tar xvf vits-icefall-zh-aishell3.tar.bz2
tar xf vits-zh-aishell3.tar.bz2 rm vits-icefall-zh-aishell3.tar.bz2
rm vits-zh-aishell3.tar.bz2
fi fi
dotnet run \ dotnet run \
--vits-model=./vits-zh-aishell3/vits-aishell3.onnx \ --vits-model=./vits-icefall-zh-aishell3/model.onnx \
--vits-tokens=./vits-zh-aishell3/tokens.txt \ --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
--vits-lexicon=./vits-zh-aishell3/lexicon.txt \ --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
--tts-rule-fsts=./vits-zh-aishell3/rule.fst \ --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
--tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \
--sid=66 \ --sid=66 \
--debug=1 \ --debug=1 \
--output-filename=./aishell3-66.wav \ --output-filename=./aishell3-66.wav \
--text="这是一个语音合成测试, 写于公元 2024 年 1 月 28 号, 23点27分星期天。" --text="这是一个语音合成测试, 写于公元 2024 年 1 月 28 号, 23点27分星期天。长沙长大,去过长白山和长安街。行行出状元。行行,银行行长,行业。"

View File

@@ -26,6 +26,7 @@ func main() {
flag.IntVar(&config.Model.Debug, "debug", 0, "Whether to show debug message") flag.IntVar(&config.Model.Debug, "debug", 0, "Whether to show debug message")
flag.StringVar(&config.Model.Provider, "provider", "cpu", "Provider to use") flag.StringVar(&config.Model.Provider, "provider", "cpu", "Provider to use")
flag.StringVar(&config.RuleFsts, "tts-rule-fsts", "", "Path to rule.fst") flag.StringVar(&config.RuleFsts, "tts-rule-fsts", "", "Path to rule.fst")
flag.StringVar(&config.RuleFars, "tts-rule-fars", "", "Path to rule.far")
flag.IntVar(&config.MaxNumSentences, "tts-max-num-sentences", 1, "Batch size") flag.IntVar(&config.MaxNumSentences, "tts-max-num-sentences", 1, "Batch size")
flag.IntVar(&sid, "sid", 0, "Speaker ID. Used only for multi-speaker models") flag.IntVar(&sid, "sid", 0, "Speaker ID. Used only for multi-speaker models")

View File

@@ -6,21 +6,32 @@
for sid in 10 33 99; do for sid in 10 33 99; do
./non-streaming-tts \ ./non-streaming-tts \
--vits-model=./vits-zh-aishell3/vits-aishell3.onnx \ --vits-model=./vits-icefall-zh-aishell3/model.onnx \
--vits-lexicon=./vits-zh-aishell3/lexicon.txt \ --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
--vits-tokens=./vits-zh-aishell3/tokens.txt \ --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
--sid=$sid \ --sid=$sid \
--debug=1 \ --debug=1 \
--output-filename=./liliana-$sid.wav \ --output-filename=./liliana-$sid.wav \
"林美丽最美丽、最漂亮、最可爱!" "林美丽最美丽、最漂亮、最可爱!"
./non-streaming-tts \ ./non-streaming-tts \
--vits-model=./vits-zh-aishell3/vits-aishell3.onnx \ --vits-model=./vits-icefall-zh-aishell3/model.onnx \
--vits-lexicon=./vits-zh-aishell3/lexicon.txt \ --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
--vits-tokens=./vits-zh-aishell3/tokens.txt \ --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
--tts-rule-fsts=./vits-zh-aishell3/rule.fst \ --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
--sid=$sid \ --sid=$sid \
--debug=1 \ --debug=1 \
--output-filename=./numbers-$sid.wav \ --output-filename=./numbers-$sid.wav \
"数字12345.6789怎么念" "数字12345.6789怎么念"
./non-streaming-tts \
--vits-model=./vits-icefall-zh-aishell3/model.onnx \
--vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
--vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
--tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
--tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \
--sid=$sid \
--debug=1 \
--output-filename=./heteronym-$sid.wav \
"万古长存长沙长大长白山长孙长安街"
done done

View File

@@ -7,10 +7,9 @@
import Foundation import Foundation
// used to get the path to espeak-ng-data // used to get the path to espeak-ng-data
func resourceURL(to path: String) -> String { func resourceURL(to path: String) -> String {
return URL(string: path, relativeTo: Bundle.main.resourceURL)!.path return URL(string: path, relativeTo: Bundle.main.resourceURL)!.path
} }
func getResource(_ forResource: String, _ ofType: String) -> String { func getResource(_ forResource: String, _ ofType: String) -> String {
@@ -50,8 +49,7 @@ func getTtsForAishell3() -> SherpaOnnxOfflineTtsWrapper {
// See the following link // See the following link
// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-model-aishell3 // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-model-aishell3
// vits-vctk.onnx let model = getResource("model", "onnx")
let model = getResource("vits-aishell3", "onnx")
// lexicon.txt // lexicon.txt
let lexicon = getResource("lexicon", "txt") let lexicon = getResource("lexicon", "txt")
@@ -59,9 +57,19 @@ func getTtsForAishell3() -> SherpaOnnxOfflineTtsWrapper {
// tokens.txt // tokens.txt
let tokens = getResource("tokens", "txt") let tokens = getResource("tokens", "txt")
// rule.fst
let ruleFsts = getResource("rule", "fst")
// rule.far
let ruleFars = getResource("rule", "far")
let vits = sherpaOnnxOfflineTtsVitsModelConfig(model: model, lexicon: lexicon, tokens: tokens) let vits = sherpaOnnxOfflineTtsVitsModelConfig(model: model, lexicon: lexicon, tokens: tokens)
let modelConfig = sherpaOnnxOfflineTtsModelConfig(vits: vits) let modelConfig = sherpaOnnxOfflineTtsModelConfig(vits: vits)
var config = sherpaOnnxOfflineTtsConfig(model: modelConfig) var config = sherpaOnnxOfflineTtsConfig(
model: modelConfig,
ruleFsts: ruleFsts,
ruleFars: ruleFars
)
return SherpaOnnxOfflineTtsWrapper(config: &config) return SherpaOnnxOfflineTtsWrapper(config: &config)
} }
@@ -69,7 +77,6 @@ func getTtsForAishell3() -> SherpaOnnxOfflineTtsWrapper {
func getTtsFor_en_US_amy_low() -> SherpaOnnxOfflineTtsWrapper { func getTtsFor_en_US_amy_low() -> SherpaOnnxOfflineTtsWrapper {
// please see https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 // please see https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
// vits-vctk.onnx
let model = getResource("en_US-amy-low", "onnx") let model = getResource("en_US-amy-low", "onnx")
// tokens.txt // tokens.txt
@@ -78,7 +85,8 @@ func getTtsFor_en_US_amy_low() -> SherpaOnnxOfflineTtsWrapper {
// in this case, we don't need lexicon.txt // in this case, we don't need lexicon.txt
let dataDir = resourceURL(to: "espeak-ng-data") let dataDir = resourceURL(to: "espeak-ng-data")
let vits = sherpaOnnxOfflineTtsVitsModelConfig(model: model, lexicon: "", tokens: tokens, dataDir: dataDir) let vits = sherpaOnnxOfflineTtsVitsModelConfig(
model: model, lexicon: "", tokens: tokens, dataDir: dataDir)
let modelConfig = sherpaOnnxOfflineTtsModelConfig(vits: vits) let modelConfig = sherpaOnnxOfflineTtsModelConfig(vits: vits)
var config = sherpaOnnxOfflineTtsConfig(model: modelConfig) var config = sherpaOnnxOfflineTtsConfig(model: modelConfig)

View File

@@ -11,6 +11,7 @@
sherpa-onnx-core.lib; sherpa-onnx-core.lib;
kaldi-decoder-core.lib; kaldi-decoder-core.lib;
sherpa-onnx-kaldifst-core.lib; sherpa-onnx-kaldifst-core.lib;
sherpa-onnx-fstfar.lib;
sherpa-onnx-fst.lib; sherpa-onnx-fst.lib;
kaldi-native-fbank-core.lib; kaldi-native-fbank-core.lib;
onnxruntime.lib; onnxruntime.lib;

View File

@@ -11,6 +11,7 @@
sherpa-onnx-core.lib; sherpa-onnx-core.lib;
kaldi-decoder-core.lib; kaldi-decoder-core.lib;
sherpa-onnx-kaldifst-core.lib; sherpa-onnx-kaldifst-core.lib;
sherpa-onnx-fstfar.lib;
sherpa-onnx-fst.lib; sherpa-onnx-fst.lib;
kaldi-native-fbank-core.lib; kaldi-native-fbank-core.lib;
onnxruntime.lib; onnxruntime.lib;

View File

@@ -11,6 +11,7 @@
sherpa-onnx-core.lib; sherpa-onnx-core.lib;
kaldi-decoder-core.lib; kaldi-decoder-core.lib;
sherpa-onnx-kaldifst-core.lib; sherpa-onnx-kaldifst-core.lib;
sherpa-onnx-fstfar.lib;
sherpa-onnx-fst.lib; sherpa-onnx-fst.lib;
kaldi-native-fbank-core.lib; kaldi-native-fbank-core.lib;
onnxruntime.lib; onnxruntime.lib;

View File

@@ -43,8 +43,8 @@ for text-to-speech.
You can use the following command to run it: You can use the following command to run it:
```bash ```bash
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
tar xvf vits-zh-aishell3.tar.bz2 tar xvf vits-icefall-zh-aishell3.tar.bz2
node ./test-offline-tts-zh.js node ./test-offline-tts-zh.js
``` ```

View File

@@ -22,6 +22,7 @@ function createOfflineTts() {
let offlineTtsConfig = { let offlineTtsConfig = {
offlineTtsModelConfig: offlineTtsModelConfig, offlineTtsModelConfig: offlineTtsModelConfig,
ruleFsts: '', ruleFsts: '',
ruleFars: '',
maxNumSentences: 1, maxNumSentences: 1,
}; };

View File

@@ -4,9 +4,9 @@ const sherpa_onnx = require('sherpa-onnx');
function createOfflineTts() { function createOfflineTts() {
let offlineTtsVitsModelConfig = { let offlineTtsVitsModelConfig = {
model: './vits-zh-aishell3/vits-aishell3.onnx', model: './vits-icefall-zh-aishell3/model.onnx',
lexicon: './vits-zh-aishell3/lexicon.txt', lexicon: './vits-icefall-zh-aishell3/lexicon.txt',
tokens: './vits-zh-aishell3/tokens.txt', tokens: './vits-icefall-zh-aishell3/tokens.txt',
dataDir: '', dataDir: '',
noiseScale: 0.667, noiseScale: 0.667,
noiseScaleW: 0.8, noiseScaleW: 0.8,
@@ -21,7 +21,9 @@ function createOfflineTts() {
let offlineTtsConfig = { let offlineTtsConfig = {
offlineTtsModelConfig: offlineTtsModelConfig, offlineTtsModelConfig: offlineTtsModelConfig,
ruleFsts: './vits-zh-aishell3/rule.fst', ruleFsts:
'./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst,./vits-icefall-zh-aishell3/new_heteronym.fst',
ruleFars: './vits-icefall-zh-aishell3/rule.far',
maxNumSentences: 1, maxNumSentences: 1,
}; };

View File

@@ -56,6 +56,11 @@ sed -i.bak s/"lang = null"/"lang = \"$lang_iso_639_3\""/ ./TtsEngine.kt
sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./TtsEngine.kt sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./TtsEngine.kt
{% endif %} {% endif %}
{% if tts_model.rule_fars %}
rule_fars={{ tts_model.rule_fars }}
sed -i.bak s%"ruleFars = null"%"ruleFars = \"$rule_fars\""% ./TtsEngine.kt
{% endif %}
{% if tts_model.data_dir %} {% if tts_model.data_dir %}
data_dir={{ tts_model.data_dir }} data_dir={{ tts_model.data_dir }}
sed -i.bak s%"dataDir = null"%"dataDir = \"$data_dir\""% ./TtsEngine.kt sed -i.bak s%"dataDir = null"%"dataDir = \"$data_dir\""% ./TtsEngine.kt

View File

@@ -54,6 +54,11 @@ sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./MainActivity.kt
sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./MainActivity.kt sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./MainActivity.kt
{% endif %} {% endif %}
{% if tts_model.rule_fars %}
rule_fars={{ tts_model.rule_fars }}
sed -i.bak s%"ruleFars = null"%"ruleFars = \"$rule_fars\""% ./MainActivity.kt
{% endif %}
{% if tts_model.data_dir %} {% if tts_model.data_dir %}
data_dir={{ tts_model.data_dir }} data_dir={{ tts_model.data_dir }}
sed -i.bak s%"dataDir = null"%"dataDir = \"$data_dir\""% ./MainActivity.kt sed -i.bak s%"dataDir = null"%"dataDir = \"$data_dir\""% ./MainActivity.kt

View File

@@ -33,6 +33,7 @@ class TtsModel:
model_name: str = "" model_name: str = ""
lang: str = "" # en, zh, fr, de, etc. lang: str = "" # en, zh, fr, de, etc.
rule_fsts: Optional[List[str]] = None rule_fsts: Optional[List[str]] = None
rule_fars: Optional[List[str]] = None
data_dir: Optional[str] = None data_dir: Optional[str] = None
is_char: bool = False is_char: bool = False
lang_iso_639_3: str = "" lang_iso_639_3: str = ""
@@ -241,98 +242,94 @@ def get_mimic3_models() -> List[TtsModel]:
def get_vits_models() -> List[TtsModel]: def get_vits_models() -> List[TtsModel]:
return [ chinese_models = [
# Chinese # Chinese
TtsModel( TtsModel(
model_dir="vits-icefall-zh-aishell3", model_dir="vits-icefall-zh-aishell3",
model_name="model.onnx", model_name="model.onnx",
lang="zh", lang="zh",
rule_fsts="vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/rule.fst", rule_fsts="vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst,vits-icefall-zh-aishell3/new_heteronym.fst",
rule_fars="vits-icefall-zh-aishell3/rule.far",
), ),
TtsModel( TtsModel(
model_dir="vits-zh-aishell3", model_dir="vits-zh-aishell3",
model_name="vits-aishell3.onnx", model_name="vits-aishell3.onnx",
lang="zh", lang="zh",
rule_fsts="vits-zh-aishell3/rule.fst",
), ),
TtsModel( TtsModel(
model_dir="vits-zh-hf-doom", model_dir="vits-zh-hf-doom",
model_name="doom.onnx", model_name="doom.onnx",
lang="zh", lang="zh",
rule_fsts="vits-zh-hf-doom/rule.fst",
), ),
TtsModel( TtsModel(
model_dir="vits-zh-hf-echo", model_dir="vits-zh-hf-echo",
model_name="echo.onnx", model_name="echo.onnx",
lang="zh", lang="zh",
rule_fsts="vits-zh-hf-echo/rule.fst",
), ),
TtsModel( TtsModel(
model_dir="vits-zh-hf-zenyatta", model_dir="vits-zh-hf-zenyatta",
model_name="zenyatta.onnx", model_name="zenyatta.onnx",
lang="zh", lang="zh",
rule_fsts="vits-zh-hf-zenyatta/rule.fst",
), ),
TtsModel( TtsModel(
model_dir="vits-zh-hf-abyssinvoker", model_dir="vits-zh-hf-abyssinvoker",
model_name="abyssinvoker.onnx", model_name="abyssinvoker.onnx",
lang="zh", lang="zh",
rule_fsts="vits-zh-hf-abyssinvoker/rule.fst",
), ),
TtsModel( TtsModel(
model_dir="vits-zh-hf-keqing", model_dir="vits-zh-hf-keqing",
model_name="keqing.onnx", model_name="keqing.onnx",
lang="zh", lang="zh",
rule_fsts="vits-zh-hf-keqing/rule.fst",
), ),
TtsModel( TtsModel(
model_dir="vits-zh-hf-eula", model_dir="vits-zh-hf-eula",
model_name="eula.onnx", model_name="eula.onnx",
lang="zh", lang="zh",
rule_fsts="vits-zh-hf-eula/rule.fst",
), ),
TtsModel( TtsModel(
model_dir="vits-zh-hf-bronya", model_dir="vits-zh-hf-bronya",
model_name="bronya.onnx", model_name="bronya.onnx",
lang="zh", lang="zh",
rule_fsts="vits-zh-hf-bronya/rule.fst",
), ),
TtsModel( TtsModel(
model_dir="vits-zh-hf-theresa", model_dir="vits-zh-hf-theresa",
model_name="theresa.onnx", model_name="theresa.onnx",
lang="zh", lang="zh",
rule_fsts="vits-zh-hf-theresa/rule.fst",
), ),
TtsModel( TtsModel(
model_dir="vits-zh-hf-fanchen-wnj", model_dir="vits-zh-hf-fanchen-wnj",
model_name="vits-zh-hf-fanchen-wnj.onnx", model_name="vits-zh-hf-fanchen-wnj.onnx",
lang="zh", lang="zh",
rule_fsts="vits-zh-hf-fanchen-wnj/rule.fst",
), ),
TtsModel( TtsModel(
model_dir="vits-zh-hf-fanchen-C", model_dir="vits-zh-hf-fanchen-C",
model_name="vits-zh-hf-fanchen-C.onnx", model_name="vits-zh-hf-fanchen-C.onnx",
lang="zh", lang="zh",
rule_fsts="vits-zh-hf-fanchen-C/rule.fst",
), ),
TtsModel( TtsModel(
model_dir="vits-zh-hf-fanchen-ZhiHuiLaoZhe", model_dir="vits-zh-hf-fanchen-ZhiHuiLaoZhe",
model_name="vits-zh-hf-fanchen-ZhiHuiLaoZhe.onnx", model_name="vits-zh-hf-fanchen-ZhiHuiLaoZhe.onnx",
lang="zh", lang="zh",
rule_fsts="vits-zh-hf-fanchen-ZhiHuiLaoZhe/rule.fst",
), ),
TtsModel( TtsModel(
model_dir="vits-zh-hf-fanchen-ZhiHuiLaoZhe_new", model_dir="vits-zh-hf-fanchen-ZhiHuiLaoZhe_new",
model_name="vits-zh-hf-fanchen-ZhiHuiLaoZhe_new.onnx", model_name="vits-zh-hf-fanchen-ZhiHuiLaoZhe_new.onnx",
lang="zh", lang="zh",
rule_fsts="vits-zh-hf-fanchen-ZhiHuiLaoZhe_new/rule.fst",
), ),
TtsModel( TtsModel(
model_dir="vits-zh-hf-fanchen-unity", model_dir="vits-zh-hf-fanchen-unity",
model_name="vits-zh-hf-fanchen-unity.onnx", model_name="vits-zh-hf-fanchen-unity.onnx",
lang="zh", lang="zh",
rule_fsts="vits-zh-hf-fanchen-unity/rule.fst",
), ),
]
rule_fsts = ["phone.fst", "date.fst", "number.fst", "new_heteronym.fst"]
for m in chinese_models:
s = [f"{m.model_dir}/{r}" for r in rule_fsts]
m.rule_fsts = ",".join(s)
m.rule_fars = f"{m.model_dir}/rule.far"
all_models = chinese_models + [
TtsModel( TtsModel(
model_dir="vits-cantonese-hf-xiaomaiiwn", model_dir="vits-cantonese-hf-xiaomaiiwn",
model_name="vits-cantonese-hf-xiaomaiiwn.onnx", model_name="vits-cantonese-hf-xiaomaiiwn.onnx",
@@ -346,6 +343,8 @@ def get_vits_models() -> List[TtsModel]:
# fmt: on # fmt: on
] ]
return all_models
def main(): def main():
args = get_args() args = get_args()

View File

@@ -40,6 +40,7 @@ def process_linux(s):
"libpiper_phonemize.so.1", "libpiper_phonemize.so.1",
"libsherpa-onnx-c-api.so", "libsherpa-onnx-c-api.so",
"libsherpa-onnx-core.so", "libsherpa-onnx-core.so",
"libsherpa-onnx-fstfar.so.7",
"libsherpa-onnx-fst.so.6", "libsherpa-onnx-fst.so.6",
"libsherpa-onnx-kaldifst-core.so", "libsherpa-onnx-kaldifst-core.so",
"libucd.so", "libucd.so",
@@ -68,6 +69,7 @@ def process_macos(s):
"libpiper_phonemize.1.dylib", "libpiper_phonemize.1.dylib",
"libsherpa-onnx-c-api.dylib", "libsherpa-onnx-c-api.dylib",
"libsherpa-onnx-core.dylib", "libsherpa-onnx-core.dylib",
"libsherpa-onnx-fstfar.7.dylib",
"libsherpa-onnx-fst.6.dylib", "libsherpa-onnx-fst.6.dylib",
"libsherpa-onnx-kaldifst-core.dylib", "libsherpa-onnx-kaldifst-core.dylib",
"libucd.dylib", "libucd.dylib",
@@ -96,6 +98,7 @@ def process_windows(s, rid):
"piper_phonemize.dll", "piper_phonemize.dll",
"sherpa-onnx-c-api.dll", "sherpa-onnx-c-api.dll",
"sherpa-onnx-core.dll", "sherpa-onnx-core.dll",
"sherpa-onnx-fstfar.lib",
"sherpa-onnx-fst.lib", "sherpa-onnx-fst.lib",
"sherpa-onnx-kaldifst-core.lib", "sherpa-onnx-kaldifst-core.lib",
"ucd.dll", "ucd.dll",

View File

@@ -67,6 +67,7 @@ namespace SherpaOnnx
Model = new OfflineTtsModelConfig(); Model = new OfflineTtsModelConfig();
RuleFsts = ""; RuleFsts = "";
MaxNumSentences = 1; MaxNumSentences = 1;
RuleFars = "";
} }
public OfflineTtsModelConfig Model; public OfflineTtsModelConfig Model;
@@ -74,6 +75,9 @@ namespace SherpaOnnx
public string RuleFsts; public string RuleFsts;
public int MaxNumSentences; public int MaxNumSentences;
[MarshalAs(UnmanagedType.LPStr)]
public string RuleFars;
} }
public class OfflineTtsGeneratedAudio public class OfflineTtsGeneratedAudio

View File

@@ -41,6 +41,7 @@ if [ ! -f /tmp/linux/libsherpa-onnx-core.so ]; then
cd .. cd ..
rm -v libpiper_phonemize.so libpiper_phonemize.so.1.2.0 rm -v libpiper_phonemize.so libpiper_phonemize.so.1.2.0
rm -v libsherpa-onnx-fst.so rm -v libsherpa-onnx-fst.so
rm -v libsherpa-onnx-fstfar.so
rm -v libonnxruntime.so rm -v libonnxruntime.so
rm -v libcargs.so rm -v libcargs.so
rm -rf wheel rm -rf wheel
@@ -67,6 +68,7 @@ if [ ! -f /tmp/macos/libsherpa-onnx-core.dylib ]; then
rm -v libonnxruntime.dylib rm -v libonnxruntime.dylib
rm -v libpiper_phonemize.1.2.0.dylib libpiper_phonemize.dylib rm -v libpiper_phonemize.1.2.0.dylib libpiper_phonemize.dylib
rm -v libsherpa-onnx-fst.dylib rm -v libsherpa-onnx-fst.dylib
rm -v libsherpa-onnx-fstfar.dylib
rm -rf wheel rm -rf wheel
ls -lh ls -lh
cd .. cd ..

View File

@@ -2,5 +2,5 @@
package sherpa_onnx package sherpa_onnx
// #cgo LDFLAGS: -L ${SRCDIR}/lib/x86_64-apple-darwin -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-native-fbank-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lpiper_phonemize -lespeak-ng -lucd -lonnxruntime -Wl,-rpath,${SRCDIR}/lib/x86_64-apple-darwin // #cgo LDFLAGS: -L ${SRCDIR}/lib/x86_64-apple-darwin -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-native-fbank-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fstfar -lsherpa-onnx-fst -lpiper_phonemize -lespeak-ng -lucd -lonnxruntime -Wl,-rpath,${SRCDIR}/lib/x86_64-apple-darwin
import "C" import "C"

View File

@@ -554,6 +554,7 @@ type OfflineTtsModelConfig struct {
type OfflineTtsConfig struct { type OfflineTtsConfig struct {
Model OfflineTtsModelConfig Model OfflineTtsModelConfig
RuleFsts string RuleFsts string
RuleFars string
MaxNumSentences int MaxNumSentences int
} }
@@ -583,6 +584,9 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
c.rule_fsts = C.CString(config.RuleFsts) c.rule_fsts = C.CString(config.RuleFsts)
defer C.free(unsafe.Pointer(c.rule_fsts)) defer C.free(unsafe.Pointer(c.rule_fsts))
c.rule_fars = C.CString(config.RuleFars)
defer C.free(unsafe.Pointer(c.rule_fars))
c.max_num_sentences = C.int(config.MaxNumSentences) c.max_num_sentences = C.int(config.MaxNumSentences)
c.model.vits.model = C.CString(config.Model.Vits.Model) c.model.vits.model = C.CString(config.Model.Vits.Model)

View File

@@ -818,6 +818,7 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
tts_config.model.debug = config->model.debug; tts_config.model.debug = config->model.debug;
tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu"); tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu");
tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, "");
tts_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, "");
tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 2); tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 2);
if (tts_config.model.debug) { if (tts_config.model.debug) {

View File

@@ -783,6 +783,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig {
SherpaOnnxOfflineTtsModelConfig model; SherpaOnnxOfflineTtsModelConfig model;
const char *rule_fsts; const char *rule_fsts;
int32_t max_num_sentences; int32_t max_num_sentences;
const char *rule_fars;
} SherpaOnnxOfflineTtsConfig; } SherpaOnnxOfflineTtsConfig;
SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio { SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio {

View File

@@ -164,6 +164,7 @@ endif()
if(SHERPA_ONNX_ENABLE_TTS) if(SHERPA_ONNX_ENABLE_TTS)
target_link_libraries(sherpa-onnx-core piper_phonemize) target_link_libraries(sherpa-onnx-core piper_phonemize)
target_link_libraries(sherpa-onnx-core fstfar fst)
endif() endif()
if(SHERPA_ONNX_ENABLE_CHECK) if(SHERPA_ONNX_ENABLE_CHECK)

View File

@@ -18,7 +18,6 @@
#endif #endif
#include <memory> #include <memory>
#include <regex> // NOLINT
#include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/onnx-utils.h" #include "sherpa-onnx/csrc/onnx-utils.h"
@@ -26,6 +25,55 @@
namespace sherpa_onnx { namespace sherpa_onnx {
// Merges words that are wrapped in heteronym phrase markers.
//
// A phrase starts with the three consecutive words "#", "$", "|" and ends
// with the three consecutive words "|", "$", "#". All words between the two
// markers are concatenated into a single entry of the result; any stray
// "|", "$" or "#" inside the phrase is skipped. Words outside of a phrase are
// copied to the result unchanged. An end marker without a matching start
// marker is discarded.
//
// If a start marker is never closed, the words following it are emitted one
// by one (again skipping "|", "$" and "#") instead of being silently dropped.
//
// @param words The input words, one entry per token.
// @return The words with every marked phrase collapsed into one entry.
static std::vector<std::string> ProcessHeteronyms(
    const std::vector<std::string> &words) {
  std::vector<std::string> ans;
  ans.reserve(words.size());

  const int32_t num_words = static_cast<int32_t>(words.size());

  // True if words[pos], words[pos + 1], words[pos + 2] spell the marker a b c.
  auto is_marker = [&words, num_words](int32_t pos, const char *a,
                                       const char *b, const char *c) {
    return (pos + 2 < num_words) && words[pos] == a && words[pos + 1] == b &&
           words[pos + 2] == c;
  };

  // True for the single characters that markers are built from.
  auto is_marker_char = [](const std::string &w) {
    return w == "|" || w == "$" || w == "#";
  };

  int32_t i = 0;
  // Index of the first word of the currently open phrase; -1 if none is open.
  int32_t prev = -1;

  while (i < num_words) {
    // start of a phrase #$|
    if (is_marker(i, "#", "$", "|")) {
      if (prev == -1) {
        prev = i + 3;
      }
      i += 3;
      continue;
    }

    // end of a phrase |$#
    if (is_marker(i, "|", "$", "#")) {
      if (prev != -1) {
        std::string phrase;
        for (int32_t k = prev; k < i; ++k) {
          if (!is_marker_char(words[k])) {
            phrase += words[k];
          }
        }
        ans.push_back(std::move(phrase));
        prev = -1;
      }
      i += 3;
      continue;
    }

    if (prev == -1) {
      // not inside a phrase
      ans.push_back(words[i]);
    }

    ++i;
  }

  if (prev != -1) {
    // The last phrase was never closed: keep its words rather than losing
    // the rest of the input.
    for (int32_t k = prev; k < num_words; ++k) {
      if (!is_marker_char(words[k])) {
        ans.push_back(words[k]);
      }
    }
  }

  return ans;
}
static void ToLowerCase(std::string *in_out) { static void ToLowerCase(std::string *in_out) {
std::transform(in_out->begin(), in_out->end(), in_out->begin(), std::transform(in_out->begin(), in_out->end(), in_out->begin(),
[](unsigned char c) { return std::tolower(c); }); [](unsigned char c) { return std::tolower(c); });
@@ -148,36 +196,9 @@ std::vector<std::vector<int64_t>> Lexicon::ConvertTextToTokenIdsChinese(
const std::string &_text) const { const std::string &_text) const {
std::string text(_text); std::string text(_text);
ToLowerCase(&text); ToLowerCase(&text);
std::vector<std::string> words;
if (pattern_) {
// Handle polyphones
size_t pos = 0;
auto begin = std::sregex_iterator(text.begin(), text.end(), *pattern_);
auto end = std::sregex_iterator();
for (std::sregex_iterator i = begin; i != end; ++i) {
std::smatch match = *i;
if (pos < match.position()) {
auto this_segment = text.substr(pos, match.position() - pos);
auto this_segment_words = SplitUtf8(this_segment);
words.insert(words.end(), this_segment_words.begin(),
this_segment_words.end());
pos = match.position() + match.length();
} else if (pos == match.position()) {
pos = match.position() + match.length();
}
words.push_back(match.str()); std::vector<std::string> words = SplitUtf8(text);
} words = ProcessHeteronyms(words);
if (pos < text.size()) {
auto this_segment = text.substr(pos, text.size() - pos);
auto this_segment_words = SplitUtf8(this_segment);
words.insert(words.end(), this_segment_words.begin(),
this_segment_words.end());
}
} else {
words = SplitUtf8(text);
}
if (debug_) { if (debug_) {
fprintf(stderr, "Input text in string: %s\n", text.c_str()); fprintf(stderr, "Input text in string: %s\n", text.c_str());
@@ -357,9 +378,6 @@ void Lexicon::InitLexicon(std::istream &is) {
std::string line; std::string line;
std::string phone; std::string phone;
std::ostringstream os;
std::string sep;
while (std::getline(is, line)) { while (std::getline(is, line)) {
std::istringstream iss(line); std::istringstream iss(line);
@@ -381,18 +399,9 @@ void Lexicon::InitLexicon(std::istream &is) {
if (ids.empty()) { if (ids.empty()) {
continue; continue;
} }
if (language_ == Language::kChinese && word.size() > 3) {
// this is not a single word;
os << sep << word;
sep = "|";
}
word2ids_.insert({std::move(word), std::move(ids)}); word2ids_.insert({std::move(word), std::move(ids)});
} }
if (!sep.empty()) {
pattern_ = std::make_unique<std::regex>(os.str());
}
} }
void Lexicon::InitPunctuations(const std::string &punctuations) { void Lexicon::InitPunctuations(const std::string &punctuations) {

View File

@@ -7,7 +7,6 @@
#include <cstdint> #include <cstdint>
#include <memory> #include <memory>
#include <regex> // NOLINT
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
@@ -65,9 +64,6 @@ class Lexicon : public OfflineTtsFrontend {
std::unordered_map<std::string, int32_t> token2id_; std::unordered_map<std::string, int32_t> token2id_;
Language language_; Language language_;
bool debug_; bool debug_;
// for Chinese polyphones
std::unique_ptr<std::regex> pattern_;
}; };
} // namespace sherpa_onnx } // namespace sherpa_onnx

View File

@@ -15,6 +15,9 @@
#include "android/asset_manager.h" #include "android/asset_manager.h"
#include "android/asset_manager_jni.h" #include "android/asset_manager_jni.h"
#endif #endif
#include "fst/extensions/far/far.h"
#include "kaldifst/csrc/kaldi-fst-io.h"
#include "kaldifst/csrc/text-normalizer.h" #include "kaldifst/csrc/text-normalizer.h"
#include "sherpa-onnx/csrc/lexicon.h" #include "sherpa-onnx/csrc/lexicon.h"
#include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/macros.h"
@@ -46,6 +49,32 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
tn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(f)); tn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(f));
} }
} }
// Load every FST contained in each comma-separated FST archive listed in
// config.rule_fars and append it to tn_list_ as a text normalizer.
if (!config.rule_fars.empty()) {
  if (config.model.debug) {
    SHERPA_ONNX_LOGE("Loading FST archives");
  }
  std::vector<std::string> files;
  SplitStringToVector(config.rule_fars, ",", false, &files);
  for (const auto &f : files) {
    if (config.model.debug) {
      SHERPA_ONNX_LOGE("rule far: %s", f.c_str());
    }
    std::unique_ptr<fst::FarReader<fst::StdArc>> reader(
        fst::FarReader<fst::StdArc>::Open(f));
    if (!reader) {
      // Open() yields a null reader when the archive cannot be read; skip it
      // instead of dereferencing a null pointer below.
      SHERPA_ONNX_LOGE("Failed to open rule far: %s", f.c_str());
      continue;
    }
    // A single archive may contain several FSTs; add them in archive order.
    for (; !reader->Done(); reader->Next()) {
      std::unique_ptr<fst::StdConstFst> r(
          fst::CastOrConvertToConstFst(reader->GetFst()->Copy()));
      tn_list_.push_back(
          std::make_unique<kaldifst::TextNormalizer>(std::move(r)));
    }
  }
  if (config.model.debug) {
    SHERPA_ONNX_LOGE("FST archives loaded!");
  }
}
} }
#if __ANDROID_API__ >= 9 #if __ANDROID_API__ >= 9

View File

@@ -20,7 +20,14 @@ void OfflineTtsConfig::Register(ParseOptions *po) {
"It not empty, it contains a list of rule FST filenames." "It not empty, it contains a list of rule FST filenames."
"Multiple filenames are separated by a comma and they are " "Multiple filenames are separated by a comma and they are "
"applied from left to right. An example value: " "applied from left to right. An example value: "
"rule1.fst,rule2,fst,rule3.fst"); "rule1.fst,rule2.fst,rule3.fst");
// Command-line option for the comma-separated list of rule FST archives.
// Adjacent string literals are concatenated, so each one ends with a space.
po->Register("tts-rule-fars", &rule_fars,
             "If not empty, it contains a list of rule FST archive filenames. "
             "Multiple filenames are separated by a comma and they are "
             "applied from left to right. An example value: "
             "rule1.far,rule2.far,rule3.far. Note that an *.far can contain "
             "multiple *.fst files");
po->Register( po->Register(
"tts-max-num-sentences", &max_num_sentences, "tts-max-num-sentences", &max_num_sentences,
@@ -41,6 +48,17 @@ bool OfflineTtsConfig::Validate() const {
} }
} }
if (!rule_fars.empty()) {
std::vector<std::string> files;
SplitStringToVector(rule_fars, ",", false, &files);
for (const auto &f : files) {
if (!FileExists(f)) {
SHERPA_ONNX_LOGE("Rule far %s does not exist. ", f.c_str());
return false;
}
}
}
return model.Validate(); return model.Validate();
} }
@@ -50,6 +68,7 @@ std::string OfflineTtsConfig::ToString() const {
os << "OfflineTtsConfig("; os << "OfflineTtsConfig(";
os << "model=" << model.ToString() << ", "; os << "model=" << model.ToString() << ", ";
os << "rule_fsts=\"" << rule_fsts << "\", "; os << "rule_fsts=\"" << rule_fsts << "\", ";
os << "rule_fars=\"" << rule_fars << "\", ";
os << "max_num_sentences=" << max_num_sentences << ")"; os << "max_num_sentences=" << max_num_sentences << ")";
return os.str(); return os.str();

View File

@@ -29,6 +29,9 @@ struct OfflineTtsConfig {
// If there are multiple rules, they are applied from left to right. // If there are multiple rules, they are applied from left to right.
std::string rule_fsts; std::string rule_fsts;
// If there are multiple FST archives, they are applied from left to right.
std::string rule_fars;
// Maximum number of sentences that we process at a time. // Maximum number of sentences that we process at a time.
// This is to avoid OOM for very long input text. // This is to avoid OOM for very long input text.
// If you set it to -1, then we process all sentences in a single batch. // If you set it to -1, then we process all sentences in a single batch.
@@ -36,9 +39,11 @@ struct OfflineTtsConfig {
OfflineTtsConfig() = default; OfflineTtsConfig() = default;
OfflineTtsConfig(const OfflineTtsModelConfig &model, OfflineTtsConfig(const OfflineTtsModelConfig &model,
const std::string &rule_fsts, int32_t max_num_sentences) const std::string &rule_fsts, const std::string &rule_fars,
int32_t max_num_sentences)
: model(model), : model(model),
rule_fsts(rule_fsts), rule_fsts(rule_fsts),
rule_fars(rule_fars),
max_num_sentences(max_num_sentences) {} max_num_sentences(max_num_sentences) {}
void Register(ParseOptions *po); void Register(ParseOptions *po);

View File

@@ -878,6 +878,13 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
ans.rule_fsts = p; ans.rule_fsts = p;
env->ReleaseStringUTFChars(s, p); env->ReleaseStringUTFChars(s, p);
// for ruleFars
fid = env->GetFieldID(cls, "ruleFars", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(config, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.rule_fars = p;
env->ReleaseStringUTFChars(s, p);
fid = env->GetFieldID(cls, "maxNumSentences", "I"); fid = env->GetFieldID(cls, "maxNumSentences", "I");
ans.max_num_sentences = env->GetIntField(config, fid); ans.max_num_sentences = env->GetIntField(config, fid);

View File

@@ -32,11 +32,12 @@ static void PybindOfflineTtsConfig(py::module *m) {
py::class_<PyClass>(*m, "OfflineTtsConfig") py::class_<PyClass>(*m, "OfflineTtsConfig")
.def(py::init<>()) .def(py::init<>())
.def(py::init<const OfflineTtsModelConfig &, const std::string &, .def(py::init<const OfflineTtsModelConfig &, const std::string &,
int32_t>(), const std::string &, int32_t>(),
py::arg("model"), py::arg("rule_fsts") = "", py::arg("model"), py::arg("rule_fsts") = "",
py::arg("max_num_sentences") = 2) py::arg("rule_fars") = "", py::arg("max_num_sentences") = 2)
.def_readwrite("model", &PyClass::model) .def_readwrite("model", &PyClass::model)
.def_readwrite("rule_fsts", &PyClass::rule_fsts) .def_readwrite("rule_fsts", &PyClass::rule_fsts)
.def_readwrite("rule_fars", &PyClass::rule_fars)
.def_readwrite("max_num_sentences", &PyClass::max_num_sentences) .def_readwrite("max_num_sentences", &PyClass::max_num_sentences)
.def("validate", &PyClass::Validate) .def("validate", &PyClass::Validate)
.def("__str__", &PyClass::ToString); .def("__str__", &PyClass::ToString);

View File

@@ -652,12 +652,14 @@ func sherpaOnnxOfflineTtsModelConfig(
func sherpaOnnxOfflineTtsConfig( func sherpaOnnxOfflineTtsConfig(
model: SherpaOnnxOfflineTtsModelConfig, model: SherpaOnnxOfflineTtsModelConfig,
ruleFsts: String = "", ruleFsts: String = "",
ruleFars: String = "",
maxNumSenetences: Int = 2 maxNumSenetences: Int = 2
) -> SherpaOnnxOfflineTtsConfig { ) -> SherpaOnnxOfflineTtsConfig {
return SherpaOnnxOfflineTtsConfig( return SherpaOnnxOfflineTtsConfig(
model: model, model: model,
rule_fsts: toCPointer(ruleFsts), rule_fsts: toCPointer(ruleFsts),
max_num_sentences: Int32(maxNumSenetences) max_num_sentences: Int32(maxNumSenetences),
rule_fars: toCPointer(ruleFars)
) )
} }

View File

@@ -90,7 +90,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
function initSherpaOnnxOfflineTtsConfig(config, Module) { function initSherpaOnnxOfflineTtsConfig(config, Module) {
const modelConfig = const modelConfig =
initSherpaOnnxOfflineTtsModelConfig(config.offlineTtsModelConfig, Module); initSherpaOnnxOfflineTtsModelConfig(config.offlineTtsModelConfig, Module);
const len = modelConfig.len + 2 * 4; const len = modelConfig.len + 3 * 4;
const ptr = Module._malloc(len); const ptr = Module._malloc(len);
let offset = 0; let offset = 0;
@@ -98,12 +98,19 @@ function initSherpaOnnxOfflineTtsConfig(config, Module) {
offset += modelConfig.len; offset += modelConfig.len;
const ruleFstsLen = Module.lengthBytesUTF8(config.ruleFsts) + 1; const ruleFstsLen = Module.lengthBytesUTF8(config.ruleFsts) + 1;
const buffer = Module._malloc(ruleFstsLen); const ruleFarsLen = Module.lengthBytesUTF8(config.ruleFars) + 1;
const buffer = Module._malloc(ruleFstsLen + ruleFarsLen);
Module.stringToUTF8(config.ruleFsts, buffer, ruleFstsLen); Module.stringToUTF8(config.ruleFsts, buffer, ruleFstsLen);
Module.stringToUTF8(config.ruleFars, buffer + ruleFstsLen, ruleFarsLen);
Module.setValue(ptr + offset, buffer, 'i8*'); Module.setValue(ptr + offset, buffer, 'i8*');
offset += 4; offset += 4;
Module.setValue(ptr + offset, config.maxNumSentences, 'i32'); Module.setValue(ptr + offset, config.maxNumSentences, 'i32');
offset += 4;
Module.setValue(ptr + offset, buffer + ruleFstsLen, 'i8*');
return { return {
buffer: buffer, ptr: ptr, len: len, config: modelConfig, buffer: buffer, ptr: ptr, len: len, config: modelConfig,
@@ -190,6 +197,7 @@ function createOfflineTts(Module, myConfig) {
let offlineTtsConfig = { let offlineTtsConfig = {
offlineTtsModelConfig: offlineTtsModelConfig, offlineTtsModelConfig: offlineTtsModelConfig,
ruleFsts: '', ruleFsts: '',
ruleFars: '',
maxNumSentences: 1, maxNumSentences: 1,
} }

View File

@@ -18,7 +18,7 @@ static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + 3 * 4, sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + 3 * 4,
""); "");
static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
sizeof(SherpaOnnxOfflineTtsModelConfig) + 2 * 4, sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4,
""); "");
void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
@@ -40,6 +40,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
fprintf(stdout, "----------tts config----------\n"); fprintf(stdout, "----------tts config----------\n");
fprintf(stdout, "rule_fsts: %s\n", tts_config->rule_fsts); fprintf(stdout, "rule_fsts: %s\n", tts_config->rule_fsts);
fprintf(stdout, "rule_fars: %s\n", tts_config->rule_fars);
fprintf(stdout, "max num sentences: %d\n", tts_config->max_num_sentences); fprintf(stdout, "max num sentences: %d\n", tts_config->max_num_sentences);
} }