diff --git a/.github/scripts/test-nodejs-npm.sh b/.github/scripts/test-nodejs-npm.sh index 1531aff2..95dcf027 100755 --- a/.github/scripts/test-nodejs-npm.sh +++ b/.github/scripts/test-nodejs-npm.sh @@ -70,9 +70,9 @@ rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 tar xf vits-piper-en_US-amy-low.tar.bz2 node ./test-offline-tts-en.js -rm vits-piper-en_US-amy-low.tar.bz2 +rm -rf vits-piper-en_US-amy-low* -curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 -tar xvf vits-zh-aishell3.tar.bz2 +curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 +tar xvf vits-icefall-zh-aishell3.tar.bz2 node ./test-offline-tts-zh.js -rm vits-zh-aishell3.tar.bz2 +rm -rf vits-icefall-zh-aishell3* diff --git a/.github/workflows/arm-linux-gnueabihf.yaml b/.github/workflows/arm-linux-gnueabihf.yaml index 76ad9fcf..76223988 100644 --- a/.github/workflows/arm-linux-gnueabihf.yaml +++ b/.github/workflows/arm-linux-gnueabihf.yaml @@ -173,6 +173,7 @@ jobs: rm -v $dst/lib/libasound.so rm -v $dst/lib/libonnxruntime.so rm -v $dst/lib/libsherpa-onnx-fst.so + rm -v $dst/lib/libsherpa-onnx-fstfar.so fi tree $dst diff --git a/.github/workflows/riscv64-linux.yaml b/.github/workflows/riscv64-linux.yaml index b1008b51..a5869a4b 100644 --- a/.github/workflows/riscv64-linux.yaml +++ b/.github/workflows/riscv64-linux.yaml @@ -211,6 +211,7 @@ jobs: rm -fv $dst/lib/libasound.so rm -fv $dst/lib/libonnxruntime.so rm -fv $dst/lib/libsherpa-onnx-fst.so + rm -fv $dst/lib/libsherpa-onnx-fstfar.so fi tree $dst diff --git a/.github/workflows/test-go.yaml b/.github/workflows/test-go.yaml index 17af77e6..e7bf9cfd 100644 --- a/.github/workflows/test-go.yaml +++ b/.github/workflows/test-go.yaml @@ -111,9 +111,11 @@ jobs: rm -rf vits-vctk echo "Test vits-zh-aishell3" - git clone 
https://huggingface.co/csukuangfj/vits-zh-aishell3 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 + tar xvf vits-icefall-zh-aishell3.tar.bz2 + rm vits-icefall-zh-aishell3.tar.bz2 ./run-vits-zh-aishell3.sh - rm -rf vits-zh-aishell3 + rm -rf vits-icefall-zh-aishell3 echo "Test vits-piper-en_US-lessac-medium" git clone https://huggingface.co/csukuangfj/vits-piper-en_US-lessac-medium diff --git a/.gitignore b/.gitignore index c2c87424..a51cd0ea 100644 --- a/.gitignore +++ b/.gitignore @@ -90,3 +90,4 @@ sherpa-onnx-paraformer-trilingual-zh-cantonese-en sr-data *xcworkspace/xcuserdata/* +vits-icefall-* diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a9fd578..670b4b3e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) project(sherpa-onnx) -set(SHERPA_ONNX_VERSION "1.9.16") +set(SHERPA_ONNX_VERSION "1.9.17") # Disable warning about # diff --git a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt index 86c565e3..9f8e6325 100644 --- a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt +++ b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt @@ -155,6 +155,7 @@ class MainActivity : AppCompatActivity() { var modelDir: String? var modelName: String? var ruleFsts: String? + var ruleFars: String? var lexicon: String? var dataDir: String? var assets: AssetManager? 
= application.assets @@ -165,6 +166,7 @@ class MainActivity : AppCompatActivity() { modelDir = null modelName = null ruleFsts = null + ruleFars = null lexicon = null dataDir = null @@ -181,9 +183,11 @@ class MainActivity : AppCompatActivity() { // dataDir = "vits-piper-en_US-amy-low/espeak-ng-data" // Example 3: - // modelDir = "vits-zh-aishell3" - // modelName = "vits-aishell3.onnx" - // ruleFsts = "vits-zh-aishell3/rule.fst" + // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 + // modelDir = "vits-icefall-zh-aishell3" + // modelName = "model.onnx" + // ruleFsts = "vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst," + // ruleFars = "vits-icefall-zh-aishell3/rule.far" // lexicon = "lexicon.txt" // Example 4: @@ -202,7 +206,8 @@ class MainActivity : AppCompatActivity() { val config = getOfflineTtsConfig( modelDir = modelDir!!, modelName = modelName!!, lexicon = lexicon ?: "", dataDir = dataDir ?: "", - ruleFsts = ruleFsts ?: "" + ruleFsts = ruleFsts ?: "", + ruleFars = ruleFars ?: "", )!! tts = OfflineTts(assetManager = assets, config = config) diff --git a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt index be48b6db..2514fcac 100644 --- a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt +++ b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt @@ -23,6 +23,7 @@ data class OfflineTtsModelConfig( data class OfflineTtsConfig( var model: OfflineTtsModelConfig, var ruleFsts: String = "", + var ruleFars: String = "", var maxNumSentences: Int = 1, ) @@ -151,7 +152,8 @@ fun getOfflineTtsConfig( modelName: String, lexicon: String, dataDir: String, - ruleFsts: String + ruleFsts: String, + ruleFars: String ): OfflineTtsConfig? 
{ return OfflineTtsConfig( model = OfflineTtsModelConfig( @@ -166,5 +168,6 @@ fun getOfflineTtsConfig( provider = "cpu", ), ruleFsts = ruleFsts, + ruleFars = ruleFars, ) } diff --git a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt index f814a2e0..5699ccf2 100644 --- a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt +++ b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt @@ -39,6 +39,7 @@ object TtsEngine { private var modelDir: String? = null private var modelName: String? = null private var ruleFsts: String? = null + private var ruleFars: String? = null private var lexicon: String? = null private var dataDir: String? = null private var assets: AssetManager? = null @@ -50,6 +51,7 @@ object TtsEngine { modelDir = null modelName = null ruleFsts = null + ruleFars = null lexicon = null dataDir = null lang = null @@ -73,9 +75,10 @@ object TtsEngine { // Example 3: // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 - // modelDir = "vits-zh-aishell3" - // modelName = "vits-aishell3.onnx" - // ruleFsts = "vits-zh-aishell3/rule.fst" + // modelDir = "vits-icefall-zh-aishell3" + // modelName = "model.onnx" + // ruleFsts = "vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst,vits-icefall-zh-aishell3/new_heteronym.fst" + // ruleFars = "vits-icefall-zh-aishell3/rule.far" // lexicon = "lexicon.txt" // lang = "zho" @@ -108,7 +111,8 @@ object TtsEngine { val config = getOfflineTtsConfig( modelDir = modelDir!!, modelName = modelName!!, lexicon = lexicon ?: "", dataDir = dataDir ?: "", - ruleFsts = ruleFsts ?: "" + ruleFsts = ruleFsts ?: "", + ruleFars = ruleFars ?: "" )!! 
tts = OfflineTts(assetManager = assets, config = config) diff --git a/build-ios.sh b/build-ios.sh index 599a1725..a687dc4d 100755 --- a/build-ios.sh +++ b/build-ios.sh @@ -124,6 +124,7 @@ echo "Generate xcframework" mkdir -p "build/simulator/lib" for f in libkaldi-native-fbank-core.a libsherpa-onnx-c-api.a libsherpa-onnx-core.a \ + libsherpa-onnx-fstfar.a \ libsherpa-onnx-fst.a libsherpa-onnx-kaldifst-core.a libkaldi-decoder-core.a \ libucd.a libpiper_phonemize.a libespeak-ng.a; do lipo -create build/simulator_arm64/lib/${f} \ @@ -137,6 +138,7 @@ libtool -static -o build/simulator/sherpa-onnx.a \ build/simulator/lib/libkaldi-native-fbank-core.a \ build/simulator/lib/libsherpa-onnx-c-api.a \ build/simulator/lib/libsherpa-onnx-core.a \ + build/simulator/lib/libsherpa-onnx-fstfar.a \ build/simulator/lib/libsherpa-onnx-fst.a \ build/simulator/lib/libsherpa-onnx-kaldifst-core.a \ build/simulator/lib/libkaldi-decoder-core.a \ @@ -148,6 +150,7 @@ libtool -static -o build/os64/sherpa-onnx.a \ build/os64/lib/libkaldi-native-fbank-core.a \ build/os64/lib/libsherpa-onnx-c-api.a \ build/os64/lib/libsherpa-onnx-core.a \ + build/os64/lib/libsherpa-onnx-fstfar.a \ build/os64/lib/libsherpa-onnx-fst.a \ build/os64/lib/libsherpa-onnx-kaldifst-core.a \ build/os64/lib/libkaldi-decoder-core.a \ diff --git a/build-swift-macos.sh b/build-swift-macos.sh index e5cdbc08..1b1867c5 100755 --- a/build-swift-macos.sh +++ b/build-swift-macos.sh @@ -27,6 +27,7 @@ libtool -static -o ./install/lib/libsherpa-onnx.a \ ./install/lib/libsherpa-onnx-c-api.a \ ./install/lib/libsherpa-onnx-core.a \ ./install/lib/libkaldi-native-fbank-core.a \ + ./install/lib/libsherpa-onnx-fstfar.a \ ./install/lib/libsherpa-onnx-fst.a \ ./install/lib/libsherpa-onnx-kaldifst-core.a \ ./install/lib/libkaldi-decoder-core.a \ diff --git a/c-api-examples/Makefile b/c-api-examples/Makefile index 3e293142..40d35d86 100644 --- a/c-api-examples/Makefile +++ b/c-api-examples/Makefile @@ -4,7 +4,7 @@ CUR_DIR :=$(shell pwd) CFLAGS := 
-I ../ -I ../build/_deps/cargs-src/include/ LDFLAGS := -L ../build/lib LDFLAGS += -L ../build/_deps/onnxruntime-src/lib -LDFLAGS += -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lcargs -lonnxruntime +LDFLAGS += -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fstfar -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lcargs -lonnxruntime LDFLAGS += -framework Foundation LDFLAGS += -lc++ LDFLAGS += -Wl,-rpath,${CUR_DIR}/../build/lib diff --git a/cmake/cmake_extension.py b/cmake/cmake_extension.py index 7fb0fc0f..ea52bdd6 100644 --- a/cmake/cmake_extension.py +++ b/cmake/cmake_extension.py @@ -78,6 +78,7 @@ def get_binaries(): "piper_phonemize.dll", "sherpa-onnx-c-api.dll", "sherpa-onnx-core.dll", + "sherpa-onnx-fstfar.lib", "sherpa-onnx-fst.lib", "sherpa-onnx-kaldifst-core.lib", "sherpa-onnx-portaudio.dll", diff --git a/cmake/kaldi-decoder.cmake b/cmake/kaldi-decoder.cmake index 99ebf9aa..b78ece58 100644 --- a/cmake/kaldi-decoder.cmake +++ b/cmake/kaldi-decoder.cmake @@ -64,12 +64,22 @@ function(download_kaldi_decoder) kaldifst_core fst DESTINATION ..) + if(SHERPA_ONNX_ENABLE_TTS) + install(TARGETS + fstfar + DESTINATION ..) 
+ endif() else() install(TARGETS kaldi-decoder-core kaldifst_core fst DESTINATION lib) + if(SHERPA_ONNX_ENABLE_TTS) + install(TARGETS + fstfar + DESTINATION lib) + endif() endif() if(WIN32 AND BUILD_SHARED_LIBS) @@ -78,6 +88,11 @@ function(download_kaldi_decoder) kaldifst_core fst DESTINATION bin) + if(SHERPA_ONNX_ENABLE_TTS) + install(TARGETS + fstfar + DESTINATION bin) + endif() endif() endfunction() diff --git a/cmake/kaldifst.cmake b/cmake/kaldifst.cmake index 12bcc030..3b5ce3ba 100644 --- a/cmake/kaldifst.cmake +++ b/cmake/kaldifst.cmake @@ -50,13 +50,7 @@ function(download_kaldifst) ${kaldifst_SOURCE_DIR}/ ) - target_include_directories(fst - PUBLIC - ${openfst_SOURCE_DIR}/src/include - ) - set_target_properties(kaldifst_core PROPERTIES OUTPUT_NAME "sherpa-onnx-kaldifst-core") - set_target_properties(fst PROPERTIES OUTPUT_NAME "sherpa-onnx-fst") endfunction() download_kaldifst() diff --git a/cmake/openfst.cmake b/cmake/openfst.cmake index 34073d2c..575ea8ae 100644 --- a/cmake/openfst.cmake +++ b/cmake/openfst.cmake @@ -4,7 +4,7 @@ function(download_openfst) include(FetchContent) set(openfst_URL "https://github.com/kkm000/openfst/archive/refs/tags/win/1.6.5.1.tar.gz") - set(openfst_URL2 "https://huggingface.co/csukuangfj/kaldi-hmm-gmm-cmake-deps/resolve/main/openfst-win-1.6.5.1.tar.gz") + set(openfst_URL2 "https://hub.nuaa.cf/kkm000/openfst/archive/refs/tags/win/1.6.5.1.tar.gz") set(openfst_HASH "SHA256=02c49b559c3976a536876063369efc0e41ab374be1035918036474343877046e") # If you don't have access to the Internet, @@ -31,7 +31,7 @@ function(download_openfst) set(HAVE_COMPACT OFF CACHE BOOL "" FORCE) set(HAVE_COMPRESS OFF CACHE BOOL "" FORCE) set(HAVE_CONST OFF CACHE BOOL "" FORCE) - set(HAVE_FAR OFF CACHE BOOL "" FORCE) + set(HAVE_FAR ON CACHE BOOL "" FORCE) set(HAVE_GRM OFF CACHE BOOL "" FORCE) set(HAVE_PDT OFF CACHE BOOL "" FORCE) set(HAVE_MPDT OFF CACHE BOOL "" FORCE) @@ -70,20 +70,21 @@ function(download_openfst) add_subdirectory(${openfst_SOURCE_DIR} 
${openfst_BINARY_DIR} EXCLUDE_FROM_ALL) set(openfst_SOURCE_DIR ${openfst_SOURCE_DIR} PARENT_SCOPE) - # Rename libfst.so.6 to libkaldifst_fst.so.6 to avoid potential conflicts - # when kaldifst is installed. - set_target_properties(fst PROPERTIES OUTPUT_NAME "kaldifst_fst") + # Rename libfst.so.6 to libsherpa-onnx-fst.so.6 to avoid potential conflicts + # when sherpa-onnx is installed. + set_target_properties(fst PROPERTIES OUTPUT_NAME "sherpa-onnx-fst") + set_target_properties(fstfar PROPERTIES OUTPUT_NAME "sherpa-onnx-fstfar") - install(TARGETS fst - DESTINATION lib + target_include_directories(fst + PUBLIC + ${openfst_SOURCE_DIR}/src/include ) - if(KALDIFST_BUILD_PYTHON) - set_target_properties(fstscript PROPERTIES OUTPUT_NAME "kaldifst_fstscript") - install(TARGETS fstscript - DESTINATION lib - ) - endif() + target_include_directories(fstfar + PUBLIC + ${openfst_SOURCE_DIR}/src/include + ) + # installed in ./kaldi-decoder.cmake endfunction() download_openfst() diff --git a/cmake/sherpa-onnx.pc.in b/cmake/sherpa-onnx.pc.in index aae8abab..0870f3ae 100644 --- a/cmake/sherpa-onnx.pc.in +++ b/cmake/sherpa-onnx.pc.in @@ -13,4 +13,4 @@ Cflags: -I"${includedir}" # Note: -lcargs is required only for the following file # https://github.com/k2-fsa/sherpa-onnx/blob/master/c-api-examples/decode-file-c-api.c # We add it here so that users don't need to specify -lcargs when compiling decode-file-c-api.c -Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lcargs -lonnxruntime -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@ +Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fstfar -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lcargs -lonnxruntime -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@ diff --git 
a/dotnet-examples/offline-tts/Program.cs b/dotnet-examples/offline-tts/Program.cs index 497200e8..85fcd286 100644 --- a/dotnet-examples/offline-tts/Program.cs +++ b/dotnet-examples/offline-tts/Program.cs @@ -20,6 +20,9 @@ class OfflineTtsDemo [Option("tts-rule-fsts", Required = false, Default = "", HelpText = "path to rule.fst")] public string RuleFsts { get; set; } + [Option("tts-rule-fars", Required = false, Default = "", HelpText = "path to rule.far")] + public string RuleFars { get; set; } + [Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")] public string DataDir { get; set; } @@ -72,14 +75,15 @@ class OfflineTtsDemo string usage = @" # vits-aishell3 -wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 -tar xf vits-zh-aishell3.tar.bz2 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 +tar xvf vits-icefall-zh-aishell3.tar.bz2 dotnet run \ - --vits-model=./vits-zh-aishell3/vits-aishell3.onnx \ - --vits-tokens=./vits-zh-aishell3/tokens.txt \ - --vits-lexicon=./vits-zh-aishell3/lexicon.txt \ - --tts-rule-fsts=./vits-zh-aishell3/rule.fst \ + --vits-model=./vits-icefall-zh-aishell3/model.onnx \ + --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \ + --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ + --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \ + --tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \ --sid=66 \ --debug=1 \ --output-filename=./aishell3-66.wav \ @@ -127,6 +131,7 @@ to download more models. 
config.Model.Debug = options.Debug; config.Model.Provider = "cpu"; config.RuleFsts = options.RuleFsts; + config.RuleFars = options.RuleFars; config.MaxNumSentences = options.MaxNumSentences; OfflineTts tts = new OfflineTts(config); diff --git a/dotnet-examples/offline-tts/run-aishell3.sh b/dotnet-examples/offline-tts/run-aishell3.sh index 44e5e261..02380f07 100755 --- a/dotnet-examples/offline-tts/run-aishell3.sh +++ b/dotnet-examples/offline-tts/run-aishell3.sh @@ -1,18 +1,18 @@ #!/usr/bin/env bash set -ex -if [ ! -f ./vits-zh-aishell3/vits-aishell3.onnx ]; then +if [ ! -f ./vits-icefall-zh-aishell3/model.onnx ]; then - # wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 - curl -OL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 - tar xf vits-zh-aishell3.tar.bz2 - rm vits-zh-aishell3.tar.bz2 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 + tar xvf vits-icefall-zh-aishell3.tar.bz2 + rm vits-icefall-zh-aishell3.tar.bz2 fi dotnet run \ - --vits-model=./vits-zh-aishell3/vits-aishell3.onnx \ - --vits-tokens=./vits-zh-aishell3/tokens.txt \ - --vits-lexicon=./vits-zh-aishell3/lexicon.txt \ - --tts-rule-fsts=./vits-zh-aishell3/rule.fst \ + --vits-model=./vits-icefall-zh-aishell3/model.onnx \ + --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \ + --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ + --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \ + --tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \ --sid=66 \ --debug=1 \ --output-filename=./aishell3-66.wav \ - --text="这是一个语音合成测试, 写于公元 2024 年 1 月 28 号, 23点27分,星期天。" + --text="这是一个语音合成测试, 写于公元 2024 年 1 月 28 号, 23点27分,星期天。长沙长大,去过长白山和长安街。行行出状元。行行,银行行长,行业。" diff --git a/go-api-examples/non-streaming-tts/main.go b/go-api-examples/non-streaming-tts/main.go index 8faa605a..0ddeb8fe 100644 --- a/go-api-examples/non-streaming-tts/main.go +++ 
b/go-api-examples/non-streaming-tts/main.go @@ -26,6 +26,7 @@ func main() { flag.IntVar(&config.Model.Debug, "debug", 0, "Whether to show debug message") flag.StringVar(&config.Model.Provider, "provider", "cpu", "Provider to use") flag.StringVar(&config.RuleFsts, "tts-rule-fsts", "", "Path to rule.fst") + flag.StringVar(&config.RuleFars, "tts-rule-fars", "", "Path to rule.far") flag.IntVar(&config.MaxNumSentences, "tts-max-num-sentences", 1, "Batch size") flag.IntVar(&sid, "sid", 0, "Speaker ID. Used only for multi-speaker models") diff --git a/go-api-examples/non-streaming-tts/run-vits-zh-aishell3.sh b/go-api-examples/non-streaming-tts/run-vits-zh-aishell3.sh index 7d592e29..2ab0c613 100755 --- a/go-api-examples/non-streaming-tts/run-vits-zh-aishell3.sh +++ b/go-api-examples/non-streaming-tts/run-vits-zh-aishell3.sh @@ -6,21 +6,32 @@ for sid in 10 33 99; do ./non-streaming-tts \ - --vits-model=./vits-zh-aishell3/vits-aishell3.onnx \ - --vits-lexicon=./vits-zh-aishell3/lexicon.txt \ - --vits-tokens=./vits-zh-aishell3/tokens.txt \ + --vits-model=./vits-icefall-zh-aishell3/model.onnx \ + --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ + --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \ --sid=$sid \ --debug=1 \ --output-filename=./liliana-$sid.wav \ "林美丽最美丽、最漂亮、最可爱!" 
./non-streaming-tts \ - --vits-model=./vits-zh-aishell3/vits-aishell3.onnx \ - --vits-lexicon=./vits-zh-aishell3/lexicon.txt \ - --vits-tokens=./vits-zh-aishell3/tokens.txt \ - --tts-rule-fsts=./vits-zh-aishell3/rule.fst \ + --vits-model=./vits-icefall-zh-aishell3/model.onnx \ + --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ + --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \ + --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \ --sid=$sid \ --debug=1 \ --output-filename=./numbers-$sid.wav \ "数字12345.6789怎么念" + +./non-streaming-tts \ + --vits-model=./vits-icefall-zh-aishell3/model.onnx \ + --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ + --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \ + --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \ + --tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \ + --sid=$sid \ + --debug=1 \ + --output-filename=./heteronym-$sid.wav \ + "万古长存长沙长大长白山长孙长安街" done diff --git a/ios-swiftui/SherpaOnnxTts/SherpaOnnxTts/ViewModel.swift b/ios-swiftui/SherpaOnnxTts/SherpaOnnxTts/ViewModel.swift index f29de9e8..3e5c381c 100644 --- a/ios-swiftui/SherpaOnnxTts/SherpaOnnxTts/ViewModel.swift +++ b/ios-swiftui/SherpaOnnxTts/SherpaOnnxTts/ViewModel.swift @@ -7,10 +7,9 @@ import Foundation - // used to get the path to espeak-ng-data func resourceURL(to path: String) -> String { - return URL(string: path, relativeTo: Bundle.main.resourceURL)!.path + return URL(string: path, relativeTo: Bundle.main.resourceURL)!.path } func getResource(_ forResource: String, _ ofType: String) -> String { @@ -50,8 +49,7 @@ func getTtsForAishell3() -> SherpaOnnxOfflineTtsWrapper { // See the following link // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-model-aishell3 - // vits-vctk.onnx - let model = getResource("vits-aishell3", "onnx") + let model = 
getResource("model", "onnx") // lexicon.txt let lexicon = getResource("lexicon", "txt") @@ -59,9 +57,19 @@ func getTtsForAishell3() -> SherpaOnnxOfflineTtsWrapper { // tokens.txt let tokens = getResource("tokens", "txt") + // rule.fst + let ruleFsts = getResource("rule", "fst") + + // rule.far + let ruleFars = getResource("rule", "far") + let vits = sherpaOnnxOfflineTtsVitsModelConfig(model: model, lexicon: lexicon, tokens: tokens) let modelConfig = sherpaOnnxOfflineTtsModelConfig(vits: vits) - var config = sherpaOnnxOfflineTtsConfig(model: modelConfig) + var config = sherpaOnnxOfflineTtsConfig( + model: modelConfig, + ruleFsts: ruleFsts, + ruleFars: ruleFars + ) return SherpaOnnxOfflineTtsWrapper(config: &config) } @@ -69,7 +77,6 @@ func getTtsForAishell3() -> SherpaOnnxOfflineTtsWrapper { func getTtsFor_en_US_amy_low() -> SherpaOnnxOfflineTtsWrapper { // please see https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 - // vits-vctk.onnx let model = getResource("en_US-amy-low", "onnx") // tokens.txt @@ -78,7 +85,8 @@ func getTtsFor_en_US_amy_low() -> SherpaOnnxOfflineTtsWrapper { // in this case, we don't need lexicon.txt let dataDir = resourceURL(to: "espeak-ng-data") - let vits = sherpaOnnxOfflineTtsVitsModelConfig(model: model, lexicon: "", tokens: tokens, dataDir: dataDir) + let vits = sherpaOnnxOfflineTtsVitsModelConfig( + model: model, lexicon: "", tokens: tokens, dataDir: dataDir) let modelConfig = sherpaOnnxOfflineTtsModelConfig(vits: vits) var config = sherpaOnnxOfflineTtsConfig(model: modelConfig) diff --git a/mfc-examples/NonStreamingSpeechRecognition/sherpa-onnx-deps.props b/mfc-examples/NonStreamingSpeechRecognition/sherpa-onnx-deps.props index ae9e8518..e81f4b62 100644 --- a/mfc-examples/NonStreamingSpeechRecognition/sherpa-onnx-deps.props +++ b/mfc-examples/NonStreamingSpeechRecognition/sherpa-onnx-deps.props @@ -11,6 +11,7 @@ sherpa-onnx-core.lib; kaldi-decoder-core.lib; sherpa-onnx-kaldifst-core.lib; 
+ sherpa-onnx-fstfar.lib; sherpa-onnx-fst.lib; kaldi-native-fbank-core.lib; onnxruntime.lib; diff --git a/mfc-examples/NonStreamingTextToSpeech/sherpa-onnx-deps.props b/mfc-examples/NonStreamingTextToSpeech/sherpa-onnx-deps.props index ae9e8518..e81f4b62 100644 --- a/mfc-examples/NonStreamingTextToSpeech/sherpa-onnx-deps.props +++ b/mfc-examples/NonStreamingTextToSpeech/sherpa-onnx-deps.props @@ -11,6 +11,7 @@ sherpa-onnx-core.lib; kaldi-decoder-core.lib; sherpa-onnx-kaldifst-core.lib; + sherpa-onnx-fstfar.lib; sherpa-onnx-fst.lib; kaldi-native-fbank-core.lib; onnxruntime.lib; diff --git a/mfc-examples/StreamingSpeechRecognition/sherpa-onnx-deps.props b/mfc-examples/StreamingSpeechRecognition/sherpa-onnx-deps.props index ae9e8518..e81f4b62 100644 --- a/mfc-examples/StreamingSpeechRecognition/sherpa-onnx-deps.props +++ b/mfc-examples/StreamingSpeechRecognition/sherpa-onnx-deps.props @@ -11,6 +11,7 @@ sherpa-onnx-core.lib; kaldi-decoder-core.lib; sherpa-onnx-kaldifst-core.lib; + sherpa-onnx-fstfar.lib; sherpa-onnx-fst.lib; kaldi-native-fbank-core.lib; onnxruntime.lib; diff --git a/nodejs-examples/README.md b/nodejs-examples/README.md index 9c13bee5..29c93a27 100644 --- a/nodejs-examples/README.md +++ b/nodejs-examples/README.md @@ -43,8 +43,8 @@ for text-to-speech. 
You can use the following command to run it: ```bash -wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 -tar xvf vits-zh-aishell3.tar.bz2 +wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 +tar xvf vits-icefall-zh-aishell3.tar.bz2 node ./test-offline-tts-zh.js ``` diff --git a/nodejs-examples/test-offline-tts-en.js b/nodejs-examples/test-offline-tts-en.js index a8778017..c3bd67b4 100644 --- a/nodejs-examples/test-offline-tts-en.js +++ b/nodejs-examples/test-offline-tts-en.js @@ -22,6 +22,7 @@ function createOfflineTts() { let offlineTtsConfig = { offlineTtsModelConfig: offlineTtsModelConfig, ruleFsts: '', + ruleFars: '', maxNumSentences: 1, }; diff --git a/nodejs-examples/test-offline-tts-zh.js b/nodejs-examples/test-offline-tts-zh.js index bc808803..a53748c7 100644 --- a/nodejs-examples/test-offline-tts-zh.js +++ b/nodejs-examples/test-offline-tts-zh.js @@ -4,9 +4,9 @@ const sherpa_onnx = require('sherpa-onnx'); function createOfflineTts() { let offlineTtsVitsModelConfig = { - model: './vits-zh-aishell3/vits-aishell3.onnx', - lexicon: './vits-zh-aishell3/lexicon.txt', - tokens: './vits-zh-aishell3/tokens.txt', + model: './vits-icefall-zh-aishell3/model.onnx', + lexicon: './vits-icefall-zh-aishell3/lexicon.txt', + tokens: './vits-icefall-zh-aishell3/tokens.txt', dataDir: '', noiseScale: 0.667, noiseScaleW: 0.8, @@ -21,7 +21,9 @@ function createOfflineTts() { let offlineTtsConfig = { offlineTtsModelConfig: offlineTtsModelConfig, - ruleFsts: './vits-zh-aishell3/rule.fst', + ruleFsts: + './vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst,./vits-icefall-zh-aishell3/new_heteronym.fst', + ruleFars: './vits-icefall-zh-aishell3/rule.far', maxNumSentences: 1, }; diff --git a/scripts/apk/build-apk-tts-engine.sh.in b/scripts/apk/build-apk-tts-engine.sh.in index 5e46f8eb..08d57038 100644 --- 
a/scripts/apk/build-apk-tts-engine.sh.in +++ b/scripts/apk/build-apk-tts-engine.sh.in @@ -56,6 +56,11 @@ sed -i.bak s/"lang = null"/"lang = \"$lang_iso_639_3\""/ ./TtsEngine.kt sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./TtsEngine.kt {% endif %} +{% if tts_model.rule_fars %} + rule_fars={{ tts_model.rule_fars }} + sed -i.bak s%"ruleFars = null"%"ruleFars = \"$rule_fars\""% ./TtsEngine.kt +{% endif %} + {% if tts_model.data_dir %} data_dir={{ tts_model.data_dir }} sed -i.bak s%"dataDir = null"%"dataDir = \"$data_dir\""% ./TtsEngine.kt diff --git a/scripts/apk/build-apk-tts.sh.in b/scripts/apk/build-apk-tts.sh.in index a3dd6b48..5eb377e3 100644 --- a/scripts/apk/build-apk-tts.sh.in +++ b/scripts/apk/build-apk-tts.sh.in @@ -54,6 +54,11 @@ sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./MainActivity.kt sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./MainActivity.kt {% endif %} +{% if tts_model.rule_fars %} + rule_fars={{ tts_model.rule_fars }} + sed -i.bak s%"ruleFars = null"%"ruleFars = \"$rule_fars\""% ./MainActivity.kt +{% endif %} + {% if tts_model.data_dir %} data_dir={{ tts_model.data_dir }} sed -i.bak s%"dataDir = null"%"dataDir = \"$data_dir\""% ./MainActivity.kt diff --git a/scripts/apk/generate-tts-apk-script.py b/scripts/apk/generate-tts-apk-script.py index 2c539d79..1221c4d3 100755 --- a/scripts/apk/generate-tts-apk-script.py +++ b/scripts/apk/generate-tts-apk-script.py @@ -33,6 +33,7 @@ class TtsModel: model_name: str = "" lang: str = "" # en, zh, fr, de, etc. 
rule_fsts: Optional[List[str]] = None + rule_fars: Optional[List[str]] = None data_dir: Optional[str] = None is_char: bool = False lang_iso_639_3: str = "" @@ -241,98 +242,94 @@ def get_mimic3_models() -> List[TtsModel]: def get_vits_models() -> List[TtsModel]: - return [ + chinese_models = [ # Chinese TtsModel( model_dir="vits-icefall-zh-aishell3", model_name="model.onnx", lang="zh", - rule_fsts="vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/rule.fst", + rule_fsts="vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst,vits-icefall-zh-aishell3/new_heteronym.fst", + rule_fars="vits-icefall-zh-aishell3/rule.far", ), TtsModel( model_dir="vits-zh-aishell3", model_name="vits-aishell3.onnx", lang="zh", - rule_fsts="vits-zh-aishell3/rule.fst", ), TtsModel( model_dir="vits-zh-hf-doom", model_name="doom.onnx", lang="zh", - rule_fsts="vits-zh-hf-doom/rule.fst", ), TtsModel( model_dir="vits-zh-hf-echo", model_name="echo.onnx", lang="zh", - rule_fsts="vits-zh-hf-echo/rule.fst", ), TtsModel( model_dir="vits-zh-hf-zenyatta", model_name="zenyatta.onnx", lang="zh", - rule_fsts="vits-zh-hf-zenyatta/rule.fst", ), TtsModel( model_dir="vits-zh-hf-abyssinvoker", model_name="abyssinvoker.onnx", lang="zh", - rule_fsts="vits-zh-hf-abyssinvoker/rule.fst", ), TtsModel( model_dir="vits-zh-hf-keqing", model_name="keqing.onnx", lang="zh", - rule_fsts="vits-zh-hf-keqing/rule.fst", ), TtsModel( model_dir="vits-zh-hf-eula", model_name="eula.onnx", lang="zh", - rule_fsts="vits-zh-hf-eula/rule.fst", ), TtsModel( model_dir="vits-zh-hf-bronya", model_name="bronya.onnx", lang="zh", - rule_fsts="vits-zh-hf-bronya/rule.fst", ), TtsModel( model_dir="vits-zh-hf-theresa", model_name="theresa.onnx", lang="zh", - rule_fsts="vits-zh-hf-theresa/rule.fst", ), TtsModel( model_dir="vits-zh-hf-fanchen-wnj", model_name="vits-zh-hf-fanchen-wnj.onnx", lang="zh", - rule_fsts="vits-zh-hf-fanchen-wnj/rule.fst", ), 
TtsModel( model_dir="vits-zh-hf-fanchen-C", model_name="vits-zh-hf-fanchen-C.onnx", lang="zh", - rule_fsts="vits-zh-hf-fanchen-C/rule.fst", ), TtsModel( model_dir="vits-zh-hf-fanchen-ZhiHuiLaoZhe", model_name="vits-zh-hf-fanchen-ZhiHuiLaoZhe.onnx", lang="zh", - rule_fsts="vits-zh-hf-fanchen-ZhiHuiLaoZhe/rule.fst", ), TtsModel( model_dir="vits-zh-hf-fanchen-ZhiHuiLaoZhe_new", model_name="vits-zh-hf-fanchen-ZhiHuiLaoZhe_new.onnx", lang="zh", - rule_fsts="vits-zh-hf-fanchen-ZhiHuiLaoZhe_new/rule.fst", ), TtsModel( model_dir="vits-zh-hf-fanchen-unity", model_name="vits-zh-hf-fanchen-unity.onnx", lang="zh", - rule_fsts="vits-zh-hf-fanchen-unity/rule.fst", ), + ] + + rule_fsts = ["phone.fst", "date.fst", "number.fst", "new_heteronym.fst"] + for m in chinese_models: + s = [f"{m.model_dir}/{r}" for r in rule_fsts] + m.rule_fsts = ",".join(s) + m.rule_fars = f"{m.model_dir}/rule.far" + + all_models = chinese_models + [ TtsModel( model_dir="vits-cantonese-hf-xiaomaiiwn", model_name="vits-cantonese-hf-xiaomaiiwn.onnx", @@ -346,6 +343,8 @@ def get_vits_models() -> List[TtsModel]: # fmt: on ] + return all_models + def main(): args = get_args() diff --git a/scripts/dotnet/generate.py b/scripts/dotnet/generate.py index f24353f6..5268211b 100755 --- a/scripts/dotnet/generate.py +++ b/scripts/dotnet/generate.py @@ -40,6 +40,7 @@ def process_linux(s): "libpiper_phonemize.so.1", "libsherpa-onnx-c-api.so", "libsherpa-onnx-core.so", + "libsherpa-onnx-fstfar.so.7", "libsherpa-onnx-fst.so.6", "libsherpa-onnx-kaldifst-core.so", "libucd.so", @@ -68,6 +69,7 @@ def process_macos(s): "libpiper_phonemize.1.dylib", "libsherpa-onnx-c-api.dylib", "libsherpa-onnx-core.dylib", + "libsherpa-onnx-fstfar.7.dylib", "libsherpa-onnx-fst.6.dylib", "libsherpa-onnx-kaldifst-core.dylib", "libucd.dylib", @@ -96,6 +98,7 @@ def process_windows(s, rid): "piper_phonemize.dll", "sherpa-onnx-c-api.dll", "sherpa-onnx-core.dll", + "sherpa-onnx-fstfar.lib", "sherpa-onnx-fst.lib", "sherpa-onnx-kaldifst-core.lib", 
"ucd.dll", diff --git a/scripts/dotnet/offline.cs b/scripts/dotnet/offline.cs index 1a8612f3..c885ca5b 100644 --- a/scripts/dotnet/offline.cs +++ b/scripts/dotnet/offline.cs @@ -67,6 +67,7 @@ namespace SherpaOnnx Model = new OfflineTtsModelConfig(); RuleFsts = ""; MaxNumSentences = 1; + RuleFars = ""; } public OfflineTtsModelConfig Model; @@ -74,6 +75,9 @@ namespace SherpaOnnx public string RuleFsts; public int MaxNumSentences; + + [MarshalAs(UnmanagedType.LPStr)] + public string RuleFars; } public class OfflineTtsGeneratedAudio diff --git a/scripts/dotnet/run.sh b/scripts/dotnet/run.sh index d723a2d8..5bd8627c 100755 --- a/scripts/dotnet/run.sh +++ b/scripts/dotnet/run.sh @@ -41,6 +41,7 @@ if [ ! -f /tmp/linux/libsherpa-onnx-core.so ]; then cd .. rm -v libpiper_phonemize.so libpiper_phonemize.so.1.2.0 rm -v libsherpa-onnx-fst.so + rm -v libsherpa-onnx-fstfar.so rm -v libonnxruntime.so rm -v libcargs.so rm -rf wheel @@ -67,6 +68,7 @@ if [ ! -f /tmp/macos/libsherpa-onnx-core.dylib ]; then rm -v libonnxruntime.dylib rm -v libpiper_phonemize.1.2.0.dylib libpiper_phonemize.dylib rm -v libsherpa-onnx-fst.dylib + rm -v libsherpa-onnx-fstfar.dylib rm -rf wheel ls -lh cd .. 
diff --git a/scripts/go/_internal/build_darwin_amd64.go b/scripts/go/_internal/build_darwin_amd64.go index 577dfa95..29d1bd68 100644 --- a/scripts/go/_internal/build_darwin_amd64.go +++ b/scripts/go/_internal/build_darwin_amd64.go @@ -2,5 +2,5 @@ package sherpa_onnx -// #cgo LDFLAGS: -L ${SRCDIR}/lib/x86_64-apple-darwin -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-native-fbank-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lpiper_phonemize -lespeak-ng -lucd -lonnxruntime -Wl,-rpath,${SRCDIR}/lib/x86_64-apple-darwin +// #cgo LDFLAGS: -L ${SRCDIR}/lib/x86_64-apple-darwin -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-native-fbank-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fstfar -lsherpa-onnx-fst -lpiper_phonemize -lespeak-ng -lucd -lonnxruntime -Wl,-rpath,${SRCDIR}/lib/x86_64-apple-darwin import "C" diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go index 361d9775..99ecd84d 100644 --- a/scripts/go/sherpa_onnx.go +++ b/scripts/go/sherpa_onnx.go @@ -554,6 +554,7 @@ type OfflineTtsModelConfig struct { type OfflineTtsConfig struct { Model OfflineTtsModelConfig RuleFsts string + RuleFars string MaxNumSentences int } @@ -583,6 +584,9 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts { c.rule_fsts = C.CString(config.RuleFsts) defer C.free(unsafe.Pointer(c.rule_fsts)) + c.rule_fars = C.CString(config.RuleFars) + defer C.free(unsafe.Pointer(c.rule_fars)) + c.max_num_sentences = C.int(config.MaxNumSentences) c.model.vits.model = C.CString(config.Model.Vits.Model) diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index 9292687a..c349dd3f 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -818,6 +818,7 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( tts_config.model.debug = config->model.debug; tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu"); tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); + tts_config.rule_fars 
= SHERPA_ONNX_OR(config->rule_fars, ""); tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 2); if (tts_config.model.debug) { diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 78641f9b..276b3590 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -783,6 +783,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig { SherpaOnnxOfflineTtsModelConfig model; const char *rule_fsts; int32_t max_num_sentences; + const char *rule_fars; } SherpaOnnxOfflineTtsConfig; SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio { diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt index 1ebdc626..423a777f 100644 --- a/sherpa-onnx/csrc/CMakeLists.txt +++ b/sherpa-onnx/csrc/CMakeLists.txt @@ -164,6 +164,7 @@ endif() if(SHERPA_ONNX_ENABLE_TTS) target_link_libraries(sherpa-onnx-core piper_phonemize) + target_link_libraries(sherpa-onnx-core fstfar fst) endif() if(SHERPA_ONNX_ENABLE_CHECK) diff --git a/sherpa-onnx/csrc/lexicon.cc b/sherpa-onnx/csrc/lexicon.cc index 14c3d37a..e3a87eba 100644 --- a/sherpa-onnx/csrc/lexicon.cc +++ b/sherpa-onnx/csrc/lexicon.cc @@ -18,7 +18,6 @@ #endif #include -#include // NOLINT #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/onnx-utils.h" @@ -26,6 +25,55 @@ namespace sherpa_onnx { +static std::vector ProcessHeteronyms( + const std::vector &words) { + std::vector ans; + ans.reserve(words.size()); + + int32_t num_words = static_cast(words.size()); + int32_t i = 0; + int32_t prev = -1; + while (i < num_words) { + // start of a phrase #$| + if ((i + 2 < num_words) && words[i] == "#" && words[i + 1] == "$" && + words[i + 2] == "|") { + if (prev == -1) { + prev = i + 3; + } + i = i + 3; + continue; + } + + // end of a phrase |$# + if ((i + 2 < num_words) && words[i] == "|" && words[i + 1] == "$" && + words[i + 2] == "#") { + if (prev != -1) { + std::ostringstream os; + for (int32_t k = prev; k < i; ++k) { + if (words[k] != "|" && words[k] != "$" 
&& words[k] != "#") { + os << words[k]; + } + } + ans.push_back(os.str()); + + prev = -1; + } + + i += 3; + continue; + } + + if (prev == -1) { + // not inside a phrase + ans.push_back(words[i]); + } + + ++i; + } + + return ans; +} + static void ToLowerCase(std::string *in_out) { std::transform(in_out->begin(), in_out->end(), in_out->begin(), [](unsigned char c) { return std::tolower(c); }); @@ -148,36 +196,9 @@ std::vector> Lexicon::ConvertTextToTokenIdsChinese( const std::string &_text) const { std::string text(_text); ToLowerCase(&text); - std::vector words; - if (pattern_) { - // Handle polyphones - size_t pos = 0; - auto begin = std::sregex_iterator(text.begin(), text.end(), *pattern_); - auto end = std::sregex_iterator(); - for (std::sregex_iterator i = begin; i != end; ++i) { - std::smatch match = *i; - if (pos < match.position()) { - auto this_segment = text.substr(pos, match.position() - pos); - auto this_segment_words = SplitUtf8(this_segment); - words.insert(words.end(), this_segment_words.begin(), - this_segment_words.end()); - pos = match.position() + match.length(); - } else if (pos == match.position()) { - pos = match.position() + match.length(); - } - words.push_back(match.str()); - } - - if (pos < text.size()) { - auto this_segment = text.substr(pos, text.size() - pos); - auto this_segment_words = SplitUtf8(this_segment); - words.insert(words.end(), this_segment_words.begin(), - this_segment_words.end()); - } - } else { - words = SplitUtf8(text); - } + std::vector words = SplitUtf8(text); + words = ProcessHeteronyms(words); if (debug_) { fprintf(stderr, "Input text in string: %s\n", text.c_str()); @@ -357,9 +378,6 @@ void Lexicon::InitLexicon(std::istream &is) { std::string line; std::string phone; - std::ostringstream os; - std::string sep; - while (std::getline(is, line)) { std::istringstream iss(line); @@ -381,18 +399,9 @@ void Lexicon::InitLexicon(std::istream &is) { if (ids.empty()) { continue; } - if (language_ == Language::kChinese && 
word.size() > 3) { - // this is not a single word; - os << sep << word; - sep = "|"; - } word2ids_.insert({std::move(word), std::move(ids)}); } - - if (!sep.empty()) { - pattern_ = std::make_unique(os.str()); - } } void Lexicon::InitPunctuations(const std::string &punctuations) { diff --git a/sherpa-onnx/csrc/lexicon.h b/sherpa-onnx/csrc/lexicon.h index 97b0ff7b..e26a2dec 100644 --- a/sherpa-onnx/csrc/lexicon.h +++ b/sherpa-onnx/csrc/lexicon.h @@ -7,7 +7,6 @@ #include #include -#include // NOLINT #include #include #include @@ -65,9 +64,6 @@ class Lexicon : public OfflineTtsFrontend { std::unordered_map token2id_; Language language_; bool debug_; - - // for Chinese polyphones - std::unique_ptr pattern_; }; } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-tts-vits-impl.h b/sherpa-onnx/csrc/offline-tts-vits-impl.h index 6bcfc0ca..f7ddd8e4 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-impl.h +++ b/sherpa-onnx/csrc/offline-tts-vits-impl.h @@ -15,6 +15,9 @@ #include "android/asset_manager.h" #include "android/asset_manager_jni.h" #endif + +#include "fst/extensions/far/far.h" +#include "kaldifst/csrc/kaldi-fst-io.h" #include "kaldifst/csrc/text-normalizer.h" #include "sherpa-onnx/csrc/lexicon.h" #include "sherpa-onnx/csrc/macros.h" @@ -46,6 +49,32 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { tn_list_.push_back(std::make_unique(f)); } } + + if (!config.rule_fars.empty()) { + if (config.model.debug) { + SHERPA_ONNX_LOGE("Loading FST archives"); + } + std::vector files; + SplitStringToVector(config.rule_fars, ",", false, &files); + for (const auto &f : files) { + if (config.model.debug) { + SHERPA_ONNX_LOGE("rule far: %s", f.c_str()); + } + std::unique_ptr> reader( + fst::FarReader::Open(f)); + for (; !reader->Done(); reader->Next()) { + std::unique_ptr r( + fst::CastOrConvertToConstFst(reader->GetFst()->Copy())); + + tn_list_.push_back( + std::make_unique(std::move(r))); + } + } + + if (config.model.debug) { + SHERPA_ONNX_LOGE("FST archives 
loaded!"); + } + } } #if __ANDROID_API__ >= 9 diff --git a/sherpa-onnx/csrc/offline-tts.cc b/sherpa-onnx/csrc/offline-tts.cc index 6f7e472a..34d4a39c 100644 --- a/sherpa-onnx/csrc/offline-tts.cc +++ b/sherpa-onnx/csrc/offline-tts.cc @@ -20,7 +20,14 @@ void OfflineTtsConfig::Register(ParseOptions *po) { "It not empty, it contains a list of rule FST filenames." "Multiple filenames are separated by a comma and they are " "applied from left to right. An example value: " - "rule1.fst,rule2,fst,rule3.fst"); + "rule1.fst,rule2.fst,rule3.fst"); + + po->Register("tts-rule-fars", &rule_fars, + "It not empty, it contains a list of rule FST archive filenames." + "Multiple filenames are separated by a comma and they are " + "applied from left to right. An example value: " + "rule1.far,rule2.far,rule3.far. Note that an *.far can contain " + "multiple *.fst files"); po->Register( "tts-max-num-sentences", &max_num_sentences, @@ -41,6 +48,17 @@ bool OfflineTtsConfig::Validate() const { } } + if (!rule_fars.empty()) { + std::vector files; + SplitStringToVector(rule_fars, ",", false, &files); + for (const auto &f : files) { + if (!FileExists(f)) { + SHERPA_ONNX_LOGE("Rule far %s does not exist. ", f.c_str()); + return false; + } + } + } + return model.Validate(); } @@ -50,6 +68,7 @@ std::string OfflineTtsConfig::ToString() const { os << "OfflineTtsConfig("; os << "model=" << model.ToString() << ", "; os << "rule_fsts=\"" << rule_fsts << "\", "; + os << "rule_fars=\"" << rule_fars << "\", "; os << "max_num_sentences=" << max_num_sentences << ")"; return os.str(); diff --git a/sherpa-onnx/csrc/offline-tts.h b/sherpa-onnx/csrc/offline-tts.h index 354057bf..0f4cd121 100644 --- a/sherpa-onnx/csrc/offline-tts.h +++ b/sherpa-onnx/csrc/offline-tts.h @@ -29,6 +29,9 @@ struct OfflineTtsConfig { // If there are multiple rules, they are applied from left to right. std::string rule_fsts; + // If there are multiple FST archives, they are applied from left to right. 
+ std::string rule_fars; + // Maximum number of sentences that we process at a time. // This is to avoid OOM for very long input text. // If you set it to -1, then we process all sentences in a single batch. @@ -36,9 +39,11 @@ struct OfflineTtsConfig { OfflineTtsConfig() = default; OfflineTtsConfig(const OfflineTtsModelConfig &model, - const std::string &rule_fsts, int32_t max_num_sentences) + const std::string &rule_fsts, const std::string &rule_fars, + int32_t max_num_sentences) : model(model), rule_fsts(rule_fsts), + rule_fars(rule_fars), max_num_sentences(max_num_sentences) {} void Register(ParseOptions *po); diff --git a/sherpa-onnx/jni/jni.cc b/sherpa-onnx/jni/jni.cc index 281fd4ee..23596e97 100644 --- a/sherpa-onnx/jni/jni.cc +++ b/sherpa-onnx/jni/jni.cc @@ -878,6 +878,13 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { ans.rule_fsts = p; env->ReleaseStringUTFChars(s, p); + // for ruleFars + fid = env->GetFieldID(cls, "ruleFars", "Ljava/lang/String;"); + s = (jstring)env->GetObjectField(config, fid); + p = env->GetStringUTFChars(s, nullptr); + ans.rule_fars = p; + env->ReleaseStringUTFChars(s, p); + fid = env->GetFieldID(cls, "maxNumSentences", "I"); ans.max_num_sentences = env->GetIntField(config, fid); diff --git a/sherpa-onnx/python/csrc/offline-tts.cc b/sherpa-onnx/python/csrc/offline-tts.cc index ff31ded9..dad33092 100644 --- a/sherpa-onnx/python/csrc/offline-tts.cc +++ b/sherpa-onnx/python/csrc/offline-tts.cc @@ -32,11 +32,12 @@ static void PybindOfflineTtsConfig(py::module *m) { py::class_(*m, "OfflineTtsConfig") .def(py::init<>()) .def(py::init(), + const std::string &, int32_t>(), py::arg("model"), py::arg("rule_fsts") = "", - py::arg("max_num_sentences") = 2) + py::arg("rule_fars") = "", py::arg("max_num_sentences") = 2) .def_readwrite("model", &PyClass::model) .def_readwrite("rule_fsts", &PyClass::rule_fsts) + .def_readwrite("rule_fars", &PyClass::rule_fars) .def_readwrite("max_num_sentences", 
&PyClass::max_num_sentences) .def("validate", &PyClass::Validate) .def("__str__", &PyClass::ToString); diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift index b463c866..69d97785 100644 --- a/swift-api-examples/SherpaOnnx.swift +++ b/swift-api-examples/SherpaOnnx.swift @@ -652,12 +652,14 @@ func sherpaOnnxOfflineTtsModelConfig( func sherpaOnnxOfflineTtsConfig( model: SherpaOnnxOfflineTtsModelConfig, ruleFsts: String = "", + ruleFars: String = "", maxNumSenetences: Int = 2 ) -> SherpaOnnxOfflineTtsConfig { return SherpaOnnxOfflineTtsConfig( model: model, rule_fsts: toCPointer(ruleFsts), - max_num_sentences: Int32(maxNumSenetences) + max_num_sentences: Int32(maxNumSenetences), + rule_fars: toCPointer(ruleFars) ) } diff --git a/wasm/tts/sherpa-onnx-tts.js b/wasm/tts/sherpa-onnx-tts.js index c291d8a4..03017784 100644 --- a/wasm/tts/sherpa-onnx-tts.js +++ b/wasm/tts/sherpa-onnx-tts.js @@ -90,7 +90,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { function initSherpaOnnxOfflineTtsConfig(config, Module) { const modelConfig = initSherpaOnnxOfflineTtsModelConfig(config.offlineTtsModelConfig, Module); - const len = modelConfig.len + 2 * 4; + const len = modelConfig.len + 3 * 4; const ptr = Module._malloc(len); let offset = 0; @@ -98,12 +98,19 @@ function initSherpaOnnxOfflineTtsConfig(config, Module) { offset += modelConfig.len; const ruleFstsLen = Module.lengthBytesUTF8(config.ruleFsts) + 1; - const buffer = Module._malloc(ruleFstsLen); + const ruleFarsLen = Module.lengthBytesUTF8(config.ruleFars) + 1; + + const buffer = Module._malloc(ruleFstsLen + ruleFarsLen); Module.stringToUTF8(config.ruleFsts, buffer, ruleFstsLen); + Module.stringToUTF8(config.ruleFars, buffer + ruleFstsLen, ruleFarsLen); + Module.setValue(ptr + offset, buffer, 'i8*'); offset += 4; Module.setValue(ptr + offset, config.maxNumSentences, 'i32'); + offset += 4; + + Module.setValue(ptr + offset, buffer + ruleFstsLen, 'i8*'); return { buffer: buffer, 
ptr: ptr, len: len, config: modelConfig, @@ -190,6 +197,7 @@ function createOfflineTts(Module, myConfig) { let offlineTtsConfig = { offlineTtsModelConfig: offlineTtsModelConfig, ruleFsts: '', + ruleFars: '', maxNumSentences: 1, } diff --git a/wasm/tts/sherpa-onnx-wasm-main-tts.cc b/wasm/tts/sherpa-onnx-wasm-main-tts.cc index 71701419..83090dc7 100644 --- a/wasm/tts/sherpa-onnx-wasm-main-tts.cc +++ b/wasm/tts/sherpa-onnx-wasm-main-tts.cc @@ -18,7 +18,7 @@ static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + 3 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == - sizeof(SherpaOnnxOfflineTtsModelConfig) + 2 * 4, + sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4, ""); void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { @@ -40,6 +40,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { fprintf(stdout, "----------tts config----------\n"); fprintf(stdout, "rule_fsts: %s\n", tts_config->rule_fsts); + fprintf(stdout, "rule_fars: %s\n", tts_config->rule_fars); fprintf(stdout, "max num sentences: %d\n", tts_config->max_num_sentences); }