From 6686c7d3e673773ec0df7800f20412cf9274c300 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Thu, 25 Apr 2024 12:28:31 +0800
Subject: [PATCH] Add dict_dir arg to c api to support Chinese TTS models using jieba (#809)

---
Review notes (text in this position is not part of the commit):

* dict_dir is appended as the last member of
  SherpaOnnxOfflineTtsVitsModelConfig in c-api.h; the C#, Swift and wasm/JS
  bindings touched below append their counterpart in the same position.
* The MFC example lists ./number.fst with the same "./" prefix that its
  Exists() check and the other two rule FST entries use.
* NOTE(review): wasm/tts/sherpa-onnx-tts.js now writes a pointer at ptr + 28
  and the wasm static_assert expects 8 * 4 bytes for the VITS config. The
  allocation of `ptr` lies outside the hunks below; confirm that it reserves
  at least 32 bytes.

 CMakeLists.txt                                      |  2 +-
 .../NonStreamingTextToSpeechDlg.cpp                 | 13 +++++++++++++
 nodejs-examples/test-offline-tts-en.js              |  1 +
 nodejs-examples/test-offline-tts-zh.js              |  1 +
 scripts/dotnet/offline.cs                           |  5 +++++
 scripts/go/sherpa_onnx.go                           |  6 +++++-
 sherpa-onnx/c-api/c-api.cc                          |  2 ++
 sherpa-onnx/c-api/c-api.h                           |  1 +
 swift-api-examples/SherpaOnnx.swift                 |  6 ++++--
 wasm/nodejs/sherpa-onnx-wasm-nodejs.cc              |  1 +
 wasm/tts/sherpa-onnx-tts.js                         | 13 ++++++++++++-
 wasm/tts/sherpa-onnx-wasm-main-tts.cc               |  3 ++-
 12 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f2d2c72b..99336706 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 project(sherpa-onnx)
 
-set(SHERPA_ONNX_VERSION "1.9.22")
+set(SHERPA_ONNX_VERSION "1.9.23")
 
 # Disable warning about
 #
diff --git a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.cpp b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.cpp
index 05a84199..333a8667 100644
--- a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.cpp
+++ b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.cpp
@@ -470,6 +470,19 @@ void CNonStreamingTextToSpeechDlg::Init() {
   } else if (Exists("./lexicon.txt")) {
     config.model.vits.lexicon = "./lexicon.txt";
   }
+
+  if (Exists("./dict/jieba.dict.utf8")) {
+    config.model.vits.dict_dir = "./dict";
+  }
+
+  if (Exists("./phone.fst") && Exists("./date.fst") && Exists("./number.fst")) {
+    config.rule_fsts = "./phone.fst,./date.fst,./number.fst";
+  }
+
+  if (Exists("./rule.far")) {
+    config.rule_fars = "./rule.far";
+  }
+
   config.model.vits.tokens = "./tokens.txt";
 
   tts_ = SherpaOnnxCreateOfflineTts(&config);
diff --git a/nodejs-examples/test-offline-tts-en.js b/nodejs-examples/test-offline-tts-en.js
index c3bd67b4..b38b4aa0 100644
--- a/nodejs-examples/test-offline-tts-en.js
+++ b/nodejs-examples/test-offline-tts-en.js
@@ -8,6 +8,7 @@ function createOfflineTts() {
     lexicon: '',
     tokens: './vits-piper-en_US-amy-low/tokens.txt',
     dataDir: './vits-piper-en_US-amy-low/espeak-ng-data',
+    dictDir: '',
     noiseScale: 0.667,
     noiseScaleW: 0.8,
     lengthScale: 1.0,
diff --git a/nodejs-examples/test-offline-tts-zh.js b/nodejs-examples/test-offline-tts-zh.js
index d777d490..dd93fb07 100644
--- a/nodejs-examples/test-offline-tts-zh.js
+++ b/nodejs-examples/test-offline-tts-zh.js
@@ -8,6 +8,7 @@ function createOfflineTts() {
     lexicon: './vits-icefall-zh-aishell3/lexicon.txt',
     tokens: './vits-icefall-zh-aishell3/tokens.txt',
     dataDir: '',
+    dictDir: '',
     noiseScale: 0.667,
     noiseScaleW: 0.8,
     lengthScale: 1.0,
diff --git a/scripts/dotnet/offline.cs b/scripts/dotnet/offline.cs
index c885ca5b..c0b7e1d4 100644
--- a/scripts/dotnet/offline.cs
+++ b/scripts/dotnet/offline.cs
@@ -23,6 +23,8 @@ namespace SherpaOnnx
       NoiseScale = 0.667F;
       NoiseScaleW = 0.8F;
       LengthScale = 1.0F;
+
+      DictDir = "";
     }
     [MarshalAs(UnmanagedType.LPStr)]
     public string Model;
@@ -39,6 +41,9 @@ namespace SherpaOnnx
     public float NoiseScale;
     public float NoiseScaleW;
     public float LengthScale;
+
+    [MarshalAs(UnmanagedType.LPStr)]
+    public string DictDir;
   }
 
   [StructLayout(LayoutKind.Sequential)]
diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go
index 99ecd84d..1feb3c2a 100644
--- a/scripts/go/sherpa_onnx.go
+++ b/scripts/go/sherpa_onnx.go
@@ -532,10 +532,11 @@ type OfflineTtsVitsModelConfig struct {
 	Model       string  // Path to the VITS onnx model
 	Lexicon     string  // Path to lexicon.txt
 	Tokens      string  // Path to tokens.txt
-	DataDir     string  // Path to tokens.txt
+	DataDir     string  // Path to espeak-ng-data directory
 	NoiseScale  float32 // noise scale for vits models. Please use 0.667 in general
 	NoiseScaleW float32 // noise scale for vits models. Please use 0.8 in general
 	LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed
+	DictDir     string  // Path to dict directory for jieba (used only in Chinese tts)
 }
 
 type OfflineTtsModelConfig struct {
@@ -605,6 +606,9 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
 	c.model.vits.noise_scale_w = C.float(config.Model.Vits.NoiseScaleW)
 	c.model.vits.length_scale = C.float(config.Model.Vits.LengthScale)
 
+	c.model.vits.dict_dir = C.CString(config.Model.Vits.DictDir)
+	defer C.free(unsafe.Pointer(c.model.vits.dict_dir))
+
 	c.model.num_threads = C.int(config.Model.NumThreads)
 	c.model.debug = C.int(config.Model.Debug)
 
diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc
index 086bd42a..8cdf58e7 100644
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -818,6 +818,8 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
       SHERPA_ONNX_OR(config->model.vits.noise_scale_w, 0.8);
   tts_config.model.vits.length_scale =
       SHERPA_ONNX_OR(config->model.vits.length_scale, 1.0);
+  tts_config.model.vits.dict_dir =
+      SHERPA_ONNX_OR(config->model.vits.dict_dir, "");
 
   tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1);
   tts_config.model.debug = config->model.debug;
diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h
index 8c9af764..3109890e 100644
--- a/sherpa-onnx/c-api/c-api.h
+++ b/sherpa-onnx/c-api/c-api.h
@@ -772,6 +772,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsVitsModelConfig {
   float noise_scale;
   float noise_scale_w;
   float length_scale;  // < 1, faster in speed; > 1, slower in speed
+  const char *dict_dir;
 } SherpaOnnxOfflineTtsVitsModelConfig;
 
 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig {
diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift
index 69d97785..97b4e556 100644
--- a/swift-api-examples/SherpaOnnx.swift
+++ b/swift-api-examples/SherpaOnnx.swift
@@ -623,7 +623,8 @@ func sherpaOnnxOfflineTtsVitsModelConfig(
   dataDir: String = "",
   noiseScale: Float = 0.667,
   noiseScaleW: Float = 0.8,
-  lengthScale: Float = 1.0
+  lengthScale: Float = 1.0,
+  dictDir: String = ""
 ) -> SherpaOnnxOfflineTtsVitsModelConfig {
   return SherpaOnnxOfflineTtsVitsModelConfig(
     model: toCPointer(model),
@@ -632,7 +633,8 @@ func sherpaOnnxOfflineTtsVitsModelConfig(
     data_dir: toCPointer(dataDir),
     noise_scale: noiseScale,
     noise_scale_w: noiseScaleW,
-    length_scale: lengthScale)
+    length_scale: lengthScale,
+    dict_dir: toCPointer(dictDir))
 }
 
 func sherpaOnnxOfflineTtsModelConfig(
diff --git a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc
index 22f770ae..cd7e0c50 100644
--- a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc
+++ b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc
@@ -43,6 +43,7 @@ void PrintOfflineTtsConfig(SherpaOnnxOfflineTtsConfig *tts_config) {
   fprintf(stdout, "noise scale: %.3f\n", vits_model_config->noise_scale);
   fprintf(stdout, "noise scale w: %.3f\n", vits_model_config->noise_scale_w);
   fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale);
+  fprintf(stdout, "dict_dir: %s\n", vits_model_config->dict_dir);
 
   fprintf(stdout, "----------tts model config----------\n");
   fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads);
diff --git a/wasm/tts/sherpa-onnx-tts.js b/wasm/tts/sherpa-onnx-tts.js
index 03017784..2ee02473 100644
--- a/wasm/tts/sherpa-onnx-tts.js
+++ b/wasm/tts/sherpa-onnx-tts.js
@@ -18,7 +18,12 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) {
   const tokensLen = Module.lengthBytesUTF8(config.tokens) + 1;
   const dataDirLen = Module.lengthBytesUTF8(config.dataDir) + 1;
 
-  const n = modelLen + lexiconLen + tokensLen + dataDirLen;
+  if (!('dictDir' in config)) {
+    config.dictDir = '';
+  }
+  const dictDirLen = Module.lengthBytesUTF8(config.dictDir) + 1;
+
+  const n = modelLen + lexiconLen + tokensLen + dataDirLen + dictDirLen;
 
   const buffer = Module._malloc(n);
 
@@ -38,6 +43,9 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) {
   Module.stringToUTF8(config.dataDir, buffer + offset, dataDirLen);
   offset += dataDirLen;
 
+  Module.stringToUTF8(config.dictDir, buffer + offset, dictDirLen);
+  offset += dictDirLen;
+
   offset = 0;
   Module.setValue(ptr, buffer + offset, 'i8*');
   offset += modelLen;
@@ -54,6 +62,8 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) {
   Module.setValue(ptr + 16, config.noiseScale, 'float');
   Module.setValue(ptr + 20, config.noiseScaleW, 'float');
   Module.setValue(ptr + 24, config.lengthScale, 'float');
+  Module.setValue(ptr + 28, buffer + offset, 'i8*');
+  offset += dictDirLen;
 
   return {
     buffer: buffer, ptr: ptr, len: len,
@@ -184,6 +194,7 @@ function createOfflineTts(Module, myConfig) {
     lexicon: '',
     tokens: './tokens.txt',
     dataDir: './espeak-ng-data',
+    dictDir: '',
     noiseScale: 0.667,
     noiseScaleW: 0.8,
     lengthScale: 1.0,
diff --git a/wasm/tts/sherpa-onnx-wasm-main-tts.cc b/wasm/tts/sherpa-onnx-wasm-main-tts.cc
index 83090dc7..872a1c85 100644
--- a/wasm/tts/sherpa-onnx-wasm-main-tts.cc
+++ b/wasm/tts/sherpa-onnx-wasm-main-tts.cc
@@ -13,7 +13,7 @@
 
 extern "C" {
 
-static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 7 * 4, "");
+static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, "");
 static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
                   sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + 3 * 4,
               "");
@@ -32,6 +32,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
   fprintf(stdout, "noise scale: %.3f\n", vits_model_config->noise_scale);
   fprintf(stdout, "noise scale w: %.3f\n", vits_model_config->noise_scale_w);
   fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale);
+  fprintf(stdout, "dict_dir: %s\n", vits_model_config->dict_dir);
 
   fprintf(stdout, "----------tts model config----------\n");
   fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads);