Add C# and JavaScript (wasm) API for MatchaTTS models (#1682)
This commit is contained in:
32
.github/scripts/test-dot-net.sh
vendored
32
.github/scripts/test-dot-net.sh
vendored
@@ -2,7 +2,27 @@
|
|||||||
|
|
||||||
cd dotnet-examples/
|
cd dotnet-examples/
|
||||||
|
|
||||||
cd ./offline-speaker-diarization
|
cd ./offline-tts
|
||||||
|
./run-matcha-zh.sh
|
||||||
|
ls -lh *.wav
|
||||||
|
./run-matcha-en.sh
|
||||||
|
ls -lh *.wav
|
||||||
|
./run-aishell3.sh
|
||||||
|
ls -lh *.wav
|
||||||
|
./run-piper.sh
|
||||||
|
ls -lh *.wav
|
||||||
|
./run-hf-fanchen.sh
|
||||||
|
ls -lh *.wav
|
||||||
|
ls -lh
|
||||||
|
|
||||||
|
pushd ../..
|
||||||
|
|
||||||
|
mkdir tts
|
||||||
|
|
||||||
|
cp dotnet-examples/offline-tts/*.wav ./tts
|
||||||
|
popd
|
||||||
|
|
||||||
|
cd ../offline-speaker-diarization
|
||||||
./run.sh
|
./run.sh
|
||||||
rm -rfv *.onnx
|
rm -rfv *.onnx
|
||||||
rm -fv *.wav
|
rm -fv *.wav
|
||||||
@@ -76,14 +96,4 @@ cd ../spoken-language-identification
|
|||||||
./run.sh
|
./run.sh
|
||||||
rm -rf sherpa-onnx-*
|
rm -rf sherpa-onnx-*
|
||||||
|
|
||||||
cd ../offline-tts
|
|
||||||
./run-aishell3.sh
|
|
||||||
./run-piper.sh
|
|
||||||
./run-hf-fanchen.sh
|
|
||||||
ls -lh
|
|
||||||
|
|
||||||
cd ../..
|
|
||||||
|
|
||||||
mkdir tts
|
|
||||||
|
|
||||||
cp dotnet-examples/offline-tts/*.wav ./tts
|
|
||||||
|
|||||||
54
.github/scripts/test-nodejs-npm.sh
vendored
54
.github/scripts/test-nodejs-npm.sh
vendored
@@ -9,6 +9,48 @@ git status
|
|||||||
ls -lh
|
ls -lh
|
||||||
ls -lh node_modules
|
ls -lh node_modules
|
||||||
|
|
||||||
|
# offline tts
|
||||||
|
#
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
|
||||||
|
tar xvf matcha-icefall-zh-baker.tar.bz2
|
||||||
|
rm matcha-icefall-zh-baker.tar.bz2
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
|
||||||
|
|
||||||
|
node ./test-offline-tts-matcha-zh.js
|
||||||
|
|
||||||
|
rm -rf matcha-icefall-zh-baker
|
||||||
|
rm hifigan_v2.onnx
|
||||||
|
|
||||||
|
echo "---"
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
|
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
|
rm matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
|
||||||
|
|
||||||
|
node ./test-offline-tts-matcha-en.js
|
||||||
|
|
||||||
|
rm -rf matcha-icefall-en_US-ljspeech
|
||||||
|
rm hifigan_v2.onnx
|
||||||
|
|
||||||
|
echo "---"
|
||||||
|
|
||||||
|
curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
|
||||||
|
tar xf vits-piper-en_US-amy-low.tar.bz2
|
||||||
|
node ./test-offline-tts-vits-en.js
|
||||||
|
rm -rf vits-piper-en_US-amy-low*
|
||||||
|
|
||||||
|
echo "---"
|
||||||
|
|
||||||
|
curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
||||||
|
tar xvf vits-icefall-zh-aishell3.tar.bz2
|
||||||
|
node ./test-offline-tts-vits-zh.js
|
||||||
|
rm -rf vits-icefall-zh-aishell3*
|
||||||
|
|
||||||
|
ls -lh *.wav
|
||||||
|
|
||||||
echo '-----speaker diarization----------'
|
echo '-----speaker diarization----------'
|
||||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
@@ -147,15 +189,3 @@ tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
|||||||
rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||||
node ./test-online-zipformer2-ctc-hlg.js
|
node ./test-online-zipformer2-ctc-hlg.js
|
||||||
rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18
|
rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18
|
||||||
|
|
||||||
# offline tts
|
|
||||||
|
|
||||||
curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
|
|
||||||
tar xf vits-piper-en_US-amy-low.tar.bz2
|
|
||||||
node ./test-offline-tts-en.js
|
|
||||||
rm -rf vits-piper-en_US-amy-low*
|
|
||||||
|
|
||||||
curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
|
||||||
tar xvf vits-icefall-zh-aishell3.tar.bz2
|
|
||||||
node ./test-offline-tts-zh.js
|
|
||||||
rm -rf vits-icefall-zh-aishell3*
|
|
||||||
|
|||||||
44
.github/workflows/test-dot-net.yaml
vendored
44
.github/workflows/test-dot-net.yaml
vendored
@@ -92,6 +92,50 @@ jobs:
|
|||||||
python-version: ["3.8"]
|
python-version: ["3.8"]
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
|
- name: Check space
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
df -h
|
||||||
|
|
||||||
|
- name: Free space
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
df -h
|
||||||
|
rm -rf /opt/hostedtoolcache
|
||||||
|
df -h
|
||||||
|
|
||||||
|
- name: Free more space
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
# https://github.com/orgs/community/discussions/25678
|
||||||
|
cd /opt
|
||||||
|
find . -maxdepth 1 -mindepth 1 '!' -path ./containerd '!' -path ./actionarchivecache '!' -path ./runner '!' -path ./runner-cache -exec rm -rf '{}' ';'
|
||||||
|
|
||||||
|
sudo rm -rf /usr/share/dotnet
|
||||||
|
sudo rm -rf "/usr/local/share/boost"
|
||||||
|
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
|
||||||
|
|
||||||
|
- name: Free Disk Space (Ubuntu)
|
||||||
|
uses: jlumbroso/free-disk-space@main
|
||||||
|
with:
|
||||||
|
# this might remove tools that are actually needed,
|
||||||
|
# if set to "true" but frees about 6 GB
|
||||||
|
tool-cache: false
|
||||||
|
|
||||||
|
# all of these default to true, but feel free to set to
|
||||||
|
# "false" if necessary for your workflow
|
||||||
|
android: true
|
||||||
|
dotnet: false
|
||||||
|
haskell: true
|
||||||
|
large-packages: true
|
||||||
|
docker-images: false
|
||||||
|
swap-storage: true
|
||||||
|
|
||||||
|
- name: Check space
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
df -h
|
||||||
|
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|||||||
@@ -21,48 +21,56 @@ class OfflineTtsPlayDemo
|
|||||||
{
|
{
|
||||||
class Options
|
class Options
|
||||||
{
|
{
|
||||||
|
|
||||||
[Option("tts-rule-fsts", Required = false, Default = "", HelpText = "path to rule.fst")]
|
[Option("tts-rule-fsts", Required = false, Default = "", HelpText = "path to rule.fst")]
|
||||||
public string? RuleFsts { get; set; }
|
public string RuleFsts { get; set; } = string.Empty;
|
||||||
|
|
||||||
[Option("vits-dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]
|
[Option("tts-rule-fars", Required = false, Default = "", HelpText = "path to rule.far")]
|
||||||
public string? DictDir { get; set; }
|
public string RuleFars { get; set; } = string.Empty;
|
||||||
|
|
||||||
[Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
|
[Option("dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]
|
||||||
public string? DataDir { get; set; }
|
public string DictDir { get; set; } = string.Empty;
|
||||||
|
|
||||||
[Option("vits-length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
|
[Option("data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
|
||||||
public float LengthScale { get; set; }
|
public string DataDir { get; set; } = string.Empty;
|
||||||
|
|
||||||
[Option("vits-noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS models")]
|
[Option("length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
|
||||||
public float NoiseScale { get; set; }
|
public float LengthScale { get; set; } = 1;
|
||||||
|
|
||||||
[Option("vits-noise-scale-w", Required = false, Default = 0.8f, HelpText = "noise_scale_w for VITS models")]
|
[Option("noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS or Matcha models")]
|
||||||
public float NoiseScaleW { get; set; }
|
public float NoiseScale { get; set; } = 0.667F;
|
||||||
|
|
||||||
[Option("vits-lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
|
[Option("vits-noise-scale-w", Required = false, Default = 0.8F, HelpText = "noise_scale_w for VITS models")]
|
||||||
public string? Lexicon { get; set; }
|
public float NoiseScaleW { get; set; } = 0.8F;
|
||||||
|
|
||||||
[Option("vits-tokens", Required = false, Default = "", HelpText = "Path to tokens.txt")]
|
[Option("lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
|
||||||
public string? Tokens { get; set; }
|
public string Lexicon { get; set; } = string.Empty;
|
||||||
|
|
||||||
|
[Option("tokens", Required = true, Default = "", HelpText = "Path to tokens.txt")]
|
||||||
|
public string Tokens { get; set; } = string.Empty;
|
||||||
|
|
||||||
[Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")]
|
[Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")]
|
||||||
public int MaxNumSentences { get; set; }
|
public int MaxNumSentences { get; set; } = 1;
|
||||||
|
|
||||||
[Option(Required = false, Default = 0, HelpText = "1 to show debug messages.")]
|
[Option(Required = false, Default = 0, HelpText = "1 to show debug messages.")]
|
||||||
public int Debug { get; set; }
|
public int Debug { get; set; } = 0;
|
||||||
|
|
||||||
[Option("vits-model", Required = true, HelpText = "Path to VITS model")]
|
[Option("vits-model", Required = false, HelpText = "Path to VITS model")]
|
||||||
public string? Model { get; set; }
|
public string Model { get; set; } = string.Empty;
|
||||||
|
|
||||||
|
[Option("matcha-acoustic-model", Required = false, HelpText = "Path to the acoustic model of Matcha")]
|
||||||
|
public string AcousticModel { get; set; } = "";
|
||||||
|
|
||||||
|
[Option("matcha-vocoder", Required = false, HelpText = "Path to the vocoder model of Matcha")]
|
||||||
|
public string Vocoder { get; set; } = "";
|
||||||
|
|
||||||
[Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")]
|
[Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")]
|
||||||
public int SpeakerId { get; set; }
|
public int SpeakerId { get; set; } = 0;
|
||||||
|
|
||||||
[Option("text", Required = true, HelpText = "Text to synthesize")]
|
[Option("text", Required = true, HelpText = "Text to synthesize")]
|
||||||
public string? Text { get; set; }
|
public string Text { get; set; } = string.Empty;
|
||||||
|
|
||||||
[Option("output-filename", Required = true, Default = "./generated.wav", HelpText = "Path to save the generated audio")]
|
[Option("output-filename", Required = true, Default = "./generated.wav", HelpText = "Path to save the generated audio")]
|
||||||
public string? OutputFilename { get; set; }
|
public string OutputFilename { get; set; } = "./generated.wav";
|
||||||
}
|
}
|
||||||
|
|
||||||
static void Main(string[] args)
|
static void Main(string[] args)
|
||||||
@@ -78,6 +86,42 @@ class OfflineTtsPlayDemo
|
|||||||
private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
|
private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
|
||||||
{
|
{
|
||||||
string usage = @"
|
string usage = @"
|
||||||
|
# matcha-icefall-zh-baker
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
|
||||||
|
tar xvf matcha-icefall-zh-baker.tar.bz2
|
||||||
|
rm matcha-icefall-zh-baker.tar.bz2
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
|
||||||
|
|
||||||
|
dotnet run \
|
||||||
|
--matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
|
||||||
|
--matcha-vocoder=./hifigan_v2.onnx \
|
||||||
|
--lexicon=./matcha-icefall-zh-baker/lexicon.txt \
|
||||||
|
--tokens=./matcha-icefall-zh-baker/tokens.txt \
|
||||||
|
--dict-dir=./matcha-icefall-zh-baker/dict \
|
||||||
|
--tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
|
||||||
|
--debug=1 \
|
||||||
|
--output-filename=./matcha-zh.wav \
|
||||||
|
--text='某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。'
|
||||||
|
|
||||||
|
# matcha-icefall-en_US-ljspeech
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
|
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
|
rm matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
|
||||||
|
|
||||||
|
dotnet run \
|
||||||
|
--matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
|
||||||
|
--matcha-vocoder=./hifigan_v2.onnx \
|
||||||
|
--tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
|
||||||
|
--data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
|
||||||
|
--debug=1 \
|
||||||
|
--output-filename=./matcha-en.wav \
|
||||||
|
--text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
|
||||||
|
|
||||||
# vits-aishell3
|
# vits-aishell3
|
||||||
|
|
||||||
wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
|
wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
|
||||||
@@ -85,8 +129,8 @@ tar xf vits-zh-aishell3.tar.bz2
|
|||||||
|
|
||||||
dotnet run \
|
dotnet run \
|
||||||
--vits-model=./vits-zh-aishell3/vits-aishell3.onnx \
|
--vits-model=./vits-zh-aishell3/vits-aishell3.onnx \
|
||||||
--vits-tokens=./vits-zh-aishell3/tokens.txt \
|
--tokens=./vits-zh-aishell3/tokens.txt \
|
||||||
--vits-lexicon=./vits-zh-aishell3/lexicon.txt \
|
--lexicon=./vits-zh-aishell3/lexicon.txt \
|
||||||
--tts-rule-fsts=./vits-zh-aishell3/rule.fst \
|
--tts-rule-fsts=./vits-zh-aishell3/rule.fst \
|
||||||
--sid=66 \
|
--sid=66 \
|
||||||
--debug=1 \
|
--debug=1 \
|
||||||
@@ -100,8 +144,8 @@ tar xf vits-piper-en_US-amy-low.tar.bz2
|
|||||||
|
|
||||||
dotnet run \
|
dotnet run \
|
||||||
--vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
|
--vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
|
||||||
--vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
|
--tokens=./vits-piper-en_US-amy-low/tokens.txt \
|
||||||
--vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
|
--data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
|
||||||
--debug=1 \
|
--debug=1 \
|
||||||
--output-filename=./amy.wav \
|
--output-filename=./amy.wav \
|
||||||
--text='This is a text to speech application in dotnet with Next Generation Kaldi'
|
--text='This is a text to speech application in dotnet with Next Generation Kaldi'
|
||||||
@@ -124,6 +168,7 @@ to download more models.
|
|||||||
private static void Run(Options options)
|
private static void Run(Options options)
|
||||||
{
|
{
|
||||||
var config = new OfflineTtsConfig();
|
var config = new OfflineTtsConfig();
|
||||||
|
|
||||||
config.Model.Vits.Model = options.Model;
|
config.Model.Vits.Model = options.Model;
|
||||||
config.Model.Vits.Lexicon = options.Lexicon;
|
config.Model.Vits.Lexicon = options.Lexicon;
|
||||||
config.Model.Vits.Tokens = options.Tokens;
|
config.Model.Vits.Tokens = options.Tokens;
|
||||||
@@ -132,6 +177,16 @@ to download more models.
|
|||||||
config.Model.Vits.NoiseScale = options.NoiseScale;
|
config.Model.Vits.NoiseScale = options.NoiseScale;
|
||||||
config.Model.Vits.NoiseScaleW = options.NoiseScaleW;
|
config.Model.Vits.NoiseScaleW = options.NoiseScaleW;
|
||||||
config.Model.Vits.LengthScale = options.LengthScale;
|
config.Model.Vits.LengthScale = options.LengthScale;
|
||||||
|
|
||||||
|
config.Model.Matcha.AcousticModel = options.AcousticModel;
|
||||||
|
config.Model.Matcha.Vocoder = options.Vocoder;
|
||||||
|
config.Model.Matcha.Lexicon = options.Lexicon;
|
||||||
|
config.Model.Matcha.Tokens = options.Tokens;
|
||||||
|
config.Model.Matcha.DataDir = options.DataDir;
|
||||||
|
config.Model.Matcha.DictDir = options.DictDir;
|
||||||
|
config.Model.Matcha.NoiseScale = options.NoiseScale;
|
||||||
|
config.Model.Matcha.LengthScale = options.LengthScale;
|
||||||
|
|
||||||
config.Model.NumThreads = 1;
|
config.Model.NumThreads = 1;
|
||||||
config.Model.Debug = options.Debug;
|
config.Model.Debug = options.Debug;
|
||||||
config.Model.Provider = "cpu";
|
config.Model.Provider = "cpu";
|
||||||
|
|||||||
@@ -8,8 +8,8 @@ fi
|
|||||||
|
|
||||||
dotnet run \
|
dotnet run \
|
||||||
--vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \
|
--vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \
|
||||||
--vits-tokens=./vits-zh-hf-fanchen-C/tokens.txt \
|
--tokens=./vits-zh-hf-fanchen-C/tokens.txt \
|
||||||
--vits-lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \
|
--lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \
|
||||||
--tts-rule-fsts=./vits-zh-hf-fanchen-C/phone.fst,./vits-zh-hf-fanchen-C/date.fst,./vits-zh-hf-fanchen-C/number.fst \
|
--tts-rule-fsts=./vits-zh-hf-fanchen-C/phone.fst,./vits-zh-hf-fanchen-C/date.fst,./vits-zh-hf-fanchen-C/number.fst \
|
||||||
--vits-dict-dir=./vits-zh-hf-fanchen-C/dict \
|
--dict-dir=./vits-zh-hf-fanchen-C/dict \
|
||||||
--sid=100 \
|
--sid=100 \
|
||||||
|
|||||||
26
dotnet-examples/offline-tts-play/run-matcha-en.sh
Executable file
26
dotnet-examples/offline-tts-play/run-matcha-en.sh
Executable file
@@ -0,0 +1,26 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
|
||||||
|
# please visit
|
||||||
|
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
|
||||||
|
# matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
|
||||||
|
# to download more models
|
||||||
|
if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
|
tar xf matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
|
rm matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ! -f ./hifigan_v2.onnx ]; then
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
|
||||||
|
fi
|
||||||
|
|
||||||
|
dotnet run \
|
||||||
|
--matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
|
||||||
|
--matcha-vocoder=./hifigan_v2.onnx \
|
||||||
|
--tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
|
||||||
|
--data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
|
||||||
|
--debug=1 \
|
||||||
|
--output-filename=./matcha-en.wav \
|
||||||
|
--text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
|
||||||
27
dotnet-examples/offline-tts-play/run-matcha-zh.sh
Executable file
27
dotnet-examples/offline-tts-play/run-matcha-zh.sh
Executable file
@@ -0,0 +1,27 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# please visit
|
||||||
|
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
|
||||||
|
# to download more models
|
||||||
|
if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
|
||||||
|
tar xvf matcha-icefall-zh-baker.tar.bz2
|
||||||
|
rm matcha-icefall-zh-baker.tar.bz2
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ! -f ./hifigan_v2.onnx ]; then
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
dotnet run \
|
||||||
|
--matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
|
||||||
|
--matcha-vocoder=./hifigan_v2.onnx \
|
||||||
|
--lexicon=./matcha-icefall-zh-baker/lexicon.txt \
|
||||||
|
--tokens=./matcha-icefall-zh-baker/tokens.txt \
|
||||||
|
--dict-dir=./matcha-icefall-zh-baker/dict \
|
||||||
|
--tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
|
||||||
|
--debug=1 \
|
||||||
|
--output-filename=./matcha-zh.wav \
|
||||||
|
--text="某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
|
||||||
@@ -9,8 +9,8 @@ fi
|
|||||||
|
|
||||||
dotnet run \
|
dotnet run \
|
||||||
--vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
|
--vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
|
||||||
--vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
|
--tokens=./vits-piper-en_US-amy-low/tokens.txt \
|
||||||
--vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
|
--data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
|
||||||
--debug=1 \
|
--debug=1 \
|
||||||
--output-filename=./amy.wav \
|
--output-filename=./amy.wav \
|
||||||
--text="This is a text to speech application in dotnet with Next Generation Kaldi"
|
--text="This is a text to speech application in dotnet with Next Generation Kaldi"
|
||||||
|
|||||||
@@ -20,25 +20,25 @@ class OfflineTtsDemo
|
|||||||
[Option("tts-rule-fars", Required = false, Default = "", HelpText = "path to rule.far")]
|
[Option("tts-rule-fars", Required = false, Default = "", HelpText = "path to rule.far")]
|
||||||
public string RuleFars { get; set; } = string.Empty;
|
public string RuleFars { get; set; } = string.Empty;
|
||||||
|
|
||||||
[Option("vits-dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]
|
[Option("dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]
|
||||||
public string DictDir { get; set; } = string.Empty;
|
public string DictDir { get; set; } = string.Empty;
|
||||||
|
|
||||||
[Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
|
[Option("data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
|
||||||
public string DataDir { get; set; } = string.Empty;
|
public string DataDir { get; set; } = string.Empty;
|
||||||
|
|
||||||
[Option("vits-length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
|
[Option("length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
|
||||||
public float LengthScale { get; set; } = 1;
|
public float LengthScale { get; set; } = 1;
|
||||||
|
|
||||||
[Option("vits-noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS models")]
|
[Option("noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS or Matcha models")]
|
||||||
public float NoiseScale { get; set; } = 0.667F;
|
public float NoiseScale { get; set; } = 0.667F;
|
||||||
|
|
||||||
[Option("vits-noise-scale-w", Required = false, Default = 0.8F, HelpText = "noise_scale_w for VITS models")]
|
[Option("vits-noise-scale-w", Required = false, Default = 0.8F, HelpText = "noise_scale_w for VITS models")]
|
||||||
public float NoiseScaleW { get; set; } = 0.8F;
|
public float NoiseScaleW { get; set; } = 0.8F;
|
||||||
|
|
||||||
[Option("vits-lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
|
[Option("lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
|
||||||
public string Lexicon { get; set; } = string.Empty;
|
public string Lexicon { get; set; } = string.Empty;
|
||||||
|
|
||||||
[Option("vits-tokens", Required = false, Default = "", HelpText = "Path to tokens.txt")]
|
[Option("tokens", Required = true, Default = "", HelpText = "Path to tokens.txt")]
|
||||||
public string Tokens { get; set; } = string.Empty;
|
public string Tokens { get; set; } = string.Empty;
|
||||||
|
|
||||||
[Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")]
|
[Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")]
|
||||||
@@ -47,9 +47,15 @@ class OfflineTtsDemo
|
|||||||
[Option(Required = false, Default = 0, HelpText = "1 to show debug messages.")]
|
[Option(Required = false, Default = 0, HelpText = "1 to show debug messages.")]
|
||||||
public int Debug { get; set; } = 0;
|
public int Debug { get; set; } = 0;
|
||||||
|
|
||||||
[Option("vits-model", Required = true, HelpText = "Path to VITS model")]
|
[Option("vits-model", Required = false, HelpText = "Path to VITS model")]
|
||||||
public string Model { get; set; } = string.Empty;
|
public string Model { get; set; } = string.Empty;
|
||||||
|
|
||||||
|
[Option("matcha-acoustic-model", Required = false, HelpText = "Path to the acoustic model of Matcha")]
|
||||||
|
public string AcousticModel { get; set; } = "";
|
||||||
|
|
||||||
|
[Option("matcha-vocoder", Required = false, HelpText = "Path to the vocoder model of Matcha")]
|
||||||
|
public string Vocoder { get; set; } = "";
|
||||||
|
|
||||||
[Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")]
|
[Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")]
|
||||||
public int SpeakerId { get; set; } = 0;
|
public int SpeakerId { get; set; } = 0;
|
||||||
|
|
||||||
@@ -73,6 +79,42 @@ class OfflineTtsDemo
|
|||||||
private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
|
private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
|
||||||
{
|
{
|
||||||
var usage = @"
|
var usage = @"
|
||||||
|
# matcha-icefall-zh-baker
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
|
||||||
|
tar xvf matcha-icefall-zh-baker.tar.bz2
|
||||||
|
rm matcha-icefall-zh-baker.tar.bz2
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
|
||||||
|
|
||||||
|
dotnet run \
|
||||||
|
--matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
|
||||||
|
--matcha-vocoder=./hifigan_v2.onnx \
|
||||||
|
--lexicon=./matcha-icefall-zh-baker/lexicon.txt \
|
||||||
|
--tokens=./matcha-icefall-zh-baker/tokens.txt \
|
||||||
|
--dict-dir=./matcha-icefall-zh-baker/dict \
|
||||||
|
--tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
|
||||||
|
--debug=1 \
|
||||||
|
--output-filename=./matcha-zh.wav \
|
||||||
|
--text='某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。'
|
||||||
|
|
||||||
|
# matcha-icefall-en_US-ljspeech
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
|
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
|
rm matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
|
||||||
|
|
||||||
|
dotnet run \
|
||||||
|
--matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
|
||||||
|
--matcha-vocoder=./hifigan_v2.onnx \
|
||||||
|
--tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
|
||||||
|
--data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
|
||||||
|
--debug=1 \
|
||||||
|
--output-filename=./matcha-en.wav \
|
||||||
|
--text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
|
||||||
|
|
||||||
# vits-aishell3
|
# vits-aishell3
|
||||||
|
|
||||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
||||||
@@ -80,8 +122,8 @@ tar xvf vits-icefall-zh-aishell3.tar.bz2
|
|||||||
|
|
||||||
dotnet run \
|
dotnet run \
|
||||||
--vits-model=./vits-icefall-zh-aishell3/model.onnx \
|
--vits-model=./vits-icefall-zh-aishell3/model.onnx \
|
||||||
--vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
|
--tokens=./vits-icefall-zh-aishell3/tokens.txt \
|
||||||
--vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
|
--lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
|
||||||
--tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
|
--tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
|
||||||
--tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \
|
--tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \
|
||||||
--sid=66 \
|
--sid=66 \
|
||||||
@@ -96,8 +138,8 @@ tar xf vits-piper-en_US-amy-low.tar.bz2
|
|||||||
|
|
||||||
dotnet run \
|
dotnet run \
|
||||||
--vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
|
--vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
|
||||||
--vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
|
--tokens=./vits-piper-en_US-amy-low/tokens.txt \
|
||||||
--vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
|
--data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
|
||||||
--debug=1 \
|
--debug=1 \
|
||||||
--output-filename=./amy.wav \
|
--output-filename=./amy.wav \
|
||||||
--text='This is a text to speech application in dotnet with Next Generation Kaldi'
|
--text='This is a text to speech application in dotnet with Next Generation Kaldi'
|
||||||
@@ -128,6 +170,16 @@ to download more models.
|
|||||||
config.Model.Vits.NoiseScale = options.NoiseScale;
|
config.Model.Vits.NoiseScale = options.NoiseScale;
|
||||||
config.Model.Vits.NoiseScaleW = options.NoiseScaleW;
|
config.Model.Vits.NoiseScaleW = options.NoiseScaleW;
|
||||||
config.Model.Vits.LengthScale = options.LengthScale;
|
config.Model.Vits.LengthScale = options.LengthScale;
|
||||||
|
|
||||||
|
config.Model.Matcha.AcousticModel = options.AcousticModel;
|
||||||
|
config.Model.Matcha.Vocoder = options.Vocoder;
|
||||||
|
config.Model.Matcha.Lexicon = options.Lexicon;
|
||||||
|
config.Model.Matcha.Tokens = options.Tokens;
|
||||||
|
config.Model.Matcha.DataDir = options.DataDir;
|
||||||
|
config.Model.Matcha.DictDir = options.DictDir;
|
||||||
|
config.Model.Matcha.NoiseScale = options.NoiseScale;
|
||||||
|
config.Model.Matcha.LengthScale = options.LengthScale;
|
||||||
|
|
||||||
config.Model.NumThreads = 1;
|
config.Model.NumThreads = 1;
|
||||||
config.Model.Debug = options.Debug;
|
config.Model.Debug = options.Debug;
|
||||||
config.Model.Provider = "cpu";
|
config.Model.Provider = "cpu";
|
||||||
|
|||||||
@@ -8,8 +8,8 @@ fi
|
|||||||
|
|
||||||
dotnet run \
|
dotnet run \
|
||||||
--vits-model=./vits-icefall-zh-aishell3/model.onnx \
|
--vits-model=./vits-icefall-zh-aishell3/model.onnx \
|
||||||
--vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
|
--tokens=./vits-icefall-zh-aishell3/tokens.txt \
|
||||||
--vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
|
--lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
|
||||||
--tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
|
--tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
|
||||||
--tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \
|
--tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \
|
||||||
--sid=66 \
|
--sid=66 \
|
||||||
|
|||||||
@@ -8,10 +8,10 @@ fi
|
|||||||
|
|
||||||
dotnet run \
|
dotnet run \
|
||||||
--vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \
|
--vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \
|
||||||
--vits-tokens=./vits-zh-hf-fanchen-C/tokens.txt \
|
--tokens=./vits-zh-hf-fanchen-C/tokens.txt \
|
||||||
--vits-lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \
|
--lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \
|
||||||
--tts-rule-fsts=./vits-zh-hf-fanchen-C/phone.fst,./vits-zh-hf-fanchen-C/date.fst,./vits-zh-hf-fanchen-C/number.fst \
|
--tts-rule-fsts=./vits-zh-hf-fanchen-C/phone.fst,./vits-zh-hf-fanchen-C/date.fst,./vits-zh-hf-fanchen-C/number.fst \
|
||||||
--vits-dict-dir=./vits-zh-hf-fanchen-C/dict \
|
--dict-dir=./vits-zh-hf-fanchen-C/dict \
|
||||||
--sid=100 \
|
--sid=100 \
|
||||||
--debug=1 \
|
--debug=1 \
|
||||||
--output-filename=./fanchen-100.wav \
|
--output-filename=./fanchen-100.wav \
|
||||||
|
|||||||
26
dotnet-examples/offline-tts/run-matcha-en.sh
Executable file
26
dotnet-examples/offline-tts/run-matcha-en.sh
Executable file
@@ -0,0 +1,26 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Runs the dotnet offline-tts example with an English MatchaTTS model.
#
# The acoustic model archive and the HiFi-GAN vocoder are downloaded into the
# current directory on first use; later runs reuse the files already on disk.
#
# More pre-trained models are listed at
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker

set -ex

acoustic_model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx
vocoder=./hifigan_v2.onnx

# Fetch and unpack the acoustic model only when it is not present yet.
if [[ ! -f "$acoustic_model" ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  tar xf matcha-icefall-en_US-ljspeech.tar.bz2
  rm matcha-icefall-en_US-ljspeech.tar.bz2
fi

# The vocoder is a single file, so no extraction step is needed.
if [[ ! -f "$vocoder" ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
fi

# Synthesize the sentence below and write the result to ./matcha-en.wav.
dotnet run \
  --matcha-acoustic-model="$acoustic_model" \
  --matcha-vocoder="$vocoder" \
  --tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
  --data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
  --debug=1 \
  --output-filename=./matcha-en.wav \
  --text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
|
||||||
27
dotnet-examples/offline-tts/run-matcha-zh.sh
Executable file
27
dotnet-examples/offline-tts/run-matcha-zh.sh
Executable file
@@ -0,0 +1,27 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Runs the dotnet offline-tts example with a Chinese MatchaTTS model.
#
# The acoustic model archive and the HiFi-GAN vocoder are downloaded into the
# current directory on first use; later runs reuse the files already on disk.
#
# More pre-trained models are listed at
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker

set -ex

acoustic_model=./matcha-icefall-zh-baker/model-steps-3.onnx
vocoder=./hifigan_v2.onnx

# Fetch and unpack the acoustic model only when it is not present yet.
if [[ ! -f "$acoustic_model" ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  tar xvf matcha-icefall-zh-baker.tar.bz2
  rm matcha-icefall-zh-baker.tar.bz2
fi

# The vocoder is a single file, so no extraction step is needed.
if [[ ! -f "$vocoder" ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
fi

# Synthesize the sentence below and write the result to ./matcha-zh.wav.
# The rule FSTs shipped with the model are passed via --tts-rule-fsts.
dotnet run \
  --matcha-acoustic-model="$acoustic_model" \
  --matcha-vocoder="$vocoder" \
  --lexicon=./matcha-icefall-zh-baker/lexicon.txt \
  --tokens=./matcha-icefall-zh-baker/tokens.txt \
  --dict-dir=./matcha-icefall-zh-baker/dict \
  --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
  --debug=1 \
  --output-filename=./matcha-zh.wav \
  --text="某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
|
||||||
@@ -10,8 +10,8 @@ fi
|
|||||||
|
|
||||||
dotnet run \
|
dotnet run \
|
||||||
--vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
|
--vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
|
||||||
--vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
|
--tokens=./vits-piper-en_US-amy-low/tokens.txt \
|
||||||
--vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
|
--data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
|
||||||
--debug=1 \
|
--debug=1 \
|
||||||
--output-filename=./amy.wav \
|
--output-filename=./amy.wav \
|
||||||
--text="This is a text to speech application in dotnet with Next Generation Kaldi"
|
--text="This is a text to speech application in dotnet with Next Generation Kaldi"
|
||||||
|
|||||||
@@ -42,9 +42,45 @@ node ./test-offline-speaker-diarization.js
|
|||||||
|
|
||||||
In the following, we demonstrate how to run text-to-speech.
|
In the following, we demonstrate how to run text-to-speech.
|
||||||
|
|
||||||
## ./test-offline-tts-en.js
|
## ./test-offline-tts-matcha-zh.js
|
||||||
|
|
||||||
[./test-offline-tts-en.js](./test-offline-tts-en.js) shows how to use
|
[./test-offline-tts-matcha-zh.js](./test-offline-tts-matcha-zh.js) shows how to use
|
||||||
|
[matcha-icefall-zh-baker](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker)
|
||||||
|
for text-to-speech.
|
||||||
|
|
||||||
|
You can use the following command to run it:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
|
||||||
|
tar xvf matcha-icefall-zh-baker.tar.bz2
|
||||||
|
rm matcha-icefall-zh-baker.tar.bz2
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
|
||||||
|
|
||||||
|
node ./test-offline-tts-matcha-zh.js
|
||||||
|
```
|
||||||
|
|
||||||
|
## ./test-offline-tts-matcha-en.js
|
||||||
|
|
||||||
|
[./test-offline-tts-matcha-en.js](./test-offline-tts-matcha-en.js) shows how to use
|
||||||
|
[matcha-icefall-en_US-ljspeech](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker)
|
||||||
|
for text-to-speech.
|
||||||
|
|
||||||
|
You can use the following command to run it:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
|
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
|
rm matcha-icefall-en_US-ljspeech.tar.bz2
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
|
||||||
|
|
||||||
|
node ./test-offline-tts-matcha-en.js
|
||||||
|
```
|
||||||
|
|
||||||
|
## ./test-offline-tts-vits-en.js
|
||||||
|
|
||||||
|
[./test-offline-tts-vits-en.js](./test-offline-tts-vits-en.js) shows how to use
|
||||||
[vits-piper-en_US-amy-low.tar.bz2](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2)
|
[vits-piper-en_US-amy-low.tar.bz2](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2)
|
||||||
for text-to-speech.
|
for text-to-speech.
|
||||||
|
|
||||||
@@ -53,12 +89,12 @@ You can use the following command to run it:
|
|||||||
```bash
|
```bash
|
||||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
|
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
|
||||||
tar xvf vits-piper-en_US-amy-low.tar.bz2
|
tar xvf vits-piper-en_US-amy-low.tar.bz2
|
||||||
node ./test-offline-tts-en.js
|
node ./test-offline-tts-vits-en.js
|
||||||
```
|
```
|
||||||
|
|
||||||
## ./test-offline-tts-zh.js
|
## ./test-offline-tts-vits-zh.js
|
||||||
|
|
||||||
[./test-offline-tts-zh.js](./test-offline-tts-zh.js) shows how to use
|
[./test-offline-tts-vits-zh.js](./test-offline-tts-vits-zh.js) shows how to use
|
||||||
a VITS pretrained model
|
a VITS pretrained model
|
||||||
[aishell3](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-model-aishell3)
|
[aishell3](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-model-aishell3)
|
||||||
for text-to-speech.
|
for text-to-speech.
|
||||||
@@ -68,7 +104,7 @@ You can use the following command to run it:
|
|||||||
```bash
|
```bash
|
||||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
||||||
tar xvf vits-icefall-zh-aishell3.tar.bz2
|
tar xvf vits-icefall-zh-aishell3.tar.bz2
|
||||||
node ./test-offline-tts-zh.js
|
node ./test-offline-tts-vits-zh.js
|
||||||
```
|
```
|
||||||
|
|
||||||
# Speech-to-text
|
# Speech-to-text
|
||||||
|
|||||||
40
nodejs-examples/test-offline-tts-matcha-en.js
Normal file
40
nodejs-examples/test-offline-tts-matcha-en.js
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)

const sherpa_onnx = require('sherpa-onnx');

/**
 * Creates an offline TTS object configured for the English MatchaTTS model
 * (matcha-icefall-en_US-ljspeech) together with the hifigan_v2 vocoder.
 *
 * All model files are expected in the current working directory.
 *
 * @returns the object returned by sherpa_onnx.createOfflineTts().
 */
function createOfflineTts() {
  return sherpa_onnx.createOfflineTts({
    offlineTtsModelConfig: {
      offlineTtsMatchaModelConfig: {
        acousticModel: './matcha-icefall-en_US-ljspeech/model-steps-3.onnx',
        vocoder: './hifigan_v2.onnx',
        lexicon: './matcha-icefall-en_US-ljspeech/lexicon.txt',
        tokens: './matcha-icefall-en_US-ljspeech/tokens.txt',
        dataDir: './matcha-icefall-en_US-ljspeech/espeak-ng-data',

        noiseScale: 0.667,
        lengthScale: 1.0,
      },
      numThreads: 1,
      debug: 1,
      provider: 'cpu',
    },
    maxNumSentences: 1,
  });
}

const tts = createOfflineTts();

// Generation parameters passed to tts.generate() as `sid` and `speed`.
const speakerId = 0;
const speed = 1.0;
const text =
    'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

const audio = tts.generate({text, sid: speakerId, speed});
tts.save('./test-matcha-en.wav', audio);
console.log('Saved to test-matcha-en.wav successfully.');

// Release the TTS object once the audio has been saved.
tts.free();
|
||||||
41
nodejs-examples/test-offline-tts-matcha-zh.js
Normal file
41
nodejs-examples/test-offline-tts-matcha-zh.js
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)

const sherpa_onnx = require('sherpa-onnx');

/**
 * Creates an offline TTS object configured for the Chinese MatchaTTS model
 * (matcha-icefall-zh-baker) together with the hifigan_v2 vocoder.
 *
 * All model files are expected in the current working directory. The rule
 * FSTs shipped with the model are passed through `ruleFsts`.
 *
 * @returns the object returned by sherpa_onnx.createOfflineTts().
 */
function createOfflineTts() {
  return sherpa_onnx.createOfflineTts({
    offlineTtsModelConfig: {
      offlineTtsMatchaModelConfig: {
        acousticModel: './matcha-icefall-zh-baker/model-steps-3.onnx',
        vocoder: './hifigan_v2.onnx',
        lexicon: './matcha-icefall-zh-baker/lexicon.txt',
        tokens: './matcha-icefall-zh-baker/tokens.txt',
        dictDir: './matcha-icefall-zh-baker/dict',
        noiseScale: 0.667,
        lengthScale: 1.0,
      },
      numThreads: 1,
      debug: 1,
      provider: 'cpu',
    },
    maxNumSentences: 1,
    ruleFsts:
        './matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst',
  });
}

const tts = createOfflineTts();

// Generation parameters passed to tts.generate() as `sid` and `speed`.
const speakerId = 0;
const speed = 1.0;
const text =
    '当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔. 某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。';

const audio = tts.generate({text, sid: speakerId, speed});
tts.save('./test-matcha-zh.wav', audio);
console.log('Saved to test-matcha-zh.wav successfully.');

// Release the TTS object once the audio has been saved.
tts.free();
|
||||||
@@ -37,7 +37,7 @@ const audio = tts.generate({
|
|||||||
speed: speed
|
speed: speed
|
||||||
});
|
});
|
||||||
|
|
||||||
tts.save('./test-en.wav', audio);
|
tts.save('./test-vits-en.wav', audio);
|
||||||
console.log('Saved to test-en.wav successfully.');
|
console.log('Saved to test-vits-en.wav successfully.');
|
||||||
|
|
||||||
tts.free();
|
tts.free();
|
||||||
@@ -34,6 +34,6 @@ const speakerId = 66;
|
|||||||
const speed = 1.0;
|
const speed = 1.0;
|
||||||
const audio = tts.generate(
|
const audio = tts.generate(
|
||||||
{text: '3年前中国总人口是1411778724人', sid: speakerId, speed: speed});
|
{text: '3年前中国总人口是1411778724人', sid: speakerId, speed: speed});
|
||||||
tts.save('./test-zh.wav', audio);
|
tts.save('./test-vits-zh.wav', audio);
|
||||||
console.log('Saved to test-zh.wav successfully.');
|
console.log('Saved to test-vits-zh.wav successfully.');
|
||||||
tts.free();
|
tts.free();
|
||||||
44
scripts/dotnet/OfflineTtsMatchaModelConfig.cs
Normal file
44
scripts/dotnet/OfflineTtsMatchaModelConfig.cs
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
/// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)

using System.Runtime.InteropServices;

namespace SherpaOnnx
{
    /// <summary>
    /// Configuration of a MatchaTTS model, passed across the P/Invoke
    /// boundary as a sequential-layout struct.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the declaration order of the fields presumably has to
    /// mirror the native struct; do not reorder them without checking the
    /// C API definition.
    /// </remarks>
    [StructLayout(LayoutKind.Sequential)]
    public struct OfflineTtsMatchaModelConfig
    {
        /// <summary>
        /// Initializes every string field to the empty string and the two
        /// scales to their defaults (0.667 and 1.0).
        /// </summary>
        public OfflineTtsMatchaModelConfig()
        {
            // Assigned in declaration order.
            AcousticModel = "";
            Vocoder = "";
            Lexicon = "";
            Tokens = "";
            DataDir = "";
            NoiseScale = 0.667F;
            LengthScale = 1.0F;
            DictDir = "";
        }

        /// <summary>Acoustic model; empty by default.</summary>
        [MarshalAs(UnmanagedType.LPStr)]
        public string AcousticModel;

        /// <summary>Vocoder; empty by default.</summary>
        [MarshalAs(UnmanagedType.LPStr)]
        public string Vocoder;

        /// <summary>Lexicon; empty by default.</summary>
        [MarshalAs(UnmanagedType.LPStr)]
        public string Lexicon;

        /// <summary>Tokens; empty by default.</summary>
        [MarshalAs(UnmanagedType.LPStr)]
        public string Tokens;

        /// <summary>Data directory; empty by default.</summary>
        [MarshalAs(UnmanagedType.LPStr)]
        public string DataDir;

        /// <summary>Noise scale; defaults to 0.667.</summary>
        public float NoiseScale;

        /// <summary>Length scale; defaults to 1.0.</summary>
        public float LengthScale;

        /// <summary>Dictionary directory; empty by default.</summary>
        [MarshalAs(UnmanagedType.LPStr)]
        public string DictDir;
    }
}
|
||||||
@@ -11,6 +11,7 @@ namespace SherpaOnnx
|
|||||||
public OfflineTtsModelConfig()
|
public OfflineTtsModelConfig()
|
||||||
{
|
{
|
||||||
Vits = new OfflineTtsVitsModelConfig();
|
Vits = new OfflineTtsVitsModelConfig();
|
||||||
|
Matcha = new OfflineTtsMatchaModelConfig();
|
||||||
NumThreads = 1;
|
NumThreads = 1;
|
||||||
Debug = 0;
|
Debug = 0;
|
||||||
Provider = "cpu";
|
Provider = "cpu";
|
||||||
@@ -21,5 +22,7 @@ namespace SherpaOnnx
|
|||||||
public int Debug;
|
public int Debug;
|
||||||
[MarshalAs(UnmanagedType.LPStr)]
|
[MarshalAs(UnmanagedType.LPStr)]
|
||||||
public string Provider;
|
public string Provider;
|
||||||
|
|
||||||
|
public OfflineTtsMatchaModelConfig Matcha;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
<Project Sdk="Microsoft.NET.Sdk">
|
<Project Sdk="Microsoft.NET.Sdk">
|
||||||
|
|
||||||
<PropertyGroup>
|
<PropertyGroup>
|
||||||
<TargetFramework>.net6</TargetFramework>
|
<TargetFramework>net8.0</TargetFramework>
|
||||||
<RestoreSources>/tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json</RestoreSources>
|
<RestoreSources>/tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json</RestoreSources>
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
<PackageReadmeFile>README.md</PackageReadmeFile>
|
<PackageReadmeFile>README.md</PackageReadmeFile>
|
||||||
<OutputType>Library</OutputType>
|
<OutputType>Library</OutputType>
|
||||||
<LangVersion>10.0</LangVersion>
|
<LangVersion>10.0</LangVersion>
|
||||||
<TargetFrameworks>net6.0;net45;net40;net35;net20;netstandard2.0</TargetFrameworks>
|
<TargetFrameworks>net8.0;net7.0;net6.0;net45;net40;net35;net20;netstandard2.0</TargetFrameworks>
|
||||||
<RuntimeIdentifiers>linux-x64;linux-arm64;osx-x64;osx-arm64;win-x64;win-x86;win-arm64</RuntimeIdentifiers>
|
<RuntimeIdentifiers>linux-x64;linux-arm64;osx-x64;osx-arm64;win-x64;win-x86;win-arm64</RuntimeIdentifiers>
|
||||||
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
|
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
|
||||||
<AssemblyName>sherpa-onnx</AssemblyName>
|
<AssemblyName>sherpa-onnx</AssemblyName>
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
<PackageLicenseExpression>Apache-2.0</PackageLicenseExpression>
|
<PackageLicenseExpression>Apache-2.0</PackageLicenseExpression>
|
||||||
<PackageReadmeFile>README.md</PackageReadmeFile>
|
<PackageReadmeFile>README.md</PackageReadmeFile>
|
||||||
<OutputType>Library</OutputType>
|
<OutputType>Library</OutputType>
|
||||||
<TargetFrameworks>net6.0;net45;net40;net35;net20;netstandard2.0</TargetFrameworks>
|
<TargetFrameworks>net8.0;net7.0;net6.0;net45;net40;net35;net20;netstandard2.0</TargetFrameworks>
|
||||||
<RuntimeIdentifier>{{ dotnet_rid }}</RuntimeIdentifier>
|
<RuntimeIdentifier>{{ dotnet_rid }}</RuntimeIdentifier>
|
||||||
<AssemblyName>sherpa-onnx</AssemblyName>
|
<AssemblyName>sherpa-onnx</AssemblyName>
|
||||||
<Version>{{ version }}</Version>
|
<Version>{{ version }}</Version>
|
||||||
|
|||||||
@@ -8,6 +8,10 @@ function freeConfig(config, Module) {
|
|||||||
freeConfig(config.config, Module)
|
freeConfig(config.config, Module)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ('config2' in config) {
|
||||||
|
freeConfig(config.config2, Module)
|
||||||
|
}
|
||||||
|
|
||||||
Module._free(config.ptr);
|
Module._free(config.ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -66,11 +70,103 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Serializes a MatchaTTS model config into memory allocated through Module.
 *
 * Two allocations are made: `buffer` holds the six NUL-terminated UTF-8
 * strings back to back, and `ptr` holds the 32-byte (8 * 4) struct that
 * points into `buffer`:
 *
 *   ptr +  0  acousticModel (char*)
 *   ptr +  4  vocoder       (char*)
 *   ptr +  8  lexicon       (char*)
 *   ptr + 12  tokens        (char*)
 *   ptr + 16  dataDir       (char*)
 *   ptr + 20  noiseScale    (float)
 *   ptr + 24  lengthScale   (float)
 *   ptr + 28  dictDir       (char*)
 *
 * Every string field is optional; a missing or falsy value is written as the
 * empty string.
 *
 * @param config Object with optional acousticModel, vocoder, lexicon, tokens,
 *     dataDir, dictDir, noiseScale and lengthScale properties.
 * @param Module Object providing lengthBytesUTF8, stringToUTF8, _malloc and
 *     setValue.
 * @returns {{buffer: number, ptr: number, len: number}} The two allocations
 *     and the struct size in bytes.
 */
function initSherpaOnnxOfflineTtsMatchaModelConfig(config, Module) {
  // Normalize each string exactly once so the byte length that sizes the
  // buffer is always computed from the same value that is later copied.
  const acousticModel = config.acousticModel || '';
  const vocoder = config.vocoder || '';
  const lexicon = config.lexicon || '';
  const tokens = config.tokens || '';
  const dataDir = config.dataDir || '';
  const dictDir = config.dictDir || '';

  // +1 for the terminating NUL of each string.
  const acousticModelLen = Module.lengthBytesUTF8(acousticModel) + 1;
  const vocoderLen = Module.lengthBytesUTF8(vocoder) + 1;
  const lexiconLen = Module.lengthBytesUTF8(lexicon) + 1;
  const tokensLen = Module.lengthBytesUTF8(tokens) + 1;
  const dataDirLen = Module.lengthBytesUTF8(dataDir) + 1;
  const dictDirLen = Module.lengthBytesUTF8(dictDir) + 1;

  const n = acousticModelLen + vocoderLen + lexiconLen + tokensLen +
      dataDirLen + dictDirLen;

  const buffer = Module._malloc(n);

  const len = 8 * 4;
  const ptr = Module._malloc(len);

  // Pass 1: copy the strings into `buffer`, one after another.
  let offset = 0;
  Module.stringToUTF8(acousticModel, buffer + offset, acousticModelLen);
  offset += acousticModelLen;

  Module.stringToUTF8(vocoder, buffer + offset, vocoderLen);
  offset += vocoderLen;

  Module.stringToUTF8(lexicon, buffer + offset, lexiconLen);
  offset += lexiconLen;

  Module.stringToUTF8(tokens, buffer + offset, tokensLen);
  offset += tokensLen;

  Module.stringToUTF8(dataDir, buffer + offset, dataDirLen);
  offset += dataDirLen;

  Module.stringToUTF8(dictDir, buffer + offset, dictDirLen);
  offset += dictDirLen;

  // Pass 2: fill the struct, walking `buffer` in the same order as pass 1.
  offset = 0;
  Module.setValue(ptr, buffer + offset, 'i8*');
  offset += acousticModelLen;

  Module.setValue(ptr + 4, buffer + offset, 'i8*');
  offset += vocoderLen;

  Module.setValue(ptr + 8, buffer + offset, 'i8*');
  offset += lexiconLen;

  Module.setValue(ptr + 12, buffer + offset, 'i8*');
  offset += tokensLen;

  Module.setValue(ptr + 16, buffer + offset, 'i8*');
  offset += dataDirLen;

  // `||` means any falsy value, including an explicit 0, falls back to the
  // default.
  Module.setValue(ptr + 20, config.noiseScale || 0.667, 'float');
  Module.setValue(ptr + 24, config.lengthScale || 1.0, 'float');

  // dictDir is the last string in `buffer` but sits after the two floats in
  // the struct.
  Module.setValue(ptr + 28, buffer + offset, 'i8*');
  offset += dictDirLen;

  return {
    buffer: buffer, ptr: ptr, len: len,
  }
}
|
||||||
|
|
||||||
function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
|
function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
|
||||||
|
if (!('offlineTtsVitsModelConfig' in config)) {
|
||||||
|
config.offlineTtsVitsModelConfig = {
|
||||||
|
model: '',
|
||||||
|
lexicon: '',
|
||||||
|
tokens: '',
|
||||||
|
noiseScale: 0.667,
|
||||||
|
noiseScaleW: 0.8,
|
||||||
|
lengthScale: 1.0,
|
||||||
|
dataDir: '',
|
||||||
|
dictDir: '',
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!('offlineTtsMatchaModelConfig' in config)) {
|
||||||
|
config.offlineTtsMatchaModelConfig = {
|
||||||
|
acousticModel: '',
|
||||||
|
vocoder: '',
|
||||||
|
lexicon: '',
|
||||||
|
tokens: '',
|
||||||
|
noiseScale: 0.667,
|
||||||
|
lengthScale: 1.0,
|
||||||
|
dataDir: '',
|
||||||
|
dictDir: '',
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig(
|
const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig(
|
||||||
config.offlineTtsVitsModelConfig, Module);
|
config.offlineTtsVitsModelConfig, Module);
|
||||||
|
|
||||||
const len = vitsModelConfig.len + 3 * 4;
|
const matchaModelConfig = initSherpaOnnxOfflineTtsMatchaModelConfig(
|
||||||
|
config.offlineTtsMatchaModelConfig, Module);
|
||||||
|
|
||||||
|
const len = vitsModelConfig.len + matchaModelConfig.len + 3 * 4;
|
||||||
const ptr = Module._malloc(len);
|
const ptr = Module._malloc(len);
|
||||||
|
|
||||||
let offset = 0;
|
let offset = 0;
|
||||||
@@ -87,9 +183,14 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
|
|||||||
const buffer = Module._malloc(providerLen);
|
const buffer = Module._malloc(providerLen);
|
||||||
Module.stringToUTF8(config.provider, buffer, providerLen);
|
Module.stringToUTF8(config.provider, buffer, providerLen);
|
||||||
Module.setValue(ptr + offset, buffer, 'i8*');
|
Module.setValue(ptr + offset, buffer, 'i8*');
|
||||||
|
offset += 4;
|
||||||
|
|
||||||
|
Module._CopyHeap(matchaModelConfig.ptr, matchaModelConfig.len, ptr + offset);
|
||||||
|
offset += matchaModelConfig.len;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig,
|
buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig,
|
||||||
|
config2: matchaModelConfig
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -195,12 +296,26 @@ function createOfflineTts(Module, myConfig) {
|
|||||||
noiseScaleW: 0.8,
|
noiseScaleW: 0.8,
|
||||||
lengthScale: 1.0,
|
lengthScale: 1.0,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const offlineTtsMatchaModelConfig = {
|
||||||
|
acousticModel: '',
|
||||||
|
vocoder: '',
|
||||||
|
lexicon: '',
|
||||||
|
tokens: '',
|
||||||
|
dataDir: '',
|
||||||
|
dictDir: '',
|
||||||
|
noiseScale: 0.667,
|
||||||
|
lengthScale: 1.0,
|
||||||
|
};
|
||||||
|
|
||||||
const offlineTtsModelConfig = {
|
const offlineTtsModelConfig = {
|
||||||
offlineTtsVitsModelConfig: offlineTtsVitsModelConfig,
|
offlineTtsVitsModelConfig: offlineTtsVitsModelConfig,
|
||||||
|
offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig,
|
||||||
numThreads: 1,
|
numThreads: 1,
|
||||||
debug: 1,
|
debug: 1,
|
||||||
provider: 'cpu',
|
provider: 'cpu',
|
||||||
};
|
};
|
||||||
|
|
||||||
let offlineTtsConfig = {
|
let offlineTtsConfig = {
|
||||||
offlineTtsModelConfig: offlineTtsModelConfig,
|
offlineTtsModelConfig: offlineTtsModelConfig,
|
||||||
ruleFsts: '',
|
ruleFsts: '',
|
||||||
|
|||||||
@@ -14,8 +14,10 @@
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
||||||
static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, "");
|
static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, "");
|
||||||
|
static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, "");
|
||||||
static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
|
static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
|
||||||
sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + 3 * 4,
|
sizeof(SherpaOnnxOfflineTtsVitsModelConfig) +
|
||||||
|
sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) + 3 * 4,
|
||||||
"");
|
"");
|
||||||
static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
|
static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
|
||||||
sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4,
|
sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4,
|
||||||
@@ -24,6 +26,7 @@ static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
|
|||||||
void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
|
void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
|
||||||
auto tts_model_config = &tts_config->model;
|
auto tts_model_config = &tts_config->model;
|
||||||
auto vits_model_config = &tts_model_config->vits;
|
auto vits_model_config = &tts_model_config->vits;
|
||||||
|
auto matcha_model_config = &tts_model_config->matcha;
|
||||||
fprintf(stdout, "----------vits model config----------\n");
|
fprintf(stdout, "----------vits model config----------\n");
|
||||||
fprintf(stdout, "model: %s\n", vits_model_config->model);
|
fprintf(stdout, "model: %s\n", vits_model_config->model);
|
||||||
fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon);
|
fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon);
|
||||||
@@ -34,6 +37,16 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
|
|||||||
fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale);
|
fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale);
|
||||||
fprintf(stdout, "dict_dir: %s\n", vits_model_config->dict_dir);
|
fprintf(stdout, "dict_dir: %s\n", vits_model_config->dict_dir);
|
||||||
|
|
||||||
|
fprintf(stdout, "----------matcha model config----------\n");
|
||||||
|
fprintf(stdout, "acoustic_model: %s\n", matcha_model_config->acoustic_model);
|
||||||
|
fprintf(stdout, "vocoder: %s\n", matcha_model_config->vocoder);
|
||||||
|
fprintf(stdout, "lexicon: %s\n", matcha_model_config->lexicon);
|
||||||
|
fprintf(stdout, "tokens: %s\n", matcha_model_config->tokens);
|
||||||
|
fprintf(stdout, "data_dir: %s\n", matcha_model_config->data_dir);
|
||||||
|
fprintf(stdout, "noise scale: %.3f\n", matcha_model_config->noise_scale);
|
||||||
|
fprintf(stdout, "length scale: %.3f\n", matcha_model_config->length_scale);
|
||||||
|
fprintf(stdout, "dict_dir: %s\n", matcha_model_config->dict_dir);
|
||||||
|
|
||||||
fprintf(stdout, "----------tts model config----------\n");
|
fprintf(stdout, "----------tts model config----------\n");
|
||||||
fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads);
|
fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads);
|
||||||
fprintf(stdout, "debug: %d\n", tts_model_config->debug);
|
fprintf(stdout, "debug: %d\n", tts_model_config->debug);
|
||||||
|
|||||||
Reference in New Issue
Block a user