diff --git a/.github/scripts/test-dot-net.sh b/.github/scripts/test-dot-net.sh index 7c339e15..f120653c 100755 --- a/.github/scripts/test-dot-net.sh +++ b/.github/scripts/test-dot-net.sh @@ -2,7 +2,11 @@ cd dotnet-examples/ -cd ./offline-tts +cd ./kokoro-tts +./run-kokoro-en.sh +ls -lh + +cd ../offline-tts ./run-matcha-zh.sh ls -lh *.wav ./run-matcha-en.sh @@ -19,7 +23,8 @@ pushd ../.. mkdir tts -cp dotnet-examples/offline-tts/*.wav ./tts +cp -v dotnet-examples/kokoro-tts/*.wav ./tts +cp -v dotnet-examples/offline-tts/*.wav ./tts popd cd ../offline-speaker-diarization diff --git a/dotnet-examples/kokoro-tts-play/Program.cs b/dotnet-examples/kokoro-tts-play/Program.cs new file mode 100644 index 00000000..eea22cc2 --- /dev/null +++ b/dotnet-examples/kokoro-tts-play/Program.cs @@ -0,0 +1,189 @@ +// Copyright (c) 2025 Xiaomi Corporation +// +// This file shows how to use a non-streaming Kokoro TTS model +// for text-to-speech +// Please refer to +// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html +// and +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models +// to download pre-trained models +using PortAudioSharp; +using SherpaOnnx; +using System.Collections.Concurrent; +using System.Runtime.InteropServices; + +class OfflineTtsDemo +{ + static void Main(string[] args) + { + var config = new OfflineTtsConfig(); + config.Model.Kokoro.Model = "./kokoro-en-v0_19/model.onnx"; + config.Model.Kokoro.Voices = "./kokoro-en-v0_19/voices.bin"; + config.Model.Kokoro.Tokens = "./kokoro-en-v0_19/tokens.txt"; + config.Model.Kokoro.DataDir = "./kokoro-en-v0_19/espeak-ng-data"; + + config.Model.NumThreads = 2; + config.Model.Debug = 1; + config.Model.Provider = "cpu"; + + var tts = new OfflineTts(config); + var speed = 1.0f; + var text = "Today as always, men fall into two groups: slaves and free men. Whoever " + + "does not have two-thirds of his day for himself, is a slave, whatever " + + "he may be: a statesman, a businessman, an official, or a scholar. 
" + + "Friends fell out often because life was changing so fast. The easiest " + + "thing in the world was to lose touch with someone."; + + // mapping of sid to voice name + // 0->af, 1->af_bella, 2->af_nicole, 3->af_sarah, 4->af_sky, 5->am_adam + // 6->am_michael, 7->bf_emma, 8->bf_isabella, 9->bm_george, 10->bm_lewis + var sid = 0; + + + Console.WriteLine(PortAudio.VersionInfo.versionText); + PortAudio.Initialize(); + Console.WriteLine($"Number of devices: {PortAudio.DeviceCount}"); + + for (int i = 0; i != PortAudio.DeviceCount; ++i) + { + Console.WriteLine($" Device {i}"); + DeviceInfo deviceInfo = PortAudio.GetDeviceInfo(i); + Console.WriteLine($" Name: {deviceInfo.name}"); + Console.WriteLine($" Max output channels: {deviceInfo.maxOutputChannels}"); + Console.WriteLine($" Default sample rate: {deviceInfo.defaultSampleRate}"); + } + int deviceIndex = PortAudio.DefaultOutputDevice; + if (deviceIndex == PortAudio.NoDevice) + { + Console.WriteLine("No default output device found. Please use ../offline-tts instead"); + Environment.Exit(1); + } + + var info = PortAudio.GetDeviceInfo(deviceIndex); + Console.WriteLine(); + Console.WriteLine($"Use output default device {deviceIndex} ({info.name})"); + + var param = new StreamParameters(); + param.device = deviceIndex; + param.channelCount = 1; + param.sampleFormat = SampleFormat.Float32; + param.suggestedLatency = info.defaultLowOutputLatency; + param.hostApiSpecificStreamInfo = IntPtr.Zero; + + // https://learn.microsoft.com/en-us/dotnet/standard/collections/thread-safe/blockingcollection-overview + var dataItems = new BlockingCollection<float[]>(); + + var MyCallback = (IntPtr samples, int n, float progress) => + { + Console.WriteLine($"Progress {progress*100}%"); + + float[] data = new float[n]; + + Marshal.Copy(samples, data, 0, n); + + dataItems.Add(data); + + // 1 means to keep generating + // 0 means to stop generating + return 1; + }; + + var playFinished = false; + + float[]? 
lastSampleArray = null; + int lastIndex = 0; // not played + + PortAudioSharp.Stream.Callback playCallback = (IntPtr input, IntPtr output, + UInt32 frameCount, + ref StreamCallbackTimeInfo timeInfo, + StreamCallbackFlags statusFlags, + IntPtr userData + ) => + { + if (dataItems.IsCompleted && lastSampleArray == null && lastIndex == 0) + { + Console.WriteLine($"Finished playing"); + playFinished = true; + return StreamCallbackResult.Complete; + } + + int expected = Convert.ToInt32(frameCount); + int i = 0; + + while ((lastSampleArray != null || dataItems.Count != 0) && (i < expected)) + { + int needed = expected - i; + + if (lastSampleArray != null) + { + int remaining = lastSampleArray.Length - lastIndex; + if (remaining >= needed) + { + float[] this_block = lastSampleArray.Skip(lastIndex).Take(needed).ToArray(); + lastIndex += needed; + if (lastIndex == lastSampleArray.Length) + { + lastSampleArray = null; + lastIndex = 0; + } + + Marshal.Copy(this_block, 0, IntPtr.Add(output, i * sizeof(float)), needed); + return StreamCallbackResult.Continue; + } + + float[] this_block2 = lastSampleArray.Skip(lastIndex).Take(remaining).ToArray(); + lastIndex = 0; + lastSampleArray = null; + + Marshal.Copy(this_block2, 0, IntPtr.Add(output, i * sizeof(float)), remaining); + i += remaining; + continue; + } + + if (dataItems.Count != 0) + { + lastSampleArray = dataItems.Take(); + lastIndex = 0; + } + } + + if (i < expected) + { + int sizeInBytes = (expected - i) * 4; + Marshal.Copy(new byte[sizeInBytes], 0, IntPtr.Add(output, i * sizeof(float)), sizeInBytes); + } + + return StreamCallbackResult.Continue; + }; + + PortAudioSharp.Stream stream = new PortAudioSharp.Stream(inParams: null, outParams: param, sampleRate: tts.SampleRate, + framesPerBuffer: 0, + streamFlags: StreamFlags.ClipOff, + callback: playCallback, + userData: IntPtr.Zero + ); + + stream.Start(); + + var callback = new OfflineTtsCallbackProgress(MyCallback); + + var audio = tts.GenerateWithCallbackProgress(text, 
speed, sid, callback); + var outputFilename = "./generated-kokoro-0.wav"; + var ok = audio.SaveToWaveFile(outputFilename); + + if (ok) + { + Console.WriteLine($"Wrote to {outputFilename} succeeded!"); + } + else + { + Console.WriteLine($"Failed to write {outputFilename}"); + } + dataItems.CompleteAdding(); + + while (!playFinished) + { + Thread.Sleep(100); // 100ms + } + } +} diff --git a/dotnet-examples/kokoro-tts-play/kokoro-tts-play.csproj b/dotnet-examples/kokoro-tts-play/kokoro-tts-play.csproj new file mode 100644 index 00000000..6c725686 --- /dev/null +++ b/dotnet-examples/kokoro-tts-play/kokoro-tts-play.csproj @@ -0,0 +1,19 @@ + + + + Exe + net8.0 + kokoro_tts_play + enable + enable + + + + + + + + + + + diff --git a/dotnet-examples/kokoro-tts-play/run-kokoro-en.sh b/dotnet-examples/kokoro-tts-play/run-kokoro-en.sh new file mode 100755 index 00000000..08bdc693 --- /dev/null +++ b/dotnet-examples/kokoro-tts-play/run-kokoro-en.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +set -ex + +if [ ! 
-f ./kokoro-en-v0_19/model.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 + tar xf kokoro-en-v0_19.tar.bz2 + rm kokoro-en-v0_19.tar.bz2 +fi + +dotnet run diff --git a/dotnet-examples/kokoro-tts/Program.cs b/dotnet-examples/kokoro-tts/Program.cs new file mode 100644 index 00000000..61792683 --- /dev/null +++ b/dotnet-examples/kokoro-tts/Program.cs @@ -0,0 +1,70 @@ +// Copyright (c) 2025 Xiaomi Corporation +// +// This file shows how to use a non-streaming Kokoro TTS model +// for text-to-speech +// Please refer to +// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html +// and +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models +// to download pre-trained models +using SherpaOnnx; +using System.Runtime.InteropServices; + +class OfflineTtsDemo +{ + static void Main(string[] args) + { + var config = new OfflineTtsConfig(); + config.Model.Kokoro.Model = "./kokoro-en-v0_19/model.onnx"; + config.Model.Kokoro.Voices = "./kokoro-en-v0_19/voices.bin"; + config.Model.Kokoro.Tokens = "./kokoro-en-v0_19/tokens.txt"; + config.Model.Kokoro.DataDir = "./kokoro-en-v0_19/espeak-ng-data"; + + config.Model.NumThreads = 2; + config.Model.Debug = 1; + config.Model.Provider = "cpu"; + + var tts = new OfflineTts(config); + var speed = 1.0f; + var text = "Today as always, men fall into two groups: slaves and free men. Whoever " + + "does not have two-thirds of his day for himself, is a slave, whatever " + + "he may be: a statesman, a businessman, an official, or a scholar. " + + "Friends fell out often because life was changing so fast. 
The easiest " + + "thing in the world was to lose touch with someone."; + + // mapping of sid to voice name + // 0->af, 1->af_bella, 2->af_nicole, 3->af_sarah, 4->af_sky, 5->am_adam + // 6->am_michael, 7->bf_emma, 8->bf_isabella, 9->bm_george, 10->bm_lewis + var sid = 0; + + var MyCallback = (IntPtr samples, int n, float progress) => + { + float[] data = new float[n]; + Marshal.Copy(samples, data, 0, n); + // You can process samples here, e.g., play them. + // See ../kokoro-tts-play for how to play them + Console.WriteLine($"Progress {progress*100}%"); + + // 1 means to keep generating + // 0 means to stop generating + return 1; + }; + + var callback = new OfflineTtsCallbackProgress(MyCallback); + + var audio = tts.GenerateWithCallbackProgress(text, speed, sid, callback); + + var outputFilename = "./generated-kokoro-0.wav"; + var ok = audio.SaveToWaveFile(outputFilename); + + if (ok) + { + Console.WriteLine($"Wrote to {outputFilename} succeeded!"); + } + else + { + Console.WriteLine($"Failed to write {outputFilename}"); + } + } +} + diff --git a/dotnet-examples/kokoro-tts/kokoro-tts.csproj b/dotnet-examples/kokoro-tts/kokoro-tts.csproj new file mode 100644 index 00000000..132819c6 --- /dev/null +++ b/dotnet-examples/kokoro-tts/kokoro-tts.csproj @@ -0,0 +1,15 @@ + + + + Exe + net8.0 + kokoro_tts + enable + enable + + + + + + + diff --git a/dotnet-examples/kokoro-tts/run-kokoro-en.sh b/dotnet-examples/kokoro-tts/run-kokoro-en.sh new file mode 100755 index 00000000..08bdc693 --- /dev/null +++ b/dotnet-examples/kokoro-tts/run-kokoro-en.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +set -ex + +if [ ! 
-f ./kokoro-en-v0_19/model.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 + tar xf kokoro-en-v0_19.tar.bz2 + rm kokoro-en-v0_19.tar.bz2 +fi + +dotnet run diff --git a/dotnet-examples/sherpa-onnx.sln b/dotnet-examples/sherpa-onnx.sln index 1ebcdf46..404c4976 100644 --- a/dotnet-examples/sherpa-onnx.sln +++ b/dotnet-examples/sherpa-onnx.sln @@ -31,6 +31,10 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "keyword-spotting-from-micro EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "offline-speaker-diarization", "offline-speaker-diarization\offline-speaker-diarization.csproj", "{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "kokoro-tts", "kokoro-tts\kokoro-tts.csproj", "{9C0ABE6C-1F54-42B5-804E-C3FED6668F52}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "kokoro-tts-play", "kokoro-tts-play\kokoro-tts-play.csproj", "{EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -93,6 +97,14 @@ Global {D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Debug|Any CPU.Build.0 = Debug|Any CPU {D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Release|Any CPU.ActiveCfg = Release|Any CPU {D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Release|Any CPU.Build.0 = Release|Any CPU + {9C0ABE6C-1F54-42B5-804E-C3FED6668F52}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {9C0ABE6C-1F54-42B5-804E-C3FED6668F52}.Debug|Any CPU.Build.0 = Debug|Any CPU + {9C0ABE6C-1F54-42B5-804E-C3FED6668F52}.Release|Any CPU.ActiveCfg = Release|Any CPU + {9C0ABE6C-1F54-42B5-804E-C3FED6668F52}.Release|Any CPU.Build.0 = Release|Any CPU + {EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}.Debug|Any CPU.Build.0 = Debug|Any CPU + {EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}.Release|Any CPU.ActiveCfg = Release|Any CPU + 
{EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/scripts/dotnet/OfflineTts.cs b/scripts/dotnet/OfflineTts.cs index e4d29733..334ba228 100644 --- a/scripts/dotnet/OfflineTts.cs +++ b/scripts/dotnet/OfflineTts.cs @@ -7,6 +7,7 @@ namespace SherpaOnnx { // IntPtr is actually a `const float*` from C++ public delegate int OfflineTtsCallback(IntPtr samples, int n); + public delegate int OfflineTtsCallbackProgress(IntPtr samples, int n, float progress); public class OfflineTts : IDisposable { @@ -36,6 +37,16 @@ namespace SherpaOnnx return new OfflineTtsGeneratedAudio(p); } + public OfflineTtsGeneratedAudio GenerateWithCallbackProgress(String text, float speed, int speakerId, OfflineTtsCallbackProgress callback) + { + byte[] utf8Bytes = Encoding.UTF8.GetBytes(text); + byte[] utf8BytesWithNull = new byte[utf8Bytes.Length + 1]; // +1 for null terminator + Array.Copy(utf8Bytes, utf8BytesWithNull, utf8Bytes.Length); + utf8BytesWithNull[utf8Bytes.Length] = 0; // Null terminator + IntPtr p = SherpaOnnxOfflineTtsGenerateWithProgressCallback(_handle.Handle, utf8BytesWithNull, speakerId, speed, callback); + return new OfflineTtsGeneratedAudio(p); + } + public void Dispose() { Cleanup(); @@ -92,5 +103,8 @@ namespace SherpaOnnx [DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)] private static extern IntPtr SherpaOnnxOfflineTtsGenerateWithCallback(IntPtr handle, [MarshalAs(UnmanagedType.LPArray, ArraySubType = UnmanagedType.I1)] byte[] utf8Text, int sid, float speed, OfflineTtsCallback callback); + + [DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)] + private static extern IntPtr SherpaOnnxOfflineTtsGenerateWithProgressCallback(IntPtr handle, [MarshalAs(UnmanagedType.LPArray, ArraySubType = UnmanagedType.I1)] byte[] utf8Text, int sid, float speed, OfflineTtsCallbackProgress callback); } } diff --git 
a/scripts/dotnet/OfflineTtsKokoroModelConfig.cs b/scripts/dotnet/OfflineTtsKokoroModelConfig.cs new file mode 100644 index 00000000..18fd60da --- /dev/null +++ b/scripts/dotnet/OfflineTtsKokoroModelConfig.cs @@ -0,0 +1,33 @@ +/// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) + +using System.Runtime.InteropServices; + +namespace SherpaOnnx +{ + [StructLayout(LayoutKind.Sequential)] + public struct OfflineTtsKokoroModelConfig + { + public OfflineTtsKokoroModelConfig() + { + Model = ""; + Voices = ""; + Tokens = ""; + DataDir = ""; + + LengthScale = 1.0F; + } + [MarshalAs(UnmanagedType.LPStr)] + public string Model; + + [MarshalAs(UnmanagedType.LPStr)] + public string Voices; + + [MarshalAs(UnmanagedType.LPStr)] + public string Tokens; + + [MarshalAs(UnmanagedType.LPStr)] + public string DataDir; + + public float LengthScale; + } +} diff --git a/scripts/dotnet/OfflineTtsModelConfig.cs b/scripts/dotnet/OfflineTtsModelConfig.cs index e5caa117..9b1ec550 100644 --- a/scripts/dotnet/OfflineTtsModelConfig.cs +++ b/scripts/dotnet/OfflineTtsModelConfig.cs @@ -12,6 +12,7 @@ namespace SherpaOnnx { Vits = new OfflineTtsVitsModelConfig(); Matcha = new OfflineTtsMatchaModelConfig(); + Kokoro = new OfflineTtsKokoroModelConfig(); NumThreads = 1; Debug = 0; Provider = "cpu"; @@ -24,5 +25,6 @@ namespace SherpaOnnx public string Provider; public OfflineTtsMatchaModelConfig Matcha; + public OfflineTtsKokoroModelConfig Kokoro; } }