Add C# API for Kokoro TTS models (#1720)
This commit is contained in:
9
.github/scripts/test-dot-net.sh
vendored
9
.github/scripts/test-dot-net.sh
vendored
@@ -2,7 +2,11 @@
|
||||
|
||||
cd dotnet-examples/
|
||||
|
||||
cd ./offline-tts
|
||||
cd ./kokoro-tts
|
||||
./run-kokoro-en.sh
|
||||
ls -lh
|
||||
|
||||
cd ../offline-tts
|
||||
./run-matcha-zh.sh
|
||||
ls -lh *.wav
|
||||
./run-matcha-en.sh
|
||||
@@ -19,7 +23,8 @@ pushd ../..
|
||||
|
||||
mkdir tts
|
||||
|
||||
cp dotnet-examples/offline-tts/*.wav ./tts
|
||||
cp -v dotnet-examples/kokoro-tts/*.wav ./tts
|
||||
cp -v dotnet-examples/offline-tts/*.wav ./tts
|
||||
popd
|
||||
|
||||
cd ../offline-speaker-diarization
|
||||
|
||||
189
dotnet-examples/kokoro-tts-play/Program.cs
Normal file
189
dotnet-examples/kokoro-tts-play/Program.cs
Normal file
@@ -0,0 +1,189 @@
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
//
|
||||
// This file shows how to use a non-streaming Kokoro TTS model
|
||||
// for text-to-speech
|
||||
// Please refer to
|
||||
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
|
||||
// and
|
||||
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
|
||||
// to download pre-trained models
|
||||
using PortAudioSharp;
|
||||
using SherpaOnnx;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Runtime.InteropServices;
|
||||
|
||||
/// <summary>
/// Kokoro text-to-speech demo that plays the audio through the default
/// output device while it is still being generated, and then saves the
/// complete waveform to <c>./generated-kokoro-0.wav</c>.
/// </summary>
class OfflineTtsDemo
{
    /// <summary>
    /// Program entry point. Exits with code 1 when no default output device
    /// is available.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        var config = new OfflineTtsConfig();
        config.Model.Kokoro.Model = "./kokoro-en-v0_19/model.onnx";
        config.Model.Kokoro.Voices = "./kokoro-en-v0_19/voices.bin";
        config.Model.Kokoro.Tokens = "./kokoro-en-v0_19/tokens.txt";
        config.Model.Kokoro.DataDir = "./kokoro-en-v0_19/espeak-ng-data";

        config.Model.NumThreads = 2;
        config.Model.Debug = 1;
        config.Model.Provider = "cpu";

        // OfflineTts is IDisposable; release it when Main returns.
        using var tts = new OfflineTts(config);
        var speed = 1.0f;
        var text = "Today as always, men fall into two groups: slaves and free men. Whoever " +
            "does not have two-thirds of his day for himself, is a slave, whatever " +
            "he may be: a statesman, a businessman, an official, or a scholar. " +
            "Friends fell out often because life was changing so fast. The easiest " +
            "thing in the world was to lose touch with someone.";

        // mapping of sid to voice name
        // 0->af, 1->af_bella, 2->af_nicole, 3->af_sarah, 4->af_sky, 5->am_adam
        // 6->am_michael, 7->bf_emma, 8->bf_isabella, 9->bm_george, 10->bm_lewis
        var sid = 0;

        Console.WriteLine(PortAudio.VersionInfo.versionText);
        PortAudio.Initialize();
        Console.WriteLine($"Number of devices: {PortAudio.DeviceCount}");

        for (int i = 0; i != PortAudio.DeviceCount; ++i)
        {
            Console.WriteLine($" Device {i}");
            DeviceInfo deviceInfo = PortAudio.GetDeviceInfo(i);
            Console.WriteLine($" Name: {deviceInfo.name}");
            Console.WriteLine($" Max output channels: {deviceInfo.maxOutputChannels}");
            Console.WriteLine($" Default sample rate: {deviceInfo.defaultSampleRate}");
        }

        int deviceIndex = PortAudio.DefaultOutputDevice;
        if (deviceIndex == PortAudio.NoDevice)
        {
            // ../kokoro-tts is the Kokoro example that only writes a wave file.
            Console.WriteLine("No default output device found. Please use ../kokoro-tts instead");
            Environment.Exit(1);
        }

        var info = PortAudio.GetDeviceInfo(deviceIndex);
        Console.WriteLine();
        Console.WriteLine($"Use output default device {deviceIndex} ({info.name})");

        var param = new StreamParameters();
        param.device = deviceIndex;
        param.channelCount = 1;
        param.sampleFormat = SampleFormat.Float32;
        param.suggestedLatency = info.defaultLowOutputLatency;
        param.hostApiSpecificStreamInfo = IntPtr.Zero;

        // Queue of generated chunks: filled by the TTS progress callback,
        // drained by the audio playback callback.
        // https://learn.microsoft.com/en-us/dotnet/standard/collections/thread-safe/blockingcollection-overview
        var dataItems = new BlockingCollection<float[]>();

        // Invoked by the generator with each newly produced chunk of samples.
        OfflineTtsCallbackProgress callback = (IntPtr samples, int n, float progress) =>
        {
            Console.WriteLine($"Progress {progress*100}%");

            // Copy the samples into managed memory before queueing them.
            float[] data = new float[n];
            Marshal.Copy(samples, data, 0, n);
            dataItems.Add(data);

            // 1 means to keep generating
            // 0 means to stop generating
            return 1;
        };

        // Signalled by the playback callback once every queued sample has
        // been played. An event (instead of a plain bool that is polled)
        // makes the hand-off between the two threads well defined.
        var playFinished = new ManualResetEventSlim(false);

        float[]? pending = null; // chunk that is currently being played
        int pendingOffset = 0;   // index of the first sample in `pending` not played yet

        PortAudioSharp.Stream.Callback playCallback = (IntPtr input, IntPtr output,
            UInt32 frameCount,
            ref StreamCallbackTimeInfo timeInfo,
            StreamCallbackFlags statusFlags,
            IntPtr userData) =>
        {
            // Nothing half-played, nothing queued, and no more chunks will be added.
            if (pending == null && dataItems.IsCompleted)
            {
                Console.WriteLine($"Finished playing");
                playFinished.Set();
                return StreamCallbackResult.Complete;
            }

            int expected = Convert.ToInt32(frameCount);
            int written = 0;

            while (written < expected)
            {
                float[]? chunk = pending;
                if (chunk == null)
                {
                    // TryTake never blocks; stop when nothing is queued right now.
                    if (!dataItems.TryTake(out chunk))
                    {
                        break;
                    }

                    pending = chunk;
                    pendingOffset = 0;
                }

                // Copy straight from the chunk into the output buffer; no
                // temporary array is allocated.
                int count = Math.Min(chunk.Length - pendingOffset, expected - written);
                Marshal.Copy(chunk, pendingOffset, IntPtr.Add(output, written * sizeof(float)), count);
                written += count;
                pendingOffset += count;

                if (pendingOffset == chunk.Length)
                {
                    // Chunk fully consumed; fetch the next one on the next iteration.
                    pending = null;
                    pendingOffset = 0;
                }
            }

            if (written < expected)
            {
                // Not enough samples available yet: pad the rest of the buffer with silence.
                int sizeInBytes = (expected - written) * sizeof(float);
                Marshal.Copy(new byte[sizeInBytes], 0, IntPtr.Add(output, written * sizeof(float)), sizeInBytes);
            }

            return StreamCallbackResult.Continue;
        };

        PortAudioSharp.Stream stream = new PortAudioSharp.Stream(inParams: null, outParams: param, sampleRate: tts.SampleRate,
            framesPerBuffer: 0,
            streamFlags: StreamFlags.ClipOff,
            callback: playCallback,
            userData: IntPtr.Zero
            );

        stream.Start();

        var audio = tts.GenerateWithCallbackProgress(text, speed, sid, callback);

        // Generation is done, so no more chunks will arrive. Mark the queue
        // complete before saving so that playback can terminate even if
        // saving the file fails.
        dataItems.CompleteAdding();

        var outputFilename = "./generated-kokoro-0.wav";
        var ok = audio.SaveToWaveFile(outputFilename);

        if (ok)
        {
            Console.WriteLine($"Wrote to {outputFilename} succeeded!");
        }
        else
        {
            Console.WriteLine($"Failed to write {outputFilename}");
        }

        // Block until the playback callback reports that everything was played.
        playFinished.Wait();

        // NOTE(review): keeps the stream and its callback reachable for the
        // whole playback; confirm whether PortAudioSharp already pins them.
        GC.KeepAlive(stream);
        GC.KeepAlive(playCallback);
    }
}
|
||||
19
dotnet-examples/kokoro-tts-play/kokoro-tts-play.csproj
Normal file
19
dotnet-examples/kokoro-tts-play/kokoro-tts-play.csproj
Normal file
@@ -0,0 +1,19 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<RootNamespace>kokoro_tts_play</RootNamespace>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="PortAudioSharp2" Version="*" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Common\Common.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
10
dotnet-examples/kokoro-tts-play/run-kokoro-en.sh
Executable file
10
dotnet-examples/kokoro-tts-play/run-kokoro-en.sh
Executable file
@@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env bash
|
||||
# Runs the Kokoro playback example, downloading the model on first use.
# -e: abort on the first failing command; -x: echo each command before running it.
set -ex
|
||||
|
||||
# Download and unpack the model only when model.onnx is not present yet.
if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then
|
||||
# -L follows redirects, -O saves under the remote file name.
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
|
||||
tar xf kokoro-en-v0_19.tar.bz2
|
||||
# The archive is no longer needed once it has been extracted.
rm kokoro-en-v0_19.tar.bz2
|
||||
fi
|
||||
|
||||
# Build and run the project in the current directory.
dotnet run
|
||||
70
dotnet-examples/kokoro-tts/Program.cs
Normal file
70
dotnet-examples/kokoro-tts/Program.cs
Normal file
@@ -0,0 +1,70 @@
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
//
|
||||
// This file shows how to use a non-streaming Kokoro TTS model
|
||||
// for text-to-speech
|
||||
// Please refer to
|
||||
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
|
||||
// and
|
||||
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
|
||||
// to download pre-trained models
|
||||
using SherpaOnnx;
|
||||
using System.Runtime.InteropServices;
|
||||
|
||||
/// <summary>
/// Kokoro text-to-speech demo that synthesizes a fixed English passage and
/// saves the result to <c>./generated-kokoro-0.wav</c>.
/// </summary>
class OfflineTtsDemo
{
    /// <summary>Program entry point.</summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        var config = new OfflineTtsConfig();
        config.Model.Kokoro.Model = "./kokoro-en-v0_19/model.onnx";
        config.Model.Kokoro.Voices = "./kokoro-en-v0_19/voices.bin";
        config.Model.Kokoro.Tokens = "./kokoro-en-v0_19/tokens.txt";
        config.Model.Kokoro.DataDir = "./kokoro-en-v0_19/espeak-ng-data";

        config.Model.NumThreads = 2;
        config.Model.Debug = 1;
        config.Model.Provider = "cpu";

        // OfflineTts is IDisposable; release it when Main returns.
        using var tts = new OfflineTts(config);
        var speed = 1.0f;
        var text = "Today as always, men fall into two groups: slaves and free men. Whoever " +
            "does not have two-thirds of his day for himself, is a slave, whatever " +
            "he may be: a statesman, a businessman, an official, or a scholar. " +
            "Friends fell out often because life was changing so fast. The easiest " +
            "thing in the world was to lose touch with someone.";

        // mapping of sid to voice name
        // 0->af, 1->af_bella, 2->af_nicole, 3->af_sarah, 4->af_sky, 5->am_adam
        // 6->am_michael, 7->bf_emma, 8->bf_isabella, 9->bm_george, 10->bm_lewis
        var sid = 0;

        // Invoked by the generator with each newly produced chunk of samples.
        var MyCallback = (IntPtr samples, int n, float progress) =>
        {
            // Copy the samples into managed memory.
            float[] data = new float[n];
            Marshal.Copy(samples, data, 0, n);
            // You can process samples here, e.g., play them.
            // See ../kokoro-tts-play for how to play them
            Console.WriteLine($"Progress {progress*100}%");

            // 1 means to keep generating
            // 0 means to stop generating
            return 1;
        };

        var callback = new OfflineTtsCallbackProgress(MyCallback);

        var audio = tts.GenerateWithCallbackProgress(text, speed, sid, callback);

        var outputFilename = "./generated-kokoro-0.wav";
        var ok = audio.SaveToWaveFile(outputFilename);

        if (ok)
        {
            Console.WriteLine($"Wrote to {outputFilename} succeeded!");
        }
        else
        {
            Console.WriteLine($"Failed to write {outputFilename}");
        }
    }
}
|
||||
|
||||
15
dotnet-examples/kokoro-tts/kokoro-tts.csproj
Normal file
15
dotnet-examples/kokoro-tts/kokoro-tts.csproj
Normal file
@@ -0,0 +1,15 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<RootNamespace>kokoro_tts</RootNamespace>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Common\Common.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
10
dotnet-examples/kokoro-tts/run-kokoro-en.sh
Executable file
10
dotnet-examples/kokoro-tts/run-kokoro-en.sh
Executable file
@@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env bash
|
||||
# Runs the Kokoro text-to-speech example, downloading the model on first use.
# -e: abort on the first failing command; -x: echo each command before running it.
set -ex
|
||||
|
||||
# Download and unpack the model only when model.onnx is not present yet.
if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then
|
||||
# -L follows redirects, -O saves under the remote file name.
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
|
||||
tar xf kokoro-en-v0_19.tar.bz2
|
||||
# The archive is no longer needed once it has been extracted.
rm kokoro-en-v0_19.tar.bz2
|
||||
fi
|
||||
|
||||
# Build and run the project in the current directory.
dotnet run
|
||||
@@ -31,6 +31,10 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "keyword-spotting-from-micro
|
||||
EndProject
|
||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "offline-speaker-diarization", "offline-speaker-diarization\offline-speaker-diarization.csproj", "{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "kokoro-tts", "kokoro-tts\kokoro-tts.csproj", "{9C0ABE6C-1F54-42B5-804E-C3FED6668F52}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "kokoro-tts-play", "kokoro-tts-play\kokoro-tts-play.csproj", "{EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Any CPU = Debug|Any CPU
|
||||
@@ -93,6 +97,14 @@ Global
|
||||
{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{9C0ABE6C-1F54-42B5-804E-C3FED6668F52}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{9C0ABE6C-1F54-42B5-804E-C3FED6668F52}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{9C0ABE6C-1F54-42B5-804E-C3FED6668F52}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{9C0ABE6C-1F54-42B5-804E-C3FED6668F52}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
||||
@@ -7,6 +7,7 @@ namespace SherpaOnnx
|
||||
{
|
||||
// IntPtr is actually a `const float*` from C++
|
||||
public delegate int OfflineTtsCallback(IntPtr samples, int n);
|
||||
public delegate int OfflineTtsCallbackProgress(IntPtr samples, int n, float progress);
|
||||
|
||||
public class OfflineTts : IDisposable
|
||||
{
|
||||
@@ -36,6 +37,16 @@ namespace SherpaOnnx
|
||||
return new OfflineTtsGeneratedAudio(p);
|
||||
}
|
||||
|
||||
/// <summary>
/// Generates audio for <paramref name="text"/> and passes
/// <paramref name="callback"/> to the native generator.
/// </summary>
/// <param name="text">Text to synthesize; sent to native code as UTF-8.</param>
/// <param name="speed">Speed value forwarded unchanged to the native call.</param>
/// <param name="speakerId">Speaker ID forwarded unchanged to the native call.</param>
/// <param name="callback">Callback receiving a sample pointer, a sample count and a progress value.</param>
/// <returns>A wrapper around the pointer returned by the native call.</returns>
public OfflineTtsGeneratedAudio GenerateWithCallbackProgress(String text, float speed, int speakerId, OfflineTtsCallbackProgress callback)
{
    // Encode directly into a buffer that is one byte longer than the UTF-8
    // payload. New arrays are zero-filled, so that extra last byte is the
    // null terminator.
    int payloadLength = Encoding.UTF8.GetByteCount(text);
    byte[] nullTerminated = new byte[payloadLength + 1];
    Encoding.UTF8.GetBytes(text, 0, text.Length, nullTerminated, 0);

    // Note the native argument order: speaker ID first, then speed.
    IntPtr generated = SherpaOnnxOfflineTtsGenerateWithProgressCallback(
        _handle.Handle, nullTerminated, speakerId, speed, callback);

    return new OfflineTtsGeneratedAudio(generated);
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
Cleanup();
|
||||
@@ -92,5 +103,8 @@ namespace SherpaOnnx
|
||||
|
||||
[DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)]
|
||||
private static extern IntPtr SherpaOnnxOfflineTtsGenerateWithCallback(IntPtr handle, [MarshalAs(UnmanagedType.LPArray, ArraySubType = UnmanagedType.I1)] byte[] utf8Text, int sid, float speed, OfflineTtsCallback callback);
|
||||
|
||||
[DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)]
|
||||
private static extern IntPtr SherpaOnnxOfflineTtsGenerateWithProgressCallback(IntPtr handle, [MarshalAs(UnmanagedType.LPArray, ArraySubType = UnmanagedType.I1)] byte[] utf8Text, int sid, float speed, OfflineTtsCallbackProgress callback);
|
||||
}
|
||||
}
|
||||
|
||||
33
scripts/dotnet/OfflineTtsKokoroModelConfig.cs
Normal file
33
scripts/dotnet/OfflineTtsKokoroModelConfig.cs
Normal file
@@ -0,0 +1,33 @@
|
||||
// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
|
||||
using System.Runtime.InteropServices;
|
||||
|
||||
namespace SherpaOnnx
|
||||
{
|
||||
/// <summary>
/// Settings for a Kokoro text-to-speech model.
/// </summary>
/// <remarks>
/// The struct uses sequential layout, so the declaration order of the fields
/// is part of its binary layout and must not be changed.
/// </remarks>
[StructLayout(LayoutKind.Sequential)]
public struct OfflineTtsKokoroModelConfig
{
    /// <summary>Path to the model file, e.g. <c>model.onnx</c>. Defaults to an empty string.</summary>
    [MarshalAs(UnmanagedType.LPStr)]
    public string Model;

    /// <summary>Path to the voices file, e.g. <c>voices.bin</c>. Defaults to an empty string.</summary>
    [MarshalAs(UnmanagedType.LPStr)]
    public string Voices;

    /// <summary>Path to the tokens file, e.g. <c>tokens.txt</c>. Defaults to an empty string.</summary>
    [MarshalAs(UnmanagedType.LPStr)]
    public string Tokens;

    /// <summary>Path to the data directory, e.g. <c>espeak-ng-data</c>. Defaults to an empty string.</summary>
    [MarshalAs(UnmanagedType.LPStr)]
    public string DataDir;

    /// <summary>Length scale; defaults to 1.0.</summary>
    public float LengthScale;

    /// <summary>
    /// Creates a configuration with empty paths and a length scale of 1.0.
    /// </summary>
    public OfflineTtsKokoroModelConfig()
    {
        LengthScale = 1.0f;
        Model = Voices = Tokens = DataDir = string.Empty;
    }
}
|
||||
}
|
||||
@@ -12,6 +12,7 @@ namespace SherpaOnnx
|
||||
{
|
||||
Vits = new OfflineTtsVitsModelConfig();
|
||||
Matcha = new OfflineTtsMatchaModelConfig();
|
||||
Kokoro = new OfflineTtsKokoroModelConfig();
|
||||
NumThreads = 1;
|
||||
Debug = 0;
|
||||
Provider = "cpu";
|
||||
@@ -24,5 +25,6 @@ namespace SherpaOnnx
|
||||
public string Provider;
|
||||
|
||||
public OfflineTtsMatchaModelConfig Matcha;
|
||||
public OfflineTtsKokoroModelConfig Kokoro;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user