Upgraded to .NET 8 and made code style a little more internally consistent. (#1680)
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
|
||||
</PropertyGroup>
|
||||
<ItemGroup>
|
||||
|
||||
@@ -4,171 +4,166 @@ using System.IO;
|
||||
|
||||
using System.Runtime.InteropServices;
|
||||
|
||||
namespace SherpaOnnx
|
||||
namespace SherpaOnnx;
|
||||
|
||||
// Header found at the start of a PCM WAVE file.
//
// The fields are declared in the same order as the bytes appear in the file,
// because the reader copies the raw header bytes straight into this struct.
// With sequential layout the fields add up to 44 bytes.
[StructLayout(LayoutKind.Sequential)]
public struct WaveHeader
{
    public int ChunkID;
    public int ChunkSize;
    public int Format;
    public int SubChunk1ID;
    public int SubChunk1Size;
    public short AudioFormat;
    public short NumChannels;
    public int SampleRate;
    public int ByteRate;
    public short BlockAlign;
    public short BitsPerSample;
    public int SubChunk2ID;
    public int SubChunk2Size;

    // Checks that the header describes a 16-bit, single channel PCM stream.
    //
    // Returns true when every check passes. On the first failing check a
    // diagnostic line is written to the console and false is returned.
    public bool Validate()
    {
        // F F I R
        if (ChunkID != 0x46464952)
        {
            Console.WriteLine($"Invalid chunk ID: 0x{ChunkID:X}. Expect 0x46464952");
            return false;
        }

        // E V A W
        if (Format != 0x45564157)
        {
            Console.WriteLine($"Invalid format: 0x{Format:X}. Expect 0x45564157");
            return false;
        }

        // t m f
        if (SubChunk1ID != 0x20746d66)
        {
            Console.WriteLine($"Invalid SubChunk1ID: 0x{SubChunk1ID:X}. Expect 0x20746d66");
            return false;
        }

        if (SubChunk1Size != 16)
        {
            Console.WriteLine($"Invalid SubChunk1Size: {SubChunk1Size}. Expect 16");
            return false;
        }

        if (AudioFormat != 1)
        {
            Console.WriteLine($"Invalid AudioFormat: {AudioFormat}. Expect 1");
            return false;
        }

        if (NumChannels != 1)
        {
            Console.WriteLine($"Invalid NumChannels: {NumChannels}. Expect 1");
            return false;
        }

        if (ByteRate != (SampleRate * NumChannels * BitsPerSample / 8))
        {
            Console.WriteLine($"Invalid byte rate: {ByteRate}.");
            return false;
        }

        if (BlockAlign != (NumChannels * BitsPerSample / 8))
        {
            // Report the field that actually failed the check.
            Console.WriteLine($"Invalid block align: {BlockAlign}.");
            return false;
        }

        if (BitsPerSample != 16)
        { // we support only 16 bits per sample
            Console.WriteLine($"Invalid bits per sample: {BitsPerSample}. Expect 16");
            return false;
        }

        return true;
    }
}
|
||||
|
||||
// It supports only 16-bit, single channel WAVE format.
|
||||
// The sample rate can be any value.
|
||||
public class WaveReader
|
||||
{
|
||||
// Loads a 16-bit, single channel WAVE file and converts its samples to floats.
//
// fileName: path of the WAVE file to read.
//
// Throws ApplicationException when the file does not exist, when the header
// fails validation, or when the data chunk declares a negative size.
public WaveReader(string fileName)
{
    if (!File.Exists(fileName))
    {
        throw new ApplicationException($"{fileName} does not exist!");
    }

    // We only read, so ask for read access only. File.Open(path, FileMode.Open)
    // requests read/write access and fails on read-only files.
    using var stream = File.OpenRead(fileName);
    using var reader = new BinaryReader(stream);

    _header = ReadHeader(reader);

    if (!_header.Validate())
    {
        throw new ApplicationException($"Invalid wave file {fileName}");
    }

    SkipMetaData(reader);

    // now read samples
    // _header.SubChunk2Size contains number of bytes in total.
    // we assume each sample is of type int16
    if (_header.SubChunk2Size < 0)
    {
        throw new ApplicationException($"Invalid wave file {fileName}");
    }

    var buffer = reader.ReadBytes(_header.SubChunk2Size);

    // ReadBytes may return fewer bytes than requested (truncated file) and the
    // declared size may be odd, so size the output by the whole samples that
    // were actually read.
    var sampleCount = buffer.Length / sizeof(short);
    var samplesInt16 = new short[sampleCount];
    Buffer.BlockCopy(buffer, 0, samplesInt16, 0, sampleCount * sizeof(short));

    _samples = new float[sampleCount];

    for (var i = 0; i < sampleCount; ++i)
    {
        _samples[i] = samplesInt16[i] / 32768.0F;
    }
}
|
||||
|
||||
// Reads the fixed-size header from the current position of reader and
// reinterprets the raw bytes as a WaveHeader.
//
// Throws ApplicationException when the stream ends before a full header
// could be read.
private static WaveHeader ReadHeader(BinaryReader reader)
{
    var headerSize = Marshal.SizeOf<WaveHeader>();
    var bytes = reader.ReadBytes(headerSize);

    // ReadBytes returns a shorter array at end of stream. Marshalling from it
    // would read memory beyond the end of the array.
    if (bytes.Length < headerSize)
    {
        throw new ApplicationException(
            $"Wave header is truncated: expected {headerSize} bytes, got {bytes.Length}");
    }

    var handle = GCHandle.Alloc(bytes, GCHandleType.Pinned);
    try
    {
        return Marshal.PtrToStructure<WaveHeader>(handle.AddrOfPinnedObject());
    }
    finally
    {
        // Always release the pin, even if marshalling throws.
        handle.Free();
    }
}
|
||||
|
||||
// Advances reader past any chunks that precede the "data" chunk.
//
// On return the stream is positioned at the first byte of the sample data and
// _header.SubChunk2ID / _header.SubChunk2Size describe the data chunk.
//
// Throws ApplicationException when a chunk declares a negative size or the
// file ends before a data chunk is found.
private void SkipMetaData(BinaryReader reader)
{
    // a t a d
    const int DataChunkId = 0x61746164;

    // Every chunk starts with a 4-byte ID followed by a 4-byte size.
    const int ChunkHeaderSize = 8;

    var bs = reader.BaseStream;

    var subChunk2ID = _header.SubChunk2ID;
    var subChunk2Size = _header.SubChunk2Size;

    while (subChunk2ID != DataChunkId)
    {
        // RIFF chunks are word aligned: an odd-sized chunk is followed by one
        // pad byte that is not counted in its size field.
        var bytesToSkip = (long)subChunk2Size + (subChunk2Size & 1);

        if (subChunk2Size < 0 || bs.Length - bs.Position < bytesToSkip + ChunkHeaderSize)
        {
            throw new ApplicationException("Invalid wave file: no data chunk found");
        }

        bs.Seek(bytesToSkip, SeekOrigin.Current);
        subChunk2ID = reader.ReadInt32();
        subChunk2Size = reader.ReadInt32();
    }

    _header.SubChunk2ID = subChunk2ID;
    _header.SubChunk2Size = subChunk2Size;
}
|
||||
|
||||
// Header of the file that was loaded; SubChunk2ID and SubChunk2Size are
// overwritten by SkipMetaData.
private WaveHeader _header;

// Decoded samples: each int16 sample divided by 32768.
private float[] _samples;

// Sample rate stored in the header.
public int SampleRate
{
    get { return _header.SampleRate; }
}

// The decoded samples. The internal array itself is returned, not a copy.
public float[] Samples
{
    get { return _samples; }
}
|
||||
|
||||
// Loads fileName and prints the number of samples and the sample rate
// to the console.
public static void Test(string fileName)
{
    var wave = new WaveReader(fileName);

    var sampleCount = wave.Samples.Length;
    Console.WriteLine($"samples length: {sampleCount}");

    var sampleRate = wave.SampleRate;
    Console.WriteLine($"samples rate: {sampleRate}");
}
|
||||
|
||||
// It supports only 16-bit, single channel WAVE format.
// The sample rate can be any value.
public class WaveReader
{
    // Loads fileName and converts its int16 samples to floats.
    //
    // Throws ApplicationException when the file does not exist, when the header
    // is truncated or fails validation, or when no usable data chunk is found.
    public WaveReader(String fileName)
    {
        if (!File.Exists(fileName))
        {
            throw new ApplicationException($"{fileName} does not exist!");
        }

        // Read access is sufficient. File.Open(path, FileMode.Open) requests
        // read/write access and fails on read-only files.
        using (var stream = File.OpenRead(fileName))
        {
            using (var reader = new BinaryReader(stream))
            {
                _header = ReadHeader(reader);

                if (!_header.Validate())
                {
                    throw new ApplicationException($"Invalid wave file {fileName}");
                }

                SkipMetaData(reader);

                // now read samples
                // _header.SubChunk2Size contains number of bytes in total.
                // we assume each sample is of type int16
                if (_header.SubChunk2Size < 0)
                {
                    throw new ApplicationException($"Invalid wave file {fileName}");
                }

                byte[] buffer = reader.ReadBytes(_header.SubChunk2Size);

                // ReadBytes may return fewer bytes than requested and the
                // declared size may be odd, so count whole samples actually read.
                int sampleCount = buffer.Length / sizeof(short);
                short[] samples_int16 = new short[sampleCount];
                Buffer.BlockCopy(buffer, 0, samples_int16, 0, sampleCount * sizeof(short));

                _samples = new float[sampleCount];

                for (var i = 0; i < sampleCount; ++i)
                {
                    _samples[i] = samples_int16[i] / 32768.0F;
                }
            }
        }
    }

    // Reads the fixed-size header from reader and reinterprets the raw bytes
    // as a WaveHeader.
    private static WaveHeader ReadHeader(BinaryReader reader)
    {
        int headerSize = Marshal.SizeOf(typeof(WaveHeader));
        byte[] bytes = reader.ReadBytes(headerSize);

        // A short read would make PtrToStructure read past the end of the array.
        if (bytes.Length < headerSize)
        {
            throw new ApplicationException(
                $"Wave header is truncated: expected {headerSize} bytes, got {bytes.Length}");
        }

        GCHandle handle = GCHandle.Alloc(bytes, GCHandleType.Pinned);
        try
        {
            return (WaveHeader)Marshal.PtrToStructure(handle.AddrOfPinnedObject(), typeof(WaveHeader))!;
        }
        finally
        {
            // Always release the pin, even if marshalling throws.
            handle.Free();
        }
    }

    // Advances reader past any chunks that precede the "data" chunk and
    // records the data chunk's ID and size in _header.
    private void SkipMetaData(BinaryReader reader)
    {
        var bs = reader.BaseStream;

        Int32 subChunk2ID = _header.SubChunk2ID;
        Int32 subChunk2Size = _header.SubChunk2Size;

        // a t a d
        while (subChunk2ID != 0x61746164)
        {
            // RIFF chunks are word aligned: an odd-sized chunk is followed by
            // one pad byte that is not counted in its size field.
            long bytesToSkip = (long)subChunk2Size + (subChunk2Size & 1);

            // 8 = 4-byte chunk ID + 4-byte chunk size of the next chunk.
            if (subChunk2Size < 0 || bs.Length - bs.Position < bytesToSkip + 8)
            {
                throw new ApplicationException("Invalid wave file: no data chunk found");
            }

            bs.Seek(bytesToSkip, SeekOrigin.Current);
            subChunk2ID = reader.ReadInt32();
            subChunk2Size = reader.ReadInt32();
        }

        _header.SubChunk2ID = subChunk2ID;
        _header.SubChunk2Size = subChunk2Size;
    }

    private WaveHeader _header;

    // Each int16 sample divided by 32768
    private float[] _samples;

    public int SampleRate => _header.SampleRate;
    public float[] Samples => _samples;

    // Loads fileName and prints its sample count and sample rate.
    public static void Test(String fileName)
    {
        WaveReader reader = new WaveReader(fileName);
        Console.WriteLine($"samples length: {reader.Samples.Length}");
        Console.WriteLine($"samples rate: {reader.SampleRate}");
    }
}
|
||||
|
||||
}
|
||||
|
||||
@@ -13,8 +13,6 @@
|
||||
// dotnet run
|
||||
|
||||
using SherpaOnnx;
|
||||
using System.Collections.Generic;
|
||||
using System;
|
||||
|
||||
class KeywordSpotterDemo
|
||||
{
|
||||
@@ -38,11 +36,11 @@ class KeywordSpotterDemo
|
||||
|
||||
var filename = "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav";
|
||||
|
||||
WaveReader waveReader = new WaveReader(filename);
|
||||
var waveReader = new WaveReader(filename);
|
||||
|
||||
Console.WriteLine("----------Use pre-defined keywords----------");
|
||||
|
||||
OnlineStream s = kws.CreateStream();
|
||||
var s = kws.CreateStream();
|
||||
s.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);
|
||||
|
||||
float[] tailPadding = new float[(int)(waveReader.SampleRate * 0.3)];
|
||||
@@ -53,7 +51,7 @@ class KeywordSpotterDemo
|
||||
{
|
||||
kws.Decode(s);
|
||||
var result = kws.GetResult(s);
|
||||
if (result.Keyword != "")
|
||||
if (result.Keyword != string.Empty)
|
||||
{
|
||||
Console.WriteLine("Detected: {0}", result.Keyword);
|
||||
}
|
||||
@@ -70,7 +68,7 @@ class KeywordSpotterDemo
|
||||
{
|
||||
kws.Decode(s);
|
||||
var result = kws.GetResult(s);
|
||||
if (result.Keyword != "")
|
||||
if (result.Keyword != string.Empty)
|
||||
{
|
||||
Console.WriteLine("Detected: {0}", result.Keyword);
|
||||
}
|
||||
@@ -89,7 +87,7 @@ class KeywordSpotterDemo
|
||||
{
|
||||
kws.Decode(s);
|
||||
var result = kws.GetResult(s);
|
||||
if (result.Keyword != "")
|
||||
if (result.Keyword != string.Empty)
|
||||
{
|
||||
Console.WriteLine("Detected: {0}", result.Keyword);
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<RootNamespace>keyword_spotting_from_files</RootNamespace>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
|
||||
@@ -12,12 +12,9 @@
|
||||
//
|
||||
// dotnet run
|
||||
|
||||
using SherpaOnnx;
|
||||
using System.Collections.Generic;
|
||||
using System.Runtime.InteropServices;
|
||||
using System;
|
||||
|
||||
using PortAudioSharp;
|
||||
using SherpaOnnx;
|
||||
using System.Runtime.InteropServices;
|
||||
|
||||
class KeywordSpotterDemo
|
||||
{
|
||||
@@ -41,11 +38,11 @@ class KeywordSpotterDemo
|
||||
|
||||
var filename = "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav";
|
||||
|
||||
WaveReader waveReader = new WaveReader(filename);
|
||||
var waveReader = new WaveReader(filename);
|
||||
|
||||
Console.WriteLine("----------Use pre-defined keywords----------");
|
||||
|
||||
OnlineStream s = kws.CreateStream();
|
||||
var s = kws.CreateStream();
|
||||
|
||||
Console.WriteLine(PortAudio.VersionInfo.versionText);
|
||||
PortAudio.Initialize();
|
||||
@@ -54,7 +51,7 @@ class KeywordSpotterDemo
|
||||
for (int i = 0; i != PortAudio.DeviceCount; ++i)
|
||||
{
|
||||
Console.WriteLine($" Device {i}");
|
||||
DeviceInfo deviceInfo = PortAudio.GetDeviceInfo(i);
|
||||
var deviceInfo = PortAudio.GetDeviceInfo(i);
|
||||
Console.WriteLine($" Name: {deviceInfo.name}");
|
||||
Console.WriteLine($" Max input channels: {deviceInfo.maxInputChannels}");
|
||||
Console.WriteLine($" Default sample rate: {deviceInfo.defaultSampleRate}");
|
||||
@@ -66,12 +63,12 @@ class KeywordSpotterDemo
|
||||
Environment.Exit(1);
|
||||
}
|
||||
|
||||
DeviceInfo info = PortAudio.GetDeviceInfo(deviceIndex);
|
||||
var info = PortAudio.GetDeviceInfo(deviceIndex);
|
||||
|
||||
Console.WriteLine();
|
||||
Console.WriteLine($"Use default device {deviceIndex} ({info.name})");
|
||||
|
||||
StreamParameters param = new StreamParameters();
|
||||
var param = new StreamParameters();
|
||||
param.device = deviceIndex;
|
||||
param.channelCount = 1;
|
||||
param.sampleFormat = SampleFormat.Float32;
|
||||
@@ -79,21 +76,21 @@ class KeywordSpotterDemo
|
||||
param.hostApiSpecificStreamInfo = IntPtr.Zero;
|
||||
|
||||
PortAudioSharp.Stream.Callback callback = (IntPtr input, IntPtr output,
|
||||
UInt32 frameCount,
|
||||
uint frameCount,
|
||||
ref StreamCallbackTimeInfo timeInfo,
|
||||
StreamCallbackFlags statusFlags,
|
||||
IntPtr userData
|
||||
) =>
|
||||
{
|
||||
float[] samples = new float[frameCount];
|
||||
Marshal.Copy(input, samples, 0, (Int32)frameCount);
|
||||
var samples = new float[frameCount];
|
||||
Marshal.Copy(input, samples, 0, (int)frameCount);
|
||||
|
||||
s.AcceptWaveform(config.FeatConfig.SampleRate, samples);
|
||||
|
||||
return StreamCallbackResult.Continue;
|
||||
};
|
||||
|
||||
PortAudioSharp.Stream stream = new PortAudioSharp.Stream(inParams: param, outParams: null, sampleRate: config.FeatConfig.SampleRate,
|
||||
var stream = new PortAudioSharp.Stream(inParams: param, outParams: null, sampleRate: config.FeatConfig.SampleRate,
|
||||
framesPerBuffer: 0,
|
||||
streamFlags: StreamFlags.ClipOff,
|
||||
callback: callback,
|
||||
@@ -113,15 +110,13 @@ class KeywordSpotterDemo
|
||||
}
|
||||
|
||||
var result = kws.GetResult(s);
|
||||
if (result.Keyword != "")
|
||||
if (result.Keyword != string.Empty)
|
||||
{
|
||||
Console.WriteLine("Detected: {0}", result.Keyword);
|
||||
}
|
||||
|
||||
Thread.Sleep(200); // ms
|
||||
}
|
||||
|
||||
PortAudio.Terminate();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<RootNamespace>keyword_spotting_from_microphone</RootNamespace>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
|
||||
@@ -5,17 +5,14 @@
|
||||
// Please refer to
|
||||
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
|
||||
// to download non-streaming models
|
||||
using CommandLine.Text;
|
||||
using CommandLine;
|
||||
using CommandLine.Text;
|
||||
using SherpaOnnx;
|
||||
using System.Collections.Generic;
|
||||
using System;
|
||||
|
||||
class OfflineDecodeFiles
|
||||
{
|
||||
class Options
|
||||
{
|
||||
|
||||
[Option("sample-rate", Required = false, Default = 16000, HelpText = "Sample rate of the data used to train the model")]
|
||||
public int SampleRate { get; set; } = 16000;
|
||||
|
||||
@@ -23,58 +20,58 @@ class OfflineDecodeFiles
|
||||
public int FeatureDim { get; set; } = 80;
|
||||
|
||||
[Option(Required = false, HelpText = "Path to tokens.txt")]
|
||||
public string Tokens { get; set; } = "";
|
||||
public string Tokens { get; set; } = string.Empty;
|
||||
|
||||
[Option(Required = false, Default = "", HelpText = "Path to transducer encoder.onnx. Used only for transducer models")]
|
||||
public string Encoder { get; set; } = "";
|
||||
public string Encoder { get; set; } = string.Empty;
|
||||
|
||||
[Option(Required = false, Default = "", HelpText = "Path to transducer decoder.onnx. Used only for transducer models")]
|
||||
public string Decoder { get; set; } = "";
|
||||
public string Decoder { get; set; } = string.Empty;
|
||||
|
||||
[Option(Required = false, Default = "", HelpText = "Path to transducer joiner.onnx. Used only for transducer models")]
|
||||
public string Joiner { get; set; } = "";
|
||||
public string Joiner { get; set; } = string.Empty;
|
||||
|
||||
[Option("model-type", Required = false, Default = "", HelpText = "model type")]
|
||||
public string ModelType { get; set; } = "";
|
||||
public string ModelType { get; set; } = string.Empty;
|
||||
|
||||
[Option("whisper-encoder", Required = false, Default = "", HelpText = "Path to whisper encoder.onnx. Used only for whisper models")]
|
||||
public string WhisperEncoder { get; set; } = "";
|
||||
public string WhisperEncoder { get; set; } = string.Empty;
|
||||
|
||||
[Option("whisper-decoder", Required = false, Default = "", HelpText = "Path to whisper decoder.onnx. Used only for whisper models")]
|
||||
public string WhisperDecoder { get; set; } = "";
|
||||
public string WhisperDecoder { get; set; } = string.Empty;
|
||||
|
||||
[Option("whisper-language", Required = false, Default = "", HelpText = "Language of the input file. Can be empty")]
|
||||
public string WhisperLanguage { get; set; } = "";
|
||||
public string WhisperLanguage { get; set; } = string.Empty;
|
||||
|
||||
[Option("whisper-task", Required = false, Default = "transcribe", HelpText = "transcribe or translate")]
|
||||
public string WhisperTask { get; set; } = "transcribe";
|
||||
|
||||
[Option("moonshine-preprocessor", Required = false, Default = "", HelpText = "Path to preprocess.onnx. Used only for Moonshine models")]
|
||||
public string MoonshinePreprocessor { get; set; } = "";
|
||||
public string MoonshinePreprocessor { get; set; } = string.Empty;
|
||||
|
||||
[Option("moonshine-encoder", Required = false, Default = "", HelpText = "Path to encode.onnx. Used only for Moonshine models")]
|
||||
public string MoonshineEncoder { get; set; } = "";
|
||||
public string MoonshineEncoder { get; set; } = string.Empty;
|
||||
|
||||
[Option("moonshine-uncached-decoder", Required = false, Default = "", HelpText = "Path to uncached_decode.onnx. Used only for Moonshine models")]
|
||||
public string MoonshineUncachedDecoder { get; set; } = "";
|
||||
public string MoonshineUncachedDecoder { get; set; } = string.Empty;
|
||||
|
||||
[Option("moonshine-cached-decoder", Required = false, Default = "", HelpText = "Path to cached_decode.onnx. Used only for Moonshine models")]
|
||||
public string MoonshineCachedDecoder { get; set; } = "";
|
||||
public string MoonshineCachedDecoder { get; set; } = string.Empty;
|
||||
|
||||
[Option("tdnn-model", Required = false, Default = "", HelpText = "Path to tdnn yesno model")]
|
||||
public string TdnnModel { get; set; } = "";
|
||||
public string TdnnModel { get; set; } = string.Empty;
|
||||
|
||||
[Option(Required = false, HelpText = "Path to model.onnx. Used only for paraformer models")]
|
||||
public string Paraformer { get; set; } = "";
|
||||
public string Paraformer { get; set; } = string.Empty;
|
||||
|
||||
[Option("nemo-ctc", Required = false, HelpText = "Path to model.onnx. Used only for NeMo CTC models")]
|
||||
public string NeMoCtc { get; set; } = "";
|
||||
public string NeMoCtc { get; set; } = string.Empty;
|
||||
|
||||
[Option("telespeech-ctc", Required = false, HelpText = "Path to model.onnx. Used only for TeleSpeech CTC models")]
|
||||
public string TeleSpeechCtc { get; set; } = "";
|
||||
public string TeleSpeechCtc { get; set; } = string.Empty;
|
||||
|
||||
[Option("sense-voice-model", Required = false, HelpText = "Path to model.onnx. Used only for SenseVoice CTC models")]
|
||||
public string SenseVoiceModel { get; set; } = "";
|
||||
public string SenseVoiceModel { get; set; } = string.Empty;
|
||||
|
||||
[Option("sense-voice-use-itn", Required = false, HelpText = "1 to use inverse text normalization for sense voice.")]
|
||||
public int SenseVoiceUseItn { get; set; } = 1;
|
||||
@@ -88,7 +85,7 @@ class OfflineDecodeFiles
|
||||
|
||||
[Option("rule-fsts", Required = false, Default = "",
|
||||
HelpText = "If not empty, path to rule fst for inverse text normalization")]
|
||||
public string RuleFsts { get; set; } = "";
|
||||
public string RuleFsts { get; set; } = string.Empty;
|
||||
|
||||
[Option("max-active-paths", Required = false, Default = 4,
|
||||
HelpText = @"Used only when --decoding--method is modified_beam_search.
|
||||
@@ -96,7 +93,7 @@ It specifies number of active paths to keep during the search")]
|
||||
public int MaxActivePaths { get; set; } = 4;
|
||||
|
||||
[Option("hotwords-file", Required = false, Default = "", HelpText = "Path to hotwords.txt")]
|
||||
public string HotwordsFile { get; set; } = "";
|
||||
public string HotwordsFile { get; set; } = string.Empty;
|
||||
|
||||
[Option("hotwords-score", Required = false, Default = 1.5F, HelpText = "hotwords score")]
|
||||
public float HotwordsScore { get; set; } = 1.5F;
|
||||
@@ -117,7 +114,7 @@ It specifies number of active paths to keep during the search")]
|
||||
|
||||
private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
|
||||
{
|
||||
string usage = @"
|
||||
var usage = @"
|
||||
# Zipformer
|
||||
|
||||
dotnet run \
|
||||
@@ -213,42 +210,42 @@ to download pre-trained Tdnn models.
|
||||
|
||||
config.ModelConfig.Tokens = options.Tokens;
|
||||
|
||||
if (!String.IsNullOrEmpty(options.Encoder))
|
||||
if (!string.IsNullOrEmpty(options.Encoder))
|
||||
{
|
||||
// this is a transducer model
|
||||
config.ModelConfig.Transducer.Encoder = options.Encoder;
|
||||
config.ModelConfig.Transducer.Decoder = options.Decoder;
|
||||
config.ModelConfig.Transducer.Joiner = options.Joiner;
|
||||
}
|
||||
else if (!String.IsNullOrEmpty(options.Paraformer))
|
||||
else if (!string.IsNullOrEmpty(options.Paraformer))
|
||||
{
|
||||
config.ModelConfig.Paraformer.Model = options.Paraformer;
|
||||
}
|
||||
else if (!String.IsNullOrEmpty(options.NeMoCtc))
|
||||
else if (!string.IsNullOrEmpty(options.NeMoCtc))
|
||||
{
|
||||
config.ModelConfig.NeMoCtc.Model = options.NeMoCtc;
|
||||
}
|
||||
else if (!String.IsNullOrEmpty(options.TeleSpeechCtc))
|
||||
else if (!string.IsNullOrEmpty(options.TeleSpeechCtc))
|
||||
{
|
||||
config.ModelConfig.TeleSpeechCtc = options.TeleSpeechCtc;
|
||||
}
|
||||
else if (!String.IsNullOrEmpty(options.WhisperEncoder))
|
||||
else if (!string.IsNullOrEmpty(options.WhisperEncoder))
|
||||
{
|
||||
config.ModelConfig.Whisper.Encoder = options.WhisperEncoder;
|
||||
config.ModelConfig.Whisper.Decoder = options.WhisperDecoder;
|
||||
config.ModelConfig.Whisper.Language = options.WhisperLanguage;
|
||||
config.ModelConfig.Whisper.Task = options.WhisperTask;
|
||||
}
|
||||
else if (!String.IsNullOrEmpty(options.TdnnModel))
|
||||
else if (!string.IsNullOrEmpty(options.TdnnModel))
|
||||
{
|
||||
config.ModelConfig.Tdnn.Model = options.TdnnModel;
|
||||
}
|
||||
else if (!String.IsNullOrEmpty(options.SenseVoiceModel))
|
||||
else if (!string.IsNullOrEmpty(options.SenseVoiceModel))
|
||||
{
|
||||
config.ModelConfig.SenseVoice.Model = options.SenseVoiceModel;
|
||||
config.ModelConfig.SenseVoice.UseInverseTextNormalization = options.SenseVoiceUseItn;
|
||||
}
|
||||
else if (!String.IsNullOrEmpty(options.MoonshinePreprocessor))
|
||||
else if (!string.IsNullOrEmpty(options.MoonshinePreprocessor))
|
||||
{
|
||||
config.ModelConfig.Moonshine.Preprocessor = options.MoonshinePreprocessor;
|
||||
config.ModelConfig.Moonshine.Encoder = options.MoonshineEncoder;
|
||||
@@ -270,17 +267,17 @@ to download pre-trained Tdnn models.
|
||||
|
||||
config.ModelConfig.Debug = 0;
|
||||
|
||||
OfflineRecognizer recognizer = new OfflineRecognizer(config);
|
||||
var recognizer = new OfflineRecognizer(config);
|
||||
|
||||
string[] files = options.Files.ToArray();
|
||||
var files = options.Files.ToArray();
|
||||
|
||||
// We create a separate stream for each file
|
||||
List<OfflineStream> streams = new List<OfflineStream>();
|
||||
var streams = new List<OfflineStream>();
|
||||
streams.EnsureCapacity(files.Length);
|
||||
|
||||
for (int i = 0; i != files.Length; ++i)
|
||||
{
|
||||
OfflineStream s = recognizer.CreateStream();
|
||||
var s = recognizer.CreateStream();
|
||||
|
||||
WaveReader waveReader = new WaveReader(files[i]);
|
||||
s.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);
|
||||
@@ -299,7 +296,7 @@ to download pre-trained Tdnn models.
|
||||
Console.WriteLine("Tokens: [{0}]", string.Join(", ", r.Tokens));
|
||||
if (r.Timestamps != null && r.Timestamps.Length > 0) {
|
||||
Console.Write("Timestamps: [");
|
||||
var sep = "";
|
||||
var sep = string.Empty;
|
||||
for (int k = 0; k != r.Timestamps.Length; ++k)
|
||||
{
|
||||
Console.Write("{0}{1}", sep, r.Timestamps[k].ToString("0.00"));
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<RootNamespace>offline_decode_files</RootNamespace>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
|
||||
@@ -12,8 +12,6 @@
|
||||
// dotnet run
|
||||
|
||||
using SherpaOnnx;
|
||||
using System.Collections.Generic;
|
||||
using System;
|
||||
|
||||
class OfflinePunctuationDemo
|
||||
{
|
||||
@@ -25,14 +23,14 @@ class OfflinePunctuationDemo
|
||||
config.Model.NumThreads = 1;
|
||||
var punct = new OfflinePunctuation(config);
|
||||
|
||||
string[] textList = new string[] {
|
||||
var textList = new string[] {
|
||||
"这是一个测试你好吗How are you我很好thank you are you ok谢谢你",
|
||||
"我们都是木头人不会说话不会动",
|
||||
"The African blogosphere is rapidly expanding bringing more voices online in the form of commentaries opinions analyses rants and poetry",
|
||||
};
|
||||
|
||||
Console.WriteLine("---------");
|
||||
foreach (string text in textList)
|
||||
foreach (var text in textList)
|
||||
{
|
||||
string textWithPunct = punct.AddPunct(text);
|
||||
Console.WriteLine("Input text: {0}", text);
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<RootNamespace>offline_punctuation</RootNamespace>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
|
||||
@@ -34,7 +34,6 @@ Step 4. Run it
|
||||
*/
|
||||
|
||||
using SherpaOnnx;
|
||||
using System;
|
||||
|
||||
class OfflineSpeakerDiarizationDemo
|
||||
{
|
||||
@@ -54,7 +53,7 @@ class OfflineSpeakerDiarizationDemo
|
||||
var sd = new OfflineSpeakerDiarization(config);
|
||||
|
||||
var testWaveFile = "./0-four-speakers-zh.wav";
|
||||
WaveReader waveReader = new WaveReader(testWaveFile);
|
||||
var waveReader = new WaveReader(testWaveFile);
|
||||
if (sd.SampleRate != waveReader.SampleRate)
|
||||
{
|
||||
Console.WriteLine($"Expected sample rate: {sd.SampleRate}. Given: {waveReader.SampleRate}");
|
||||
@@ -65,19 +64,19 @@ class OfflineSpeakerDiarizationDemo
|
||||
|
||||
// var segments = sd.Process(waveReader.Samples); // this one is also ok
|
||||
|
||||
var MyProgressCallback = (int numProcessedChunks, int numTotalChunks, IntPtr arg) =>
|
||||
var progressCallback = (int numProcessedChunks, int numTotalChunks, IntPtr arg) =>
|
||||
{
|
||||
float progress = 100.0F * numProcessedChunks / numTotalChunks;
|
||||
Console.WriteLine("Progress {0}%", String.Format("{0:0.00}", progress));
|
||||
var progress = 100.0F * numProcessedChunks / numTotalChunks;
|
||||
Console.WriteLine("Progress {0}%", string.Format("{0:0.00}", progress));
|
||||
return 0;
|
||||
};
|
||||
|
||||
var callback = new OfflineSpeakerDiarizationProgressCallback(MyProgressCallback);
|
||||
var callback = new OfflineSpeakerDiarizationProgressCallback(progressCallback);
|
||||
var segments = sd.ProcessWithCallback(waveReader.Samples, callback, IntPtr.Zero);
|
||||
|
||||
foreach (var s in segments)
|
||||
{
|
||||
Console.WriteLine("{0} -- {1} speaker_{2}", String.Format("{0:0.00}", s.Start), String.Format("{0:0.00}", s.End), s.Speaker);
|
||||
Console.WriteLine("{0} -- {1} speaker_{2}", string.Format("{0:0.00}", s.Start), string.Format("{0:0.00}", s.End), s.Speaker);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<RootNamespace>offline_speaker_diarization</RootNamespace>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
|
||||
@@ -10,15 +10,12 @@
|
||||
// Note that you need a speaker to run this file since it will play
|
||||
// the generated audio as it is generating.
|
||||
|
||||
using CommandLine.Text;
|
||||
using CommandLine;
|
||||
using CommandLine.Text;
|
||||
using PortAudioSharp;
|
||||
using SherpaOnnx;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Generic;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Threading;
|
||||
using System;
|
||||
|
||||
class OfflineTtsPlayDemo
|
||||
{
|
||||
@@ -26,13 +23,13 @@ class OfflineTtsPlayDemo
|
||||
{
|
||||
|
||||
[Option("tts-rule-fsts", Required = false, Default = "", HelpText = "path to rule.fst")]
|
||||
public string RuleFsts { get; set; }
|
||||
public string? RuleFsts { get; set; }
|
||||
|
||||
[Option("vits-dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]
|
||||
public string DictDir { get; set; }
|
||||
public string? DictDir { get; set; }
|
||||
|
||||
[Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
|
||||
public string DataDir { get; set; }
|
||||
public string? DataDir { get; set; }
|
||||
|
||||
[Option("vits-length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
|
||||
public float LengthScale { get; set; }
|
||||
@@ -44,10 +41,10 @@ class OfflineTtsPlayDemo
|
||||
public float NoiseScaleW { get; set; }
|
||||
|
||||
[Option("vits-lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
|
||||
public string Lexicon { get; set; }
|
||||
public string? Lexicon { get; set; }
|
||||
|
||||
[Option("vits-tokens", Required = false, Default = "", HelpText = "Path to tokens.txt")]
|
||||
public string Tokens { get; set; }
|
||||
public string? Tokens { get; set; }
|
||||
|
||||
[Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")]
|
||||
public int MaxNumSentences { get; set; }
|
||||
@@ -56,16 +53,16 @@ class OfflineTtsPlayDemo
|
||||
public int Debug { get; set; }
|
||||
|
||||
[Option("vits-model", Required = true, HelpText = "Path to VITS model")]
|
||||
public string Model { get; set; }
|
||||
public string? Model { get; set; }
|
||||
|
||||
[Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")]
|
||||
public int SpeakerId { get; set; }
|
||||
|
||||
[Option("text", Required = true, HelpText = "Text to synthesize")]
|
||||
public string Text { get; set; }
|
||||
public string? Text { get; set; }
|
||||
|
||||
[Option("output-filename", Required = true, Default = "./generated.wav", HelpText = "Path to save the generated audio")]
|
||||
public string OutputFilename { get; set; }
|
||||
public string? OutputFilename { get; set; }
|
||||
}
|
||||
|
||||
static void Main(string[] args)
|
||||
@@ -124,10 +121,9 @@ to download more models.
|
||||
Console.WriteLine(helpText);
|
||||
}
|
||||
|
||||
|
||||
private static void Run(Options options)
|
||||
{
|
||||
OfflineTtsConfig config = new OfflineTtsConfig();
|
||||
var config = new OfflineTtsConfig();
|
||||
config.Model.Vits.Model = options.Model;
|
||||
config.Model.Vits.Lexicon = options.Lexicon;
|
||||
config.Model.Vits.Tokens = options.Tokens;
|
||||
@@ -142,10 +138,9 @@ to download more models.
|
||||
config.RuleFsts = options.RuleFsts;
|
||||
config.MaxNumSentences = options.MaxNumSentences;
|
||||
|
||||
OfflineTts tts = new OfflineTts(config);
|
||||
float speed = 1.0f / options.LengthScale;
|
||||
int sid = options.SpeakerId;
|
||||
|
||||
var tts = new OfflineTts(config);
|
||||
var speed = 1.0f / options.LengthScale;
|
||||
var sid = options.SpeakerId;
|
||||
|
||||
Console.WriteLine(PortAudio.VersionInfo.versionText);
|
||||
PortAudio.Initialize();
|
||||
@@ -166,11 +161,11 @@ to download more models.
|
||||
Environment.Exit(1);
|
||||
}
|
||||
|
||||
DeviceInfo info = PortAudio.GetDeviceInfo(deviceIndex);
|
||||
var info = PortAudio.GetDeviceInfo(deviceIndex);
|
||||
Console.WriteLine();
|
||||
Console.WriteLine($"Use output default device {deviceIndex} ({info.name})");
|
||||
|
||||
StreamParameters param = new StreamParameters();
|
||||
var param = new StreamParameters();
|
||||
param.device = deviceIndex;
|
||||
param.channelCount = 1;
|
||||
param.sampleFormat = SampleFormat.Float32;
|
||||
@@ -178,7 +173,7 @@ to download more models.
|
||||
param.hostApiSpecificStreamInfo = IntPtr.Zero;
|
||||
|
||||
// https://learn.microsoft.com/en-us/dotnet/standard/collections/thread-safe/blockingcollection-overview
|
||||
BlockingCollection<float[]> dataItems = new BlockingCollection<float[]>();
|
||||
var dataItems = new BlockingCollection<float[]>();
|
||||
|
||||
var MyCallback = (IntPtr samples, int n) =>
|
||||
{
|
||||
@@ -193,9 +188,9 @@ to download more models.
|
||||
return 1;
|
||||
};
|
||||
|
||||
bool playFinished = false;
|
||||
var playFinished = false;
|
||||
|
||||
float[] lastSampleArray = null;
|
||||
float[]? lastSampleArray = null;
|
||||
int lastIndex = 0; // not played
|
||||
|
||||
PortAudioSharp.Stream.Callback playCallback = (IntPtr input, IntPtr output,
|
||||
@@ -270,10 +265,10 @@ to download more models.
|
||||
|
||||
stream.Start();
|
||||
|
||||
OfflineTtsCallback callback = new OfflineTtsCallback(MyCallback);
|
||||
var callback = new OfflineTtsCallback(MyCallback);
|
||||
|
||||
OfflineTtsGeneratedAudio audio = tts.GenerateWithCallback(options.Text, speed, sid, callback);
|
||||
bool ok = audio.SaveToWaveFile(options.OutputFilename);
|
||||
var audio = tts.GenerateWithCallback(options.Text, speed, sid, callback);
|
||||
var ok = audio.SaveToWaveFile(options.OutputFilename);
|
||||
|
||||
if (ok)
|
||||
{
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<RootNamespace>offline_tts_play</RootNamespace>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
|
||||
@@ -6,28 +6,25 @@
|
||||
// and
|
||||
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
|
||||
// to download pre-trained models
|
||||
using CommandLine.Text;
|
||||
using CommandLine;
|
||||
using CommandLine.Text;
|
||||
using SherpaOnnx;
|
||||
using System.Collections.Generic;
|
||||
using System;
|
||||
|
||||
class OfflineTtsDemo
|
||||
{
|
||||
class Options
|
||||
{
|
||||
|
||||
[Option("tts-rule-fsts", Required = false, Default = "", HelpText = "path to rule.fst")]
|
||||
public string RuleFsts { get; set; } = "";
|
||||
public string RuleFsts { get; set; } = string.Empty;
|
||||
|
||||
[Option("tts-rule-fars", Required = false, Default = "", HelpText = "path to rule.far")]
|
||||
public string RuleFars { get; set; } = "";
|
||||
public string RuleFars { get; set; } = string.Empty;
|
||||
|
||||
[Option("vits-dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]
|
||||
public string DictDir { get; set; } = "";
|
||||
public string DictDir { get; set; } = string.Empty;
|
||||
|
||||
[Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
|
||||
public string DataDir { get; set; } = "";
|
||||
public string DataDir { get; set; } = string.Empty;
|
||||
|
||||
[Option("vits-length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
|
||||
public float LengthScale { get; set; } = 1;
|
||||
@@ -39,10 +36,10 @@ class OfflineTtsDemo
|
||||
public float NoiseScaleW { get; set; } = 0.8F;
|
||||
|
||||
[Option("vits-lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
|
||||
public string Lexicon { get; set; } = "";
|
||||
public string Lexicon { get; set; } = string.Empty;
|
||||
|
||||
[Option("vits-tokens", Required = false, Default = "", HelpText = "Path to tokens.txt")]
|
||||
public string Tokens { get; set; } = "";
|
||||
public string Tokens { get; set; } = string.Empty;
|
||||
|
||||
[Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")]
|
||||
public int MaxNumSentences { get; set; } = 1;
|
||||
@@ -51,13 +48,13 @@ class OfflineTtsDemo
|
||||
public int Debug { get; set; } = 0;
|
||||
|
||||
[Option("vits-model", Required = true, HelpText = "Path to VITS model")]
|
||||
public string Model { get; set; } = "";
|
||||
public string Model { get; set; } = string.Empty;
|
||||
|
||||
[Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")]
|
||||
public int SpeakerId { get; set; } = 0;
|
||||
|
||||
[Option("text", Required = true, HelpText = "Text to synthesize")]
|
||||
public string Text { get; set; } = "";
|
||||
public string Text { get; set; } = string.Empty;
|
||||
|
||||
[Option("output-filename", Required = true, Default = "./generated.wav", HelpText = "Path to save the generated audio")]
|
||||
public string OutputFilename { get; set; } = "./generated.wav";
|
||||
@@ -65,7 +62,7 @@ class OfflineTtsDemo
|
||||
|
||||
static void Main(string[] args)
|
||||
{
|
||||
var parser = new CommandLine.Parser(with => with.HelpWriter = null);
|
||||
var parser = new Parser(with => with.HelpWriter = null);
|
||||
var parserResult = parser.ParseArguments<Options>(args);
|
||||
|
||||
parserResult
|
||||
@@ -75,7 +72,7 @@ class OfflineTtsDemo
|
||||
|
||||
private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
|
||||
{
|
||||
string usage = @"
|
||||
var usage = @"
|
||||
# vits-aishell3
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
||||
@@ -122,7 +119,7 @@ to download more models.
|
||||
|
||||
private static void Run(Options options)
|
||||
{
|
||||
OfflineTtsConfig config = new OfflineTtsConfig();
|
||||
var config = new OfflineTtsConfig();
|
||||
config.Model.Vits.Model = options.Model;
|
||||
config.Model.Vits.Lexicon = options.Lexicon;
|
||||
config.Model.Vits.Tokens = options.Tokens;
|
||||
@@ -138,11 +135,11 @@ to download more models.
|
||||
config.RuleFars = options.RuleFars;
|
||||
config.MaxNumSentences = options.MaxNumSentences;
|
||||
|
||||
OfflineTts tts = new OfflineTts(config);
|
||||
float speed = 1.0f / options.LengthScale;
|
||||
int sid = options.SpeakerId;
|
||||
OfflineTtsGeneratedAudio audio = tts.Generate(options.Text, speed, sid);
|
||||
bool ok = audio.SaveToWaveFile(options.OutputFilename);
|
||||
var tts = new OfflineTts(config);
|
||||
var speed = 1.0f / options.LengthScale;
|
||||
var sid = options.SpeakerId;
|
||||
var audio = tts.Generate(options.Text, speed, sid);
|
||||
var ok = audio.SaveToWaveFile(options.OutputFilename);
|
||||
|
||||
if (ok)
|
||||
{
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<RootNamespace>offline_tts</RootNamespace>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
|
||||
@@ -6,40 +6,37 @@
|
||||
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html
|
||||
// to download streaming models
|
||||
|
||||
using CommandLine.Text;
|
||||
using CommandLine;
|
||||
using CommandLine.Text;
|
||||
using SherpaOnnx;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System;
|
||||
|
||||
class OnlineDecodeFiles
|
||||
{
|
||||
class Options
|
||||
{
|
||||
[Option(Required = true, HelpText = "Path to tokens.txt")]
|
||||
public string Tokens { get; set; } = "";
|
||||
public string Tokens { get; set; } = string.Empty;
|
||||
|
||||
[Option(Required = false, Default = "cpu", HelpText = "Provider, e.g., cpu, coreml")]
|
||||
public string Provider { get; set; } = "";
|
||||
public string Provider { get; set; } = string.Empty;
|
||||
|
||||
[Option(Required = false, HelpText = "Path to transducer encoder.onnx")]
|
||||
public string Encoder { get; set; } = "";
|
||||
public string Encoder { get; set; } = string.Empty;
|
||||
|
||||
[Option(Required = false, HelpText = "Path to transducer decoder.onnx")]
|
||||
public string Decoder { get; set; } = "";
|
||||
public string Decoder { get; set; } = string.Empty;
|
||||
|
||||
[Option(Required = false, HelpText = "Path to transducer joiner.onnx")]
|
||||
public string Joiner { get; set; } = "";
|
||||
public string Joiner { get; set; } = string.Empty;
|
||||
|
||||
[Option("paraformer-encoder", Required = false, HelpText = "Path to paraformer encoder.onnx")]
|
||||
public string ParaformerEncoder { get; set; } = "";
|
||||
public string ParaformerEncoder { get; set; } = string.Empty;
|
||||
|
||||
[Option("paraformer-decoder", Required = false, HelpText = "Path to paraformer decoder.onnx")]
|
||||
public string ParaformerDecoder { get; set; } = "";
|
||||
public string ParaformerDecoder { get; set; } = string.Empty;
|
||||
|
||||
[Option("zipformer2-ctc", Required = false, HelpText = "Path to zipformer2 CTC onnx model")]
|
||||
public string Zipformer2Ctc { get; set; } = "";
|
||||
public string Zipformer2Ctc { get; set; } = string.Empty;
|
||||
|
||||
[Option("num-threads", Required = false, Default = 1, HelpText = "Number of threads for computation")]
|
||||
public int NumThreads { get; set; } = 1;
|
||||
@@ -80,15 +77,14 @@ larger than this value. Used only when --enable-endpoint is true.")]
|
||||
public float Rule3MinUtteranceLength { get; set; } = 20.0F;
|
||||
|
||||
[Option("hotwords-file", Required = false, Default = "", HelpText = "Path to hotwords.txt")]
|
||||
public string HotwordsFile { get; set; } = "";
|
||||
public string HotwordsFile { get; set; } = string.Empty;
|
||||
|
||||
[Option("hotwords-score", Required = false, Default = 1.5F, HelpText = "hotwords score")]
|
||||
public float HotwordsScore { get; set; } = 1.5F;
|
||||
|
||||
[Option("rule-fsts", Required = false, Default = "",
|
||||
HelpText = "If not empty, path to rule fst for inverse text normalization")]
|
||||
public string RuleFsts { get; set; } = "";
|
||||
|
||||
public string RuleFsts { get; set; } = string.Empty;
|
||||
|
||||
[Option("files", Required = true, HelpText = "Audio files for decoding")]
|
||||
public IEnumerable<string> Files { get; set; } = new string[] {};
|
||||
@@ -162,7 +158,7 @@ to download pre-trained streaming models.
|
||||
|
||||
private static void Run(Options options)
|
||||
{
|
||||
OnlineRecognizerConfig config = new OnlineRecognizerConfig();
|
||||
var config = new OnlineRecognizerConfig();
|
||||
config.FeatConfig.SampleRate = options.SampleRate;
|
||||
|
||||
// All models from icefall using feature dim 80.
|
||||
@@ -194,22 +190,22 @@ to download pre-trained streaming models.
|
||||
config.HotwordsScore = options.HotwordsScore;
|
||||
config.RuleFsts = options.RuleFsts;
|
||||
|
||||
OnlineRecognizer recognizer = new OnlineRecognizer(config);
|
||||
var recognizer = new OnlineRecognizer(config);
|
||||
|
||||
string[] files = options.Files.ToArray();
|
||||
var files = options.Files.ToArray();
|
||||
|
||||
// We create a separate stream for each file
|
||||
List<OnlineStream> streams = new List<OnlineStream>();
|
||||
var streams = new List<OnlineStream>();
|
||||
streams.EnsureCapacity(files.Length);
|
||||
|
||||
for (int i = 0; i != files.Length; ++i)
|
||||
{
|
||||
OnlineStream s = recognizer.CreateStream();
|
||||
var s = recognizer.CreateStream();
|
||||
|
||||
WaveReader waveReader = new WaveReader(files[i]);
|
||||
var waveReader = new WaveReader(files[i]);
|
||||
s.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);
|
||||
|
||||
float[] tailPadding = new float[(int)(waveReader.SampleRate * 0.3)];
|
||||
var tailPadding = new float[(int)(waveReader.SampleRate * 0.3)];
|
||||
s.AcceptWaveform(waveReader.SampleRate, tailPadding);
|
||||
s.InputFinished();
|
||||
|
||||
@@ -230,7 +226,7 @@ to download pre-trained streaming models.
|
||||
// display results
|
||||
for (int i = 0; i != files.Length; ++i)
|
||||
{
|
||||
OnlineRecognizerResult r = recognizer.GetResult(streams[i]);
|
||||
var r = recognizer.GetResult(streams[i]);
|
||||
var text = r.Text;
|
||||
var tokens = r.Tokens;
|
||||
Console.WriteLine("--------------------");
|
||||
@@ -238,7 +234,7 @@ to download pre-trained streaming models.
|
||||
Console.WriteLine("text: {0}", text);
|
||||
Console.WriteLine("tokens: [{0}]", string.Join(", ", tokens));
|
||||
Console.Write("timestamps: [");
|
||||
r.Timestamps.ToList().ForEach(i => Console.Write(String.Format("{0:0.00}", i) + ", "));
|
||||
r.Timestamps.ToList().ForEach(i => Console.Write(string.Format("{0:0.00}", i) + ", "));
|
||||
Console.WriteLine("]");
|
||||
}
|
||||
Console.WriteLine("--------------------");
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<RootNamespace>online_decode_files</RootNamespace>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
|
||||
@@ -29,9 +29,7 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "keyword-spotting-from-files
|
||||
EndProject
|
||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "keyword-spotting-from-microphone", "keyword-spotting-from-microphone\keyword-spotting-from-microphone.csproj", "{AEE0ED2B-C86F-4952-863C-EAD3219CB4EC}"
|
||||
EndProject
|
||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TTS", "TTS\TTS.csproj", "{DACE4A18-4FC8-4437-92BF-5A90BA81286C}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-speaker-diarization", "offline-speaker-diarization\offline-speaker-diarization.csproj", "{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}"
|
||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "offline-speaker-diarization", "offline-speaker-diarization\offline-speaker-diarization.csproj", "{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
@@ -91,10 +89,6 @@ Global
|
||||
{AEE0ED2B-C86F-4952-863C-EAD3219CB4EC}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{AEE0ED2B-C86F-4952-863C-EAD3219CB4EC}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{AEE0ED2B-C86F-4952-863C-EAD3219CB4EC}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
|
||||
@@ -16,20 +16,18 @@
|
||||
// dotnet run
|
||||
|
||||
using SherpaOnnx;
|
||||
using System.Collections.Generic;
|
||||
using System;
|
||||
|
||||
class SpeakerIdentificationDemo
|
||||
{
|
||||
public static float[] ComputeEmbedding(SpeakerEmbeddingExtractor extractor, String filename)
|
||||
public static float[] ComputeEmbedding(SpeakerEmbeddingExtractor extractor, string filename)
|
||||
{
|
||||
WaveReader reader = new WaveReader(filename);
|
||||
var reader = new WaveReader(filename);
|
||||
|
||||
OnlineStream stream = extractor.CreateStream();
|
||||
var stream = extractor.CreateStream();
|
||||
stream.AcceptWaveform(reader.SampleRate, reader.Samples);
|
||||
stream.InputFinished();
|
||||
|
||||
float[] embedding = extractor.Compute(stream);
|
||||
var embedding = extractor.Compute(stream);
|
||||
|
||||
return embedding;
|
||||
}
|
||||
@@ -43,25 +41,25 @@ class SpeakerIdentificationDemo
|
||||
|
||||
var manager = new SpeakerEmbeddingManager(extractor.Dim);
|
||||
|
||||
string[] spk1Files =
|
||||
var spk1Files =
|
||||
new string[] {
|
||||
"./sr-data/enroll/fangjun-sr-1.wav",
|
||||
"./sr-data/enroll/fangjun-sr-2.wav",
|
||||
"./sr-data/enroll/fangjun-sr-3.wav",
|
||||
};
|
||||
float[][] spk1Vec = new float[spk1Files.Length][];
|
||||
var spk1Vec = new float[spk1Files.Length][];
|
||||
|
||||
for (int i = 0; i < spk1Files.Length; ++i)
|
||||
{
|
||||
spk1Vec[i] = ComputeEmbedding(extractor, spk1Files[i]);
|
||||
}
|
||||
|
||||
string[] spk2Files =
|
||||
var spk2Files =
|
||||
new string[] {
|
||||
"./sr-data/enroll/leijun-sr-1.wav", "./sr-data/enroll/leijun-sr-2.wav",
|
||||
};
|
||||
|
||||
float[][] spk2Vec = new float[spk2Files.Length][];
|
||||
var spk2Vec = new float[spk2Files.Length][];
|
||||
|
||||
for (int i = 0; i < spk2Files.Length; ++i)
|
||||
{
|
||||
@@ -100,14 +98,14 @@ class SpeakerIdentificationDemo
|
||||
|
||||
Console.WriteLine("---All speakers---");
|
||||
|
||||
string[] allSpeakers = manager.GetAllSpeakers();
|
||||
var allSpeakers = manager.GetAllSpeakers();
|
||||
foreach (var s in allSpeakers)
|
||||
{
|
||||
Console.WriteLine(s);
|
||||
}
|
||||
Console.WriteLine("------------");
|
||||
|
||||
string[] testFiles =
|
||||
var testFiles =
|
||||
new string[] {
|
||||
"./sr-data/test/fangjun-test-sr-1.wav",
|
||||
"./sr-data/test/leijun-test-sr-1.wav",
|
||||
@@ -117,9 +115,9 @@ class SpeakerIdentificationDemo
|
||||
float threshold = 0.6f;
|
||||
foreach (var file in testFiles)
|
||||
{
|
||||
float[] embedding = ComputeEmbedding(extractor, file);
|
||||
var embedding = ComputeEmbedding(extractor, file);
|
||||
|
||||
String name = manager.Search(embedding, threshold);
|
||||
var name = manager.Search(embedding, threshold);
|
||||
if (name == "")
|
||||
{
|
||||
name = "<Unknown>";
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<RootNamespace>speaker_identification</RootNamespace>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
|
||||
@@ -6,47 +6,43 @@
|
||||
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html
|
||||
// to download streaming models
|
||||
|
||||
using CommandLine.Text;
|
||||
using CommandLine;
|
||||
using CommandLine.Text;
|
||||
using PortAudioSharp;
|
||||
using System.Threading;
|
||||
using SherpaOnnx;
|
||||
using System.Collections.Generic;
|
||||
using System.Runtime.InteropServices;
|
||||
using System;
|
||||
|
||||
|
||||
class SpeechRecognitionFromMicrophone
|
||||
{
|
||||
class Options
|
||||
{
|
||||
[Option(Required = true, HelpText = "Path to tokens.txt")]
|
||||
public string Tokens { get; set; }
|
||||
public string? Tokens { get; set; }
|
||||
|
||||
[Option(Required = false, Default = "cpu", HelpText = "Provider, e.g., cpu, coreml")]
|
||||
public string Provider { get; set; }
|
||||
public string? Provider { get; set; }
|
||||
|
||||
[Option(Required = false, HelpText = "Path to transducer encoder.onnx")]
|
||||
public string Encoder { get; set; }
|
||||
public string? Encoder { get; set; }
|
||||
|
||||
[Option(Required = false, HelpText = "Path to transducer decoder.onnx")]
|
||||
public string Decoder { get; set; }
|
||||
public string? Decoder { get; set; }
|
||||
|
||||
[Option(Required = false, HelpText = "Path to transducer joiner.onnx")]
|
||||
public string Joiner { get; set; }
|
||||
public string? Joiner { get; set; }
|
||||
|
||||
[Option("paraformer-encoder", Required = false, HelpText = "Path to paraformer encoder.onnx")]
|
||||
public string ParaformerEncoder { get; set; }
|
||||
public string? ParaformerEncoder { get; set; }
|
||||
|
||||
[Option("paraformer-decoder", Required = false, HelpText = "Path to paraformer decoder.onnx")]
|
||||
public string ParaformerDecoder { get; set; }
|
||||
public string? ParaformerDecoder { get; set; }
|
||||
|
||||
[Option("num-threads", Required = false, Default = 1, HelpText = "Number of threads for computation")]
|
||||
public int NumThreads { get; set; }
|
||||
|
||||
[Option("decoding-method", Required = false, Default = "greedy_search",
|
||||
HelpText = "Valid decoding methods are: greedy_search, modified_beam_search")]
|
||||
public string DecodingMethod { get; set; }
|
||||
public string? DecodingMethod { get; set; }
|
||||
|
||||
[Option(Required = false, Default = false, HelpText = "True to show model info during loading")]
|
||||
public bool Debug { get; set; }
|
||||
@@ -126,7 +122,7 @@ to download pre-trained streaming models.
|
||||
|
||||
private static void Run(Options options)
|
||||
{
|
||||
OnlineRecognizerConfig config = new OnlineRecognizerConfig();
|
||||
var config = new OnlineRecognizerConfig();
|
||||
config.FeatConfig.SampleRate = options.SampleRate;
|
||||
|
||||
// All models from icefall using feature dim 80.
|
||||
@@ -153,9 +149,9 @@ to download pre-trained streaming models.
|
||||
config.Rule2MinTrailingSilence = options.Rule2MinTrailingSilence;
|
||||
config.Rule3MinUtteranceLength = options.Rule3MinUtteranceLength;
|
||||
|
||||
OnlineRecognizer recognizer = new OnlineRecognizer(config);
|
||||
var recognizer = new OnlineRecognizer(config);
|
||||
|
||||
OnlineStream s = recognizer.CreateStream();
|
||||
var s = recognizer.CreateStream();
|
||||
|
||||
Console.WriteLine(PortAudio.VersionInfo.versionText);
|
||||
PortAudio.Initialize();
|
||||
@@ -176,12 +172,12 @@ to download pre-trained streaming models.
|
||||
Environment.Exit(1);
|
||||
}
|
||||
|
||||
DeviceInfo info = PortAudio.GetDeviceInfo(deviceIndex);
|
||||
var info = PortAudio.GetDeviceInfo(deviceIndex);
|
||||
|
||||
Console.WriteLine();
|
||||
Console.WriteLine($"Use default device {deviceIndex} ({info.name})");
|
||||
|
||||
StreamParameters param = new StreamParameters();
|
||||
var param = new StreamParameters();
|
||||
param.device = deviceIndex;
|
||||
param.channelCount = 1;
|
||||
param.sampleFormat = SampleFormat.Float32;
|
||||
@@ -189,14 +185,14 @@ to download pre-trained streaming models.
|
||||
param.hostApiSpecificStreamInfo = IntPtr.Zero;
|
||||
|
||||
PortAudioSharp.Stream.Callback callback = (IntPtr input, IntPtr output,
|
||||
UInt32 frameCount,
|
||||
uint frameCount,
|
||||
ref StreamCallbackTimeInfo timeInfo,
|
||||
StreamCallbackFlags statusFlags,
|
||||
IntPtr userData
|
||||
) =>
|
||||
{
|
||||
float[] samples = new float[frameCount];
|
||||
Marshal.Copy(input, samples, 0, (Int32)frameCount);
|
||||
var samples = new float[frameCount];
|
||||
Marshal.Copy(input, samples, 0, (int)frameCount);
|
||||
|
||||
s.AcceptWaveform(options.SampleRate, samples);
|
||||
|
||||
@@ -215,7 +211,7 @@ to download pre-trained streaming models.
|
||||
|
||||
stream.Start();
|
||||
|
||||
String lastText = "";
|
||||
var lastText = string.Empty;
|
||||
int segmentIndex = 0;
|
||||
|
||||
while (true)
|
||||
@@ -245,9 +241,5 @@ to download pre-trained streaming models.
|
||||
|
||||
Thread.Sleep(200); // ms
|
||||
}
|
||||
|
||||
PortAudio.Terminate();
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<RootNamespace>speech_recognition_from_microphone</RootNamespace>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
|
||||
@@ -15,12 +15,9 @@
|
||||
// dotnet run
|
||||
|
||||
using SherpaOnnx;
|
||||
using System.Collections.Generic;
|
||||
using System;
|
||||
|
||||
class SpokenLanguageIdentificationDemo
|
||||
{
|
||||
|
||||
static void Main(string[] args)
|
||||
{
|
||||
var config = new SpokenLanguageIdentificationConfig();
|
||||
@@ -30,7 +27,7 @@ class SpokenLanguageIdentificationDemo
|
||||
var slid = new SpokenLanguageIdentification(config);
|
||||
var filename = "./sherpa-onnx-whisper-tiny/test_wavs/0.wav";
|
||||
|
||||
WaveReader waveReader = new WaveReader(filename);
|
||||
var waveReader = new WaveReader(filename);
|
||||
|
||||
var s = slid.CreateStream();
|
||||
s.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<RootNamespace>spoken_language_identification</RootNamespace>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
|
||||
@@ -13,12 +13,9 @@
|
||||
// dotnet run
|
||||
|
||||
using SherpaOnnx;
|
||||
using System.Collections.Generic;
|
||||
using System;
|
||||
|
||||
class StreamingHlgDecodingDemo
|
||||
{
|
||||
|
||||
static void Main(string[] args)
|
||||
{
|
||||
var config = new OnlineRecognizerConfig();
|
||||
@@ -32,15 +29,15 @@ class StreamingHlgDecodingDemo
|
||||
config.ModelConfig.Debug = 0;
|
||||
config.CtcFstDecoderConfig.Graph = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst";
|
||||
|
||||
OnlineRecognizer recognizer = new OnlineRecognizer(config);
|
||||
var recognizer = new OnlineRecognizer(config);
|
||||
|
||||
var filename = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav";
|
||||
|
||||
WaveReader waveReader = new WaveReader(filename);
|
||||
OnlineStream s = recognizer.CreateStream();
|
||||
var waveReader = new WaveReader(filename);
|
||||
var s = recognizer.CreateStream();
|
||||
s.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);
|
||||
|
||||
float[] tailPadding = new float[(int)(waveReader.SampleRate * 0.3)];
|
||||
var tailPadding = new float[(int)(waveReader.SampleRate * 0.3)];
|
||||
s.AcceptWaveform(waveReader.SampleRate, tailPadding);
|
||||
s.InputFinished();
|
||||
|
||||
@@ -49,7 +46,7 @@ class StreamingHlgDecodingDemo
|
||||
recognizer.Decode(s);
|
||||
}
|
||||
|
||||
OnlineRecognizerResult r = recognizer.GetResult(s);
|
||||
var r = recognizer.GetResult(s);
|
||||
var text = r.Text;
|
||||
var tokens = r.Tokens;
|
||||
Console.WriteLine("--------------------");
|
||||
@@ -57,10 +54,8 @@ class StreamingHlgDecodingDemo
|
||||
Console.WriteLine("text: {0}", text);
|
||||
Console.WriteLine("tokens: [{0}]", string.Join(", ", tokens));
|
||||
Console.Write("timestamps: [");
|
||||
r.Timestamps.ToList().ForEach(i => Console.Write(String.Format("{0:0.00}", i) + ", "));
|
||||
r.Timestamps.ToList().ForEach(i => Console.Write(string.Format("{0:0.00}", i) + ", "));
|
||||
Console.WriteLine("]");
|
||||
Console.WriteLine("--------------------");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<RootNamespace>streaming_hlg_decoding</RootNamespace>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
|
||||
@@ -3,8 +3,6 @@
|
||||
// This file shows how to use a silero_vad model with a non-streaming Paraformer
|
||||
// for speech recognition.
|
||||
using SherpaOnnx;
|
||||
using System.Collections.Generic;
|
||||
using System;
|
||||
|
||||
class VadNonStreamingAsrParaformer
|
||||
{
|
||||
@@ -12,45 +10,49 @@ class VadNonStreamingAsrParaformer
|
||||
{
|
||||
// please download model files from
|
||||
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
|
||||
OfflineRecognizerConfig config = new OfflineRecognizerConfig();
|
||||
var config = new OfflineRecognizerConfig();
|
||||
config.ModelConfig.Paraformer.Model = "./sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx";
|
||||
config.ModelConfig.Tokens = "./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt";
|
||||
config.ModelConfig.Debug = 0;
|
||||
OfflineRecognizer recognizer = new OfflineRecognizer(config);
|
||||
var recognizer = new OfflineRecognizer(config);
|
||||
|
||||
VadModelConfig vadModelConfig = new VadModelConfig();
|
||||
var vadModelConfig = new VadModelConfig();
|
||||
vadModelConfig.SileroVad.Model = "./silero_vad.onnx";
|
||||
vadModelConfig.Debug = 0;
|
||||
|
||||
VoiceActivityDetector vad = new VoiceActivityDetector(vadModelConfig, 60);
|
||||
var vad = new VoiceActivityDetector(vadModelConfig, 60);
|
||||
|
||||
string testWaveFilename = "./lei-jun-test.wav";
|
||||
WaveReader reader = new WaveReader(testWaveFilename);
|
||||
var testWaveFilename = "./lei-jun-test.wav";
|
||||
var reader = new WaveReader(testWaveFilename);
|
||||
|
||||
int numSamples = reader.Samples.Length;
|
||||
int windowSize = vadModelConfig.SileroVad.WindowSize;
|
||||
int sampleRate = vadModelConfig.SampleRate;
|
||||
int numIter = numSamples / windowSize;
|
||||
|
||||
for (int i = 0; i != numIter; ++i) {
|
||||
for (int i = 0; i != numIter; ++i)
|
||||
{
|
||||
int start = i * windowSize;
|
||||
float[] samples = new float[windowSize];
|
||||
var samples = new float[windowSize];
|
||||
Array.Copy(reader.Samples, start, samples, 0, windowSize);
|
||||
vad.AcceptWaveform(samples);
|
||||
if (vad.IsSpeechDetected()) {
|
||||
while (!vad.IsEmpty()) {
|
||||
if (vad.IsSpeechDetected())
|
||||
{
|
||||
while (!vad.IsEmpty())
|
||||
{
|
||||
SpeechSegment segment = vad.Front();
|
||||
float startTime = segment.Start / (float)sampleRate;
|
||||
float duration = segment.Samples.Length / (float)sampleRate;
|
||||
var startTime = segment.Start / (float)sampleRate;
|
||||
var duration = segment.Samples.Length / (float)sampleRate;
|
||||
|
||||
OfflineStream stream = recognizer.CreateStream();
|
||||
stream.AcceptWaveform(sampleRate, segment.Samples);
|
||||
recognizer.Decode(stream);
|
||||
String text = stream.Result.Text;
|
||||
var text = stream.Result.Text;
|
||||
|
||||
if (!String.IsNullOrEmpty(text)) {
|
||||
Console.WriteLine("{0}--{1}: {2}", String.Format("{0:0.00}", startTime),
|
||||
String.Format("{0:0.00}", startTime+duration), text);
|
||||
if (!string.IsNullOrEmpty(text))
|
||||
{
|
||||
Console.WriteLine("{0}--{1}: {2}", string.Format("{0:0.00}", startTime),
|
||||
string.Format("{0:0.00}", startTime + duration), text);
|
||||
}
|
||||
|
||||
vad.Pop();
|
||||
@@ -60,19 +62,21 @@ class VadNonStreamingAsrParaformer
|
||||
|
||||
vad.Flush();
|
||||
|
||||
while (!vad.IsEmpty()) {
|
||||
SpeechSegment segment = vad.Front();
|
||||
while (!vad.IsEmpty())
|
||||
{
|
||||
var segment = vad.Front();
|
||||
float startTime = segment.Start / (float)sampleRate;
|
||||
float duration = segment.Samples.Length / (float)sampleRate;
|
||||
|
||||
OfflineStream stream = recognizer.CreateStream();
|
||||
var stream = recognizer.CreateStream();
|
||||
stream.AcceptWaveform(sampleRate, segment.Samples);
|
||||
recognizer.Decode(stream);
|
||||
String text = stream.Result.Text;
|
||||
var text = stream.Result.Text;
|
||||
|
||||
if (!String.IsNullOrEmpty(text)) {
|
||||
Console.WriteLine("{0}--{1}: {2}", String.Format("{0:0.00}", startTime),
|
||||
String.Format("{0:0.00}", startTime+duration), text);
|
||||
if (!string.IsNullOrEmpty(text))
|
||||
{
|
||||
Console.WriteLine("{0}--{1}: {2}", string.Format("{0:0.00}", startTime),
|
||||
string.Format("{0:0.00}", startTime + duration), text);
|
||||
}
|
||||
|
||||
vad.Pop();
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<RootNamespace>vad_non_streaming_asr_paraformer</RootNamespace>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
|
||||
Reference in New Issue
Block a user