Add Golang API for VAD (#708)

This commit is contained in:
Fangjun Kuang
2024-03-27 12:09:39 +08:00
committed by GitHub
parent ccb2d435ec
commit 69c7880c4d
28 changed files with 674 additions and 4 deletions

View File

@@ -234,6 +234,12 @@ def get_mimic3_models() -> List[TtsModel]:
def get_vits_models() -> List[TtsModel]:
return [
# Chinese
TtsModel(
model_dir="vits-icefall-zh-aishell3",
model_name="model.onnx",
lang="zh",
rule_fsts="vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/rule.fst",
),
TtsModel(
model_dir="vits-zh-aishell3",
model_name="vits-aishell3.onnx",

View File

@@ -1 +1,2 @@
!*.sh
go.sum

View File

@@ -0,0 +1 @@
vad-asr-paraformer

View File

@@ -0,0 +1,10 @@
module vad-asr-paraformer
go 1.12
replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../
require (
github.com/gordonklaus/portaudio v0.0.0-20230709114228-aafa478834f5
github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx v0.0.0-00010101000000-000000000000
)

View File

@@ -0,0 +1 @@
../../../../go-api-examples/vad-asr-paraformer/main.go

View File

@@ -0,0 +1 @@
../../../../go-api-examples/vad-asr-paraformer/run.sh

View File

@@ -0,0 +1,2 @@
vad-asr-whisper

View File

@@ -0,0 +1,5 @@
module vad-asr-whisper
go 1.12
replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../

View File

@@ -0,0 +1 @@
../../../../go-api-examples/vad-asr-whisper/main.go

View File

@@ -0,0 +1 @@
../../../../go-api-examples/vad-asr-whisper/run.sh

2
scripts/go/_internal/vad/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
vad
go.sum

View File

@@ -0,0 +1,5 @@
module vad
go 1.12
replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../

View File

@@ -0,0 +1 @@
../../../../go-api-examples/vad/main.go

View File

@@ -0,0 +1 @@
../../../../go-api-examples/vad/run.sh

View File

@@ -614,6 +614,9 @@ func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedA
ans.SampleRate = int(audio.sample_rate)
n := int(audio.n)
ans.Samples = make([]float32, n)
// see https://stackoverflow.com/questions/48756732/what-does-1-30c-yourtype-do-exactly-in-cgo
// :n:n means 0:n:n, means low:high:capacity
samples := (*[1 << 28]C.float)(unsafe.Pointer(audio.samples))[:n:n]
// copy(ans.Samples, samples)
for i := 0; i < n; i++ {
@@ -623,11 +626,160 @@ func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedA
return ans
}
func (audio *GeneratedAudio) Save(filename string) int {
func (audio *GeneratedAudio) Save(filename string) bool {
s := C.CString(filename)
defer C.free(unsafe.Pointer(s))
ok := int(C.SherpaOnnxWriteWave((*C.float)(&audio.Samples[0]), C.int(len(audio.Samples)), C.int(audio.SampleRate), s))
return ok
return ok == 1
}
// ============================================================
// For VAD
// ============================================================
// SileroVadModelConfig holds the silero-vad specific settings.
// NewVoiceActivityDetector copies each field into the matching member of the
// C config's silero_vad struct.
type SileroVadModelConfig struct {
// Model is passed to C as a string (silero_vad.model).
// NOTE(review): presumably the path to the silero VAD model file - confirm.
Model string
// Threshold is copied to silero_vad.threshold.
Threshold float32
// MinSilenceDuration is copied to silero_vad.min_silence_duration.
// NOTE(review): unit is presumably seconds - confirm.
MinSilenceDuration float32
// MinSpeechDuration is copied to silero_vad.min_speech_duration.
// NOTE(review): unit is presumably seconds - confirm.
MinSpeechDuration float32
// WindowSize is copied to silero_vad.window_size.
// NOTE(review): presumably a number of samples - confirm.
WindowSize int
}
// VadModelConfig is the configuration consumed by NewVoiceActivityDetector,
// which converts it field by field into a C.struct_SherpaOnnxVadModelConfig.
type VadModelConfig struct {
// SileroVad carries the silero-vad specific settings.
SileroVad SileroVadModelConfig
// SampleRate is copied to the C field sample_rate.
SampleRate int
// NumThreads is copied to the C field num_threads.
NumThreads int
// Provider is passed to C as a string (provider).
Provider string
// Debug is copied to the C field debug as an integer.
// NOTE(review): presumably non-zero enables debug output - confirm.
Debug int
}
// CircularBuffer is a Go handle for a circular buffer implemented in C.
// Create it with NewCircularBuffer and release it with DeleteCircularBuffer.
type CircularBuffer struct {
// impl is the C object returned by SherpaOnnxCreateCircularBuffer; it is set
// to nil by DeleteCircularBuffer.
impl *C.struct_SherpaOnnxCircularBuffer
}
// DeleteCircularBuffer destroys the C object held by buffer via
// SherpaOnnxDestroyCircularBuffer and then clears the handle so that the
// stale pointer is not kept around.
func DeleteCircularBuffer(buffer *CircularBuffer) {
	impl := buffer.impl
	C.SherpaOnnxDestroyCircularBuffer(impl)
	buffer.impl = nil
}
// NewCircularBuffer creates a C circular buffer with the given capacity using
// SherpaOnnxCreateCircularBuffer and wraps it in a Go handle.
//
// The returned buffer should be released with DeleteCircularBuffer.
func NewCircularBuffer(capacity int) *CircularBuffer {
	return &CircularBuffer{
		impl: C.SherpaOnnxCreateCircularBuffer(C.int(capacity)),
	}
}
// Push hands samples to the C circular buffer via
// SherpaOnnxCircularBufferPush, passing a pointer to the first element and the
// number of elements.
//
// An empty or nil slice is a no-op: it has no first element whose address
// could be passed to C.
func (buffer *CircularBuffer) Push(samples []float32) {
	if len(samples) == 0 {
		// &samples[0] would panic with an index-out-of-range error.
		return
	}
	C.SherpaOnnxCircularBufferPush(buffer.impl, (*C.float)(&samples[0]), C.int(len(samples)))
}
// Get requests n samples beginning at position start from the C circular
// buffer and returns them in a newly allocated Go slice.
//
// The temporary array returned by SherpaOnnxCircularBufferGet is released with
// SherpaOnnxCircularBufferFree before this method returns. A nil slice is
// returned when n is not positive or when the C call yields a NULL pointer.
// n must not exceed 1<<28, the length of the array type used to view the C
// memory.
func (buffer *CircularBuffer) Get(start int, n int) []float32 {
	if n <= 0 {
		// make panics on a negative length, and there is nothing to copy for
		// zero.
		return nil
	}

	samples := C.SherpaOnnxCircularBufferGet(buffer.impl, C.int(start), C.int(n))
	if samples == nil {
		// Slicing through a nil array pointer would panic.
		return nil
	}
	defer C.SherpaOnnxCircularBufferFree(samples)

	result := make([]float32, n)
	// View the C array as float32 (cgo maps C.float to float32) so the samples
	// can be moved with a single copy instead of one conversion per element.
	// [:n:n] is low:high:max slicing and caps the view at exactly n elements.
	copy(result, (*[1 << 28]float32)(unsafe.Pointer(samples))[:n:n])
	return result
}
// Pop forwards n to SherpaOnnxCircularBufferPop on the underlying C buffer.
// NOTE(review): presumably discards the n oldest elements - confirm against
// the C API.
func (buffer *CircularBuffer) Pop(n int) {
	count := C.int(n)
	C.SherpaOnnxCircularBufferPop(buffer.impl, count)
}
// Size returns the value reported by SherpaOnnxCircularBufferSize for the
// underlying C buffer, converted to a Go int.
func (buffer *CircularBuffer) Size() int {
	size := C.SherpaOnnxCircularBufferSize(buffer.impl)
	return int(size)
}
// Head returns the value reported by SherpaOnnxCircularBufferHead for the
// underlying C buffer, converted to a Go int.
func (buffer *CircularBuffer) Head() int {
	head := C.SherpaOnnxCircularBufferHead(buffer.impl)
	return int(head)
}
// Reset calls SherpaOnnxCircularBufferReset on the underlying C buffer.
func (buffer *CircularBuffer) Reset() {
	impl := buffer.impl
	C.SherpaOnnxCircularBufferReset(impl)
}
// SpeechSegment is the Go-owned copy of a segment returned by
// VoiceActivityDetector.Front.
type SpeechSegment struct {
// Start is taken from the C segment's start field.
// NOTE(review): presumably the sample index at which the segment begins -
// confirm.
Start int
// Samples holds a copy of the C segment's samples.
Samples []float32
}
// VoiceActivityDetector is a Go handle for a voice activity detector
// implemented in C. Create it with NewVoiceActivityDetector and release it
// with DeleteVoiceActivityDetector.
type VoiceActivityDetector struct {
// impl is the C object returned by SherpaOnnxCreateVoiceActivityDetector; it
// is set to nil by DeleteVoiceActivityDetector.
impl *C.struct_SherpaOnnxVoiceActivityDetector
}
// NewVoiceActivityDetector converts config into its C representation and
// creates a detector with SherpaOnnxCreateVoiceActivityDetector, passing
// bufferSizeInSeconds along unchanged.
//
// The returned detector should be released with DeleteVoiceActivityDetector.
func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float32) *VoiceActivityDetector {
	// The C strings are allocated here and freed when this function returns,
	// i.e. after the detector has been created.
	model := C.CString(config.SileroVad.Model)
	defer C.free(unsafe.Pointer(model))

	provider := C.CString(config.Provider)
	defer C.free(unsafe.Pointer(provider))

	var cfg C.struct_SherpaOnnxVadModelConfig

	silero := &config.SileroVad
	cfg.silero_vad.model = model
	cfg.silero_vad.threshold = C.float(silero.Threshold)
	cfg.silero_vad.min_silence_duration = C.float(silero.MinSilenceDuration)
	cfg.silero_vad.min_speech_duration = C.float(silero.MinSpeechDuration)
	cfg.silero_vad.window_size = C.int(silero.WindowSize)

	cfg.sample_rate = C.int(config.SampleRate)
	cfg.num_threads = C.int(config.NumThreads)
	cfg.provider = provider
	cfg.debug = C.int(config.Debug)

	return &VoiceActivityDetector{
		impl: C.SherpaOnnxCreateVoiceActivityDetector(&cfg, C.float(bufferSizeInSeconds)),
	}
}
// DeleteVoiceActivityDetector destroys the C object held by vad via
// SherpaOnnxDestroyVoiceActivityDetector and then clears the handle so that
// the stale pointer is not kept around.
func DeleteVoiceActivityDetector(vad *VoiceActivityDetector) {
	impl := vad.impl
	C.SherpaOnnxDestroyVoiceActivityDetector(impl)
	vad.impl = nil
}
// AcceptWaveform feeds samples to the C detector via
// SherpaOnnxVoiceActivityDetectorAcceptWaveform, passing a pointer to the
// first element and the number of elements.
//
// An empty or nil slice is a no-op: it has no first element whose address
// could be passed to C.
func (vad *VoiceActivityDetector) AcceptWaveform(samples []float32) {
	if len(samples) == 0 {
		// &samples[0] would panic with an index-out-of-range error.
		return
	}
	C.SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad.impl, (*C.float)(&samples[0]), C.int(len(samples)))
}
// IsEmpty reports whether SherpaOnnxVoiceActivityDetectorEmpty returns 1 for
// the underlying C detector.
// NOTE(review): presumably true means no speech segment is queued for Front -
// confirm against the C API.
func (vad *VoiceActivityDetector) IsEmpty() bool {
	empty := C.SherpaOnnxVoiceActivityDetectorEmpty(vad.impl)
	return int(empty) == 1
}
// IsSpeech reports whether SherpaOnnxVoiceActivityDetectorDetected returns 1
// for the underlying C detector.
func (vad *VoiceActivityDetector) IsSpeech() bool {
	detected := C.SherpaOnnxVoiceActivityDetectorDetected(vad.impl)
	return int(detected) == 1
}
// Pop calls SherpaOnnxVoiceActivityDetectorPop on the underlying C detector.
// NOTE(review): presumably removes the segment that Front would return -
// confirm against the C API.
func (vad *VoiceActivityDetector) Pop() {
	impl := vad.impl
	C.SherpaOnnxVoiceActivityDetectorPop(impl)
}
// Clear calls SherpaOnnxVoiceActivityDetectorClear on the underlying C
// detector.
func (vad *VoiceActivityDetector) Clear() {
	impl := vad.impl
	C.SherpaOnnxVoiceActivityDetectorClear(impl)
}
// Front fetches a segment from the C detector with
// SherpaOnnxVoiceActivityDetectorFront and returns a Go-owned copy of it.
//
// The C segment is released with SherpaOnnxDestroySpeechSegment before this
// method returns, so the result stays valid afterwards. nil is returned when
// the C call yields a NULL segment. The segment must not hold more than 1<<28
// samples, the length of the array type used to view the C memory.
// NOTE(review): calling Front while IsEmpty reports true is presumably
// invalid - confirm against the C API.
func (vad *VoiceActivityDetector) Front() *SpeechSegment {
	f := C.SherpaOnnxVoiceActivityDetectorFront(vad.impl)
	if f == nil {
		// Reading f.start through a NULL pointer would crash.
		return nil
	}
	defer C.SherpaOnnxDestroySpeechSegment(f)

	n := int(f.n)
	ans := &SpeechSegment{
		Start:   int(f.start),
		Samples: make([]float32, n),
	}

	if n > 0 && f.samples != nil {
		// View the C array as float32 (cgo maps C.float to float32) so the
		// samples can be moved with a single copy instead of one conversion
		// per element. [:n:n] caps the view at exactly n elements.
		copy(ans.Samples, (*[1 << 28]float32)(unsafe.Pointer(f.samples))[:n:n])
	}
	return ans
}
// Reset calls SherpaOnnxVoiceActivityDetectorReset on the underlying C
// detector.
func (vad *VoiceActivityDetector) Reset() {
	impl := vad.impl
	C.SherpaOnnxVoiceActivityDetectorReset(impl)
}