Add Golang API for VAD (#708)
This commit is contained in:
@@ -234,6 +234,12 @@ def get_mimic3_models() -> List[TtsModel]:
|
||||
def get_vits_models() -> List[TtsModel]:
|
||||
return [
|
||||
# Chinese
|
||||
TtsModel(
|
||||
model_dir="vits-icefall-zh-aishell3",
|
||||
model_name="model.onnx",
|
||||
lang="zh",
|
||||
rule_fsts="vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/rule.fst",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-zh-aishell3",
|
||||
model_name="vits-aishell3.onnx",
|
||||
|
||||
1
scripts/go/_internal/.gitignore
vendored
1
scripts/go/_internal/.gitignore
vendored
@@ -1 +1,2 @@
|
||||
!*.sh
|
||||
go.sum
|
||||
|
||||
1
scripts/go/_internal/vad-asr-paraformer/.gitignore
vendored
Normal file
1
scripts/go/_internal/vad-asr-paraformer/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
vad-asr-paraformer
|
||||
10
scripts/go/_internal/vad-asr-paraformer/go.mod
Normal file
10
scripts/go/_internal/vad-asr-paraformer/go.mod
Normal file
@@ -0,0 +1,10 @@
|
||||
module vad-asr-paraformer
|
||||
|
||||
go 1.12
|
||||
|
||||
replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../
|
||||
|
||||
require (
|
||||
github.com/gordonklaus/portaudio v0.0.0-20230709114228-aafa478834f5
|
||||
github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx v0.0.0-00010101000000-000000000000
|
||||
)
|
||||
1
scripts/go/_internal/vad-asr-paraformer/main.go
Symbolic link
1
scripts/go/_internal/vad-asr-paraformer/main.go
Symbolic link
@@ -0,0 +1 @@
|
||||
../../../../go-api-examples/vad-asr-paraformer/main.go
|
||||
1
scripts/go/_internal/vad-asr-paraformer/run.sh
Symbolic link
1
scripts/go/_internal/vad-asr-paraformer/run.sh
Symbolic link
@@ -0,0 +1 @@
|
||||
../../../../go-api-examples/vad-asr-paraformer/run.sh
|
||||
2
scripts/go/_internal/vad-asr-whisper/.gitignore
vendored
Normal file
2
scripts/go/_internal/vad-asr-whisper/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
vad-asr-whisper
|
||||
|
||||
5
scripts/go/_internal/vad-asr-whisper/go.mod
Normal file
5
scripts/go/_internal/vad-asr-whisper/go.mod
Normal file
@@ -0,0 +1,5 @@
|
||||
module vad-asr-whisper
|
||||
|
||||
go 1.12
|
||||
|
||||
replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../
|
||||
1
scripts/go/_internal/vad-asr-whisper/main.go
Symbolic link
1
scripts/go/_internal/vad-asr-whisper/main.go
Symbolic link
@@ -0,0 +1 @@
|
||||
../../../../go-api-examples/vad-asr-whisper/main.go
|
||||
1
scripts/go/_internal/vad-asr-whisper/run.sh
Symbolic link
1
scripts/go/_internal/vad-asr-whisper/run.sh
Symbolic link
@@ -0,0 +1 @@
|
||||
../../../../go-api-examples/vad-asr-whisper/run.sh
|
||||
2
scripts/go/_internal/vad/.gitignore
vendored
Normal file
2
scripts/go/_internal/vad/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
vad
|
||||
go.sum
|
||||
5
scripts/go/_internal/vad/go.mod
Normal file
5
scripts/go/_internal/vad/go.mod
Normal file
@@ -0,0 +1,5 @@
|
||||
module vad
|
||||
|
||||
go 1.12
|
||||
|
||||
replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../
|
||||
1
scripts/go/_internal/vad/main.go
Symbolic link
1
scripts/go/_internal/vad/main.go
Symbolic link
@@ -0,0 +1 @@
|
||||
../../../../go-api-examples/vad/main.go
|
||||
1
scripts/go/_internal/vad/run.sh
Symbolic link
1
scripts/go/_internal/vad/run.sh
Symbolic link
@@ -0,0 +1 @@
|
||||
../../../../go-api-examples/vad/run.sh
|
||||
@@ -614,6 +614,9 @@ func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedA
|
||||
ans.SampleRate = int(audio.sample_rate)
|
||||
n := int(audio.n)
|
||||
ans.Samples = make([]float32, n)
|
||||
|
||||
// see https://stackoverflow.com/questions/48756732/what-does-1-30c-yourtype-do-exactly-in-cgo
|
||||
// [:n:n] is shorthand for [0:n:n]; the three indices are low:high:capacity
|
||||
samples := (*[1 << 28]C.float)(unsafe.Pointer(audio.samples))[:n:n]
|
||||
// copy(ans.Samples, samples)
|
||||
for i := 0; i < n; i++ {
|
||||
@@ -623,11 +626,160 @@ func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedA
|
||||
return ans
|
||||
}
|
||||
|
||||
func (audio *GeneratedAudio) Save(filename string) int {
|
||||
func (audio *GeneratedAudio) Save(filename string) bool {
|
||||
s := C.CString(filename)
|
||||
defer C.free(unsafe.Pointer(s))
|
||||
|
||||
ok := int(C.SherpaOnnxWriteWave((*C.float)(&audio.Samples[0]), C.int(len(audio.Samples)), C.int(audio.SampleRate), s))
|
||||
|
||||
return ok
|
||||
return ok == 1
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// For VAD
|
||||
// ============================================================
|
||||
// SileroVadModelConfig holds the silero-vad specific settings. Each field is
// copied unchanged into the silero_vad part of the C VAD model config by
// NewVoiceActivityDetector.
type SileroVadModelConfig struct {
	// Model is passed to C as a string; presumably the path of the model
	// file — confirm against the C API.
	Model string

	// Threshold, MinSilenceDuration and MinSpeechDuration are forwarded as C
	// floats. Their units are not visible here (durations are presumably in
	// seconds — TODO confirm).
	Threshold, MinSilenceDuration, MinSpeechDuration float32

	// WindowSize is forwarded as a C int (presumably a sample count — TODO
	// confirm).
	WindowSize int
}
|
||||
|
||||
type VadModelConfig struct {
|
||||
SileroVad SileroVadModelConfig
|
||||
SampleRate int
|
||||
NumThreads int
|
||||
Provider string
|
||||
Debug int
|
||||
}
|
||||
|
||||
type CircularBuffer struct {
|
||||
impl *C.struct_SherpaOnnxCircularBuffer
|
||||
}
|
||||
|
||||
func DeleteCircularBuffer(buffer *CircularBuffer) {
|
||||
C.SherpaOnnxDestroyCircularBuffer(buffer.impl)
|
||||
buffer.impl = nil
|
||||
}
|
||||
|
||||
func NewCircularBuffer(capacity int) *CircularBuffer {
|
||||
circularBuffer := &CircularBuffer{}
|
||||
circularBuffer.impl = C.SherpaOnnxCreateCircularBuffer(C.int(capacity))
|
||||
return circularBuffer
|
||||
}
|
||||
|
||||
func (buffer *CircularBuffer) Push(samples []float32) {
|
||||
C.SherpaOnnxCircularBufferPush(buffer.impl, (*C.float)(&samples[0]), C.int(len(samples)))
|
||||
}
|
||||
|
||||
func (buffer *CircularBuffer) Get(start int, n int) []float32 {
|
||||
samples := C.SherpaOnnxCircularBufferGet(buffer.impl, C.int(start), C.int(n))
|
||||
defer C.SherpaOnnxCircularBufferFree(samples)
|
||||
|
||||
result := make([]float32, n)
|
||||
|
||||
p := (*[1 << 28]C.float)(unsafe.Pointer(samples))[:n:n]
|
||||
for i := 0; i < n; i++ {
|
||||
result[i] = float32(p[i])
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func (buffer *CircularBuffer) Pop(n int) {
|
||||
C.SherpaOnnxCircularBufferPop(buffer.impl, C.int(n))
|
||||
}
|
||||
|
||||
func (buffer *CircularBuffer) Size() int {
|
||||
return int(C.SherpaOnnxCircularBufferSize(buffer.impl))
|
||||
}
|
||||
|
||||
func (buffer *CircularBuffer) Head() int {
|
||||
return int(C.SherpaOnnxCircularBufferHead(buffer.impl))
|
||||
}
|
||||
|
||||
func (buffer *CircularBuffer) Reset() {
|
||||
C.SherpaOnnxCircularBufferReset(buffer.impl)
|
||||
}
|
||||
|
||||
// SpeechSegment is the Go copy of a segment returned by
// VoiceActivityDetector.Front.
type SpeechSegment struct {
	// Start is the segment's start value as reported by the C API
	// (presumably a sample index — confirm against the C API).
	Start int

	// Samples holds the segment's audio, copied out of C memory.
	Samples []float32
}
|
||||
|
||||
type VoiceActivityDetector struct {
|
||||
impl *C.struct_SherpaOnnxVoiceActivityDetector
|
||||
}
|
||||
|
||||
func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float32) *VoiceActivityDetector {
|
||||
c := C.struct_SherpaOnnxVadModelConfig{}
|
||||
|
||||
c.silero_vad.model = C.CString(config.SileroVad.Model)
|
||||
defer C.free(unsafe.Pointer(c.silero_vad.model))
|
||||
|
||||
c.silero_vad.threshold = C.float(config.SileroVad.Threshold)
|
||||
c.silero_vad.min_silence_duration = C.float(config.SileroVad.MinSilenceDuration)
|
||||
c.silero_vad.min_speech_duration = C.float(config.SileroVad.MinSpeechDuration)
|
||||
c.silero_vad.window_size = C.int(config.SileroVad.WindowSize)
|
||||
|
||||
c.sample_rate = C.int(config.SampleRate)
|
||||
c.num_threads = C.int(config.NumThreads)
|
||||
c.provider = C.CString(config.Provider)
|
||||
defer C.free(unsafe.Pointer(c.provider))
|
||||
|
||||
c.debug = C.int(config.Debug)
|
||||
|
||||
vad := &VoiceActivityDetector{}
|
||||
vad.impl = C.SherpaOnnxCreateVoiceActivityDetector(&c, C.float(bufferSizeInSeconds))
|
||||
|
||||
return vad
|
||||
}
|
||||
|
||||
func DeleteVoiceActivityDetector(vad *VoiceActivityDetector) {
|
||||
C.SherpaOnnxDestroyVoiceActivityDetector(vad.impl)
|
||||
vad.impl = nil
|
||||
}
|
||||
|
||||
func (vad *VoiceActivityDetector) AcceptWaveform(samples []float32) {
|
||||
C.SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad.impl, (*C.float)(&samples[0]), C.int(len(samples)))
|
||||
}
|
||||
|
||||
func (vad *VoiceActivityDetector) IsEmpty() bool {
|
||||
return 1 == int(C.SherpaOnnxVoiceActivityDetectorEmpty(vad.impl))
|
||||
}
|
||||
|
||||
func (vad *VoiceActivityDetector) IsSpeech() bool {
|
||||
return 1 == int(C.SherpaOnnxVoiceActivityDetectorDetected(vad.impl))
|
||||
}
|
||||
|
||||
func (vad *VoiceActivityDetector) Pop() {
|
||||
C.SherpaOnnxVoiceActivityDetectorPop(vad.impl)
|
||||
}
|
||||
|
||||
func (vad *VoiceActivityDetector) Clear() {
|
||||
C.SherpaOnnxVoiceActivityDetectorClear(vad.impl)
|
||||
}
|
||||
|
||||
func (vad *VoiceActivityDetector) Front() *SpeechSegment {
|
||||
f := C.SherpaOnnxVoiceActivityDetectorFront(vad.impl)
|
||||
defer C.SherpaOnnxDestroySpeechSegment(f)
|
||||
|
||||
ans := &SpeechSegment{}
|
||||
ans.Start = int(f.start)
|
||||
|
||||
n := int(f.n)
|
||||
ans.Samples = make([]float32, n)
|
||||
|
||||
samples := (*[1 << 28]C.float)(unsafe.Pointer(f.samples))[:n:n]
|
||||
|
||||
for i := 0; i < n; i++ {
|
||||
ans.Samples[i] = float32(samples[i])
|
||||
}
|
||||
|
||||
return ans
|
||||
}
|
||||
|
||||
func (vad *VoiceActivityDetector) Reset() {
|
||||
C.SherpaOnnxVoiceActivityDetectorReset(vad.impl)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user