This PR adds support for non-streaming Zipformer CTC ASR models across multiple language bindings, WebAssembly, examples, and CI workflows. - Introduces a new OfflineZipformerCtcModelConfig in C/C++, Python, Swift, Java, Kotlin, Go, Dart, Pascal, and C# APIs - Updates initialization, freeing, and recognition logic to include Zipformer CTC in WASM and Node.js - Adds example scripts and CI steps for downloading, building, and running Zipformer CTC models Model doc is available at https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html
2059 lines
63 KiB
Go
2059 lines
63 KiB
Go
/*
Speech recognition with [Next-gen Kaldi].

[sherpa-onnx] is an open-source speech recognition framework for [Next-gen Kaldi].
It depends only on [onnxruntime], supporting both streaming and non-streaming
speech recognition.

It does not need to access the network during recognition and everything
runs locally.

It supports a variety of platforms, such as Linux (x86_64, aarch64, arm),
Windows (x86_64, x86), macOS (x86_64, arm64), etc.

Usage examples:

 1. Real-time speech recognition from a microphone

    Please see
    https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/real-time-speech-recognition-from-microphone

 2. Decode files using a non-streaming model

    Please see
    https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/non-streaming-decode-files

 3. Decode files using a streaming model

    Please see
    https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/streaming-decode-files

 4. Convert text to speech using a non-streaming model

    Please see
    https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/non-streaming-tts

[sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx
[onnxruntime]: https://github.com/microsoft/onnxruntime
[Next-gen Kaldi]: https://github.com/k2-fsa/
*/
|
|
package sherpa_onnx
|
|
|
|
// #include <stdlib.h>
|
|
// #include "c-api.h"
|
|
// extern int32_t _cgoGeneratedAudioCallback(float *samples,int32_t n,void *arg);
|
|
// extern int32_t _cgoGeneratedAudioProgressCallback(float *samples, int32_t n, float p, void *arg);
|
|
import "C"
|
|
import (
|
|
"runtime/cgo"
|
|
"unsafe"
|
|
)
|
|
|
|
// OnlineTransducerModelConfig holds the model file paths of an
// online/streaming transducer.
//
// Pre-trained models can be downloaded from
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html
type OnlineTransducerModelConfig struct {
	// Encoder is the path to the encoder model, e.g., encoder.onnx or
	// encoder.int8.onnx.
	Encoder string

	// Decoder is the path to the decoder model.
	Decoder string

	// Joiner is the path to the joiner model.
	Joiner string
}
|
|
|
|
// OnlineParaformerModelConfig holds the model file paths of an
// online/streaming paraformer.
//
// Pre-trained models can be downloaded from
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/index.html
type OnlineParaformerModelConfig struct {
	// Encoder is the path to the encoder model, e.g., encoder.onnx or
	// encoder.int8.onnx.
	Encoder string

	// Decoder is the path to the decoder model.
	Decoder string
}
|
|
|
|
// OnlineZipformer2CtcModelConfig holds the model file path that is forwarded
// to the C API as model_config.zipformer2_ctc.
//
// Pre-trained models can be downloaded from
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-ctc/index.html
type OnlineZipformer2CtcModelConfig struct {
	// Model is the path to the onnx model.
	Model string
}
|
|
|
|
// Configuration for online/streaming models
|
|
//
|
|
// Please refer to
|
|
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html
|
|
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/index.html
|
|
// to download pre-trained models
|
|
type OnlineModelConfig struct {
|
|
Transducer OnlineTransducerModelConfig
|
|
Paraformer OnlineParaformerModelConfig
|
|
Zipformer2Ctc OnlineZipformer2CtcModelConfig
|
|
Tokens string // Path to tokens.txt
|
|
NumThreads int // Number of threads to use for neural network computation
|
|
Provider string // Optional. Valid values are: cpu, cuda, coreml
|
|
Debug int // 1 to show model meta information while loading it.
|
|
ModelType string // Optional. You can specify it for faster model initialization
|
|
ModelingUnit string // Optional. cjkchar, bpe, cjkchar+bpe
|
|
BpeVocab string // Optional.
|
|
TokensBuf string // Optional.
|
|
TokensBufSize int // Optional.
|
|
}
|
|
|
|
// FeatureConfig configures the feature extractor.
type FeatureConfig struct {
	SampleRate int // Sample rate expected by the model; 16000 for all pre-trained models provided by us
	FeatureDim int // Feature dimension expected by the model; 80 for all pre-trained models provided by us
}
|
|
|
|
// OnlineCtcFstDecoderConfig carries the two values that NewOnlineRecognizer
// forwards to the C API as ctc_fst_decoder_config.
type OnlineCtcFstDecoderConfig struct {
	// Graph is passed through as ctc_fst_decoder_config.graph
	// (presumably the path to a decoding graph; confirm against the C API).
	Graph string

	// MaxActive is passed through as ctc_fst_decoder_config.max_active.
	MaxActive int
}
|
|
|
|
// HomophoneReplacerConfig carries the three string settings that the
// recognizer constructors forward to the C API as the hr sub-config.
type HomophoneReplacerConfig struct {
	// DictDir is passed through as hr.dict_dir.
	DictDir string

	// Lexicon is passed through as hr.lexicon.
	Lexicon string

	// RuleFsts is passed through as hr.rule_fsts.
	RuleFsts string
}
|
|
|
|
// Configuration for the online/streaming recognizer.
|
|
type OnlineRecognizerConfig struct {
|
|
FeatConfig FeatureConfig
|
|
ModelConfig OnlineModelConfig
|
|
|
|
// Valid decoding methods: greedy_search, modified_beam_search
|
|
DecodingMethod string
|
|
|
|
// Used only when DecodingMethod is modified_beam_search. It specifies
|
|
// the maximum number of paths to keep during the search
|
|
MaxActivePaths int
|
|
|
|
EnableEndpoint int // 1 to enable endpoint detection.
|
|
|
|
// Please see
|
|
// https://k2-fsa.github.io/sherpa/ncnn/endpoint.html
|
|
// for the meaning of Rule1MinTrailingSilence, Rule2MinTrailingSilence
|
|
// and Rule3MinUtteranceLength.
|
|
Rule1MinTrailingSilence float32
|
|
Rule2MinTrailingSilence float32
|
|
Rule3MinUtteranceLength float32
|
|
HotwordsFile string
|
|
HotwordsScore float32
|
|
BlankPenalty float32
|
|
CtcFstDecoderConfig OnlineCtcFstDecoderConfig
|
|
RuleFsts string
|
|
RuleFars string
|
|
HotwordsBuf string
|
|
HotwordsBufSize int
|
|
Hr HomophoneReplacerConfig
|
|
}
|
|
|
|
// OnlineRecognizerResult contains the recognition result for an online
// stream.
type OnlineRecognizerResult struct {
	// Text is the recognized text.
	Text string
}
|
|
|
|
// The online recognizer class. It wraps a pointer from C.
|
|
type OnlineRecognizer struct {
|
|
impl *C.struct_SherpaOnnxOnlineRecognizer
|
|
}
|
|
|
|
// The online stream class. It wraps a pointer from C.
|
|
type OnlineStream struct {
|
|
impl *C.struct_SherpaOnnxOnlineStream
|
|
}
|
|
|
|
// Free the internal pointer inside the recognizer to avoid memory leak.
|
|
func DeleteOnlineRecognizer(recognizer *OnlineRecognizer) {
|
|
C.SherpaOnnxDestroyOnlineRecognizer(recognizer.impl)
|
|
recognizer.impl = nil
|
|
}
|
|
|
|
// The user is responsible to invoke [DeleteOnlineRecognizer]() to free
|
|
// the returned recognizer to avoid memory leak
|
|
func NewOnlineRecognizer(config *OnlineRecognizerConfig) *OnlineRecognizer {
|
|
c := C.struct_SherpaOnnxOnlineRecognizerConfig{}
|
|
c.feat_config.sample_rate = C.int(config.FeatConfig.SampleRate)
|
|
c.feat_config.feature_dim = C.int(config.FeatConfig.FeatureDim)
|
|
|
|
c.model_config.transducer.encoder = C.CString(config.ModelConfig.Transducer.Encoder)
|
|
defer C.free(unsafe.Pointer(c.model_config.transducer.encoder))
|
|
|
|
c.model_config.transducer.decoder = C.CString(config.ModelConfig.Transducer.Decoder)
|
|
defer C.free(unsafe.Pointer(c.model_config.transducer.decoder))
|
|
|
|
c.model_config.transducer.joiner = C.CString(config.ModelConfig.Transducer.Joiner)
|
|
defer C.free(unsafe.Pointer(c.model_config.transducer.joiner))
|
|
|
|
c.model_config.paraformer.encoder = C.CString(config.ModelConfig.Paraformer.Encoder)
|
|
defer C.free(unsafe.Pointer(c.model_config.paraformer.encoder))
|
|
|
|
c.model_config.paraformer.decoder = C.CString(config.ModelConfig.Paraformer.Decoder)
|
|
defer C.free(unsafe.Pointer(c.model_config.paraformer.decoder))
|
|
|
|
c.model_config.zipformer2_ctc.model = C.CString(config.ModelConfig.Zipformer2Ctc.Model)
|
|
defer C.free(unsafe.Pointer(c.model_config.zipformer2_ctc.model))
|
|
|
|
c.model_config.tokens = C.CString(config.ModelConfig.Tokens)
|
|
defer C.free(unsafe.Pointer(c.model_config.tokens))
|
|
|
|
c.model_config.tokens_buf = C.CString(config.ModelConfig.TokensBuf)
|
|
defer C.free(unsafe.Pointer(c.model_config.tokens_buf))
|
|
|
|
c.model_config.tokens_buf_size = C.int(config.ModelConfig.TokensBufSize)
|
|
|
|
c.model_config.num_threads = C.int(config.ModelConfig.NumThreads)
|
|
|
|
c.model_config.provider = C.CString(config.ModelConfig.Provider)
|
|
defer C.free(unsafe.Pointer(c.model_config.provider))
|
|
|
|
c.model_config.debug = C.int(config.ModelConfig.Debug)
|
|
|
|
c.model_config.model_type = C.CString(config.ModelConfig.ModelType)
|
|
defer C.free(unsafe.Pointer(c.model_config.model_type))
|
|
|
|
c.model_config.modeling_unit = C.CString(config.ModelConfig.ModelingUnit)
|
|
defer C.free(unsafe.Pointer(c.model_config.modeling_unit))
|
|
|
|
c.model_config.bpe_vocab = C.CString(config.ModelConfig.BpeVocab)
|
|
defer C.free(unsafe.Pointer(c.model_config.bpe_vocab))
|
|
|
|
c.decoding_method = C.CString(config.DecodingMethod)
|
|
defer C.free(unsafe.Pointer(c.decoding_method))
|
|
|
|
c.max_active_paths = C.int(config.MaxActivePaths)
|
|
c.enable_endpoint = C.int(config.EnableEndpoint)
|
|
c.rule1_min_trailing_silence = C.float(config.Rule1MinTrailingSilence)
|
|
c.rule2_min_trailing_silence = C.float(config.Rule2MinTrailingSilence)
|
|
c.rule3_min_utterance_length = C.float(config.Rule3MinUtteranceLength)
|
|
|
|
c.hotwords_file = C.CString(config.HotwordsFile)
|
|
defer C.free(unsafe.Pointer(c.hotwords_file))
|
|
|
|
c.hotwords_buf = C.CString(config.HotwordsBuf)
|
|
defer C.free(unsafe.Pointer(c.hotwords_buf))
|
|
|
|
c.hotwords_buf_size = C.int(config.HotwordsBufSize)
|
|
|
|
c.hotwords_score = C.float(config.HotwordsScore)
|
|
c.blank_penalty = C.float(config.BlankPenalty)
|
|
|
|
c.rule_fsts = C.CString(config.RuleFsts)
|
|
defer C.free(unsafe.Pointer(c.rule_fsts))
|
|
|
|
c.rule_fars = C.CString(config.RuleFars)
|
|
defer C.free(unsafe.Pointer(c.rule_fars))
|
|
|
|
c.ctc_fst_decoder_config.graph = C.CString(config.CtcFstDecoderConfig.Graph)
|
|
defer C.free(unsafe.Pointer(c.ctc_fst_decoder_config.graph))
|
|
c.ctc_fst_decoder_config.max_active = C.int(config.CtcFstDecoderConfig.MaxActive)
|
|
|
|
c.hr.dict_dir = C.CString(config.Hr.DictDir)
|
|
defer C.free(unsafe.Pointer(c.hr.dict_dir))
|
|
|
|
c.hr.lexicon = C.CString(config.Hr.Lexicon)
|
|
defer C.free(unsafe.Pointer(c.hr.lexicon))
|
|
|
|
c.hr.rule_fsts = C.CString(config.Hr.RuleFsts)
|
|
defer C.free(unsafe.Pointer(c.hr.rule_fsts))
|
|
|
|
impl := C.SherpaOnnxCreateOnlineRecognizer(&c)
|
|
if impl == nil {
|
|
return nil
|
|
}
|
|
recognizer := &OnlineRecognizer{}
|
|
recognizer.impl = impl
|
|
return recognizer
|
|
}
|
|
|
|
// Delete the internal pointer inside the stream to avoid memory leak.
|
|
func DeleteOnlineStream(stream *OnlineStream) {
|
|
C.SherpaOnnxDestroyOnlineStream(stream.impl)
|
|
stream.impl = nil
|
|
}
|
|
|
|
// The user is responsible to invoke [DeleteOnlineStream]() to free
|
|
// the returned stream to avoid memory leak
|
|
func NewOnlineStream(recognizer *OnlineRecognizer) *OnlineStream {
|
|
stream := &OnlineStream{}
|
|
stream.impl = C.SherpaOnnxCreateOnlineStream(recognizer.impl)
|
|
return stream
|
|
}
|
|
|
|
// Input audio samples for the stream.
|
|
//
|
|
// sampleRate is the actual sample rate of the input audio samples. If it
|
|
// is different from the sample rate expected by the feature extractor, we will
|
|
// do resampling inside.
|
|
//
|
|
// samples contains audio samples. Each sample is in the range [-1, 1]
|
|
func (s *OnlineStream) AcceptWaveform(sampleRate int, samples []float32) {
|
|
C.SherpaOnnxOnlineStreamAcceptWaveform(s.impl, C.int(sampleRate), (*C.float)(&samples[0]), C.int(len(samples)))
|
|
}
|
|
|
|
// Signal that there will be no incoming audio samples.
|
|
// After calling this function, you cannot call [OnlineStream.AcceptWaveform] any longer.
|
|
//
|
|
// The main purpose of this function is to flush the remaining audio samples
|
|
// buffered inside for feature extraction.
|
|
func (s *OnlineStream) InputFinished() {
|
|
C.SherpaOnnxOnlineStreamInputFinished(s.impl)
|
|
}
|
|
|
|
// Check whether the stream has enough feature frames for decoding.
|
|
// Return true if this stream is ready for decoding. Return false otherwise.
|
|
//
|
|
// You will usually use it like below:
|
|
//
|
|
// for recognizer.IsReady(s) {
|
|
// recognizer.Decode(s)
|
|
// }
|
|
func (recognizer *OnlineRecognizer) IsReady(s *OnlineStream) bool {
|
|
return C.SherpaOnnxIsOnlineStreamReady(recognizer.impl, s.impl) == 1
|
|
}
|
|
|
|
// Return true if an endpoint is detected.
|
|
//
|
|
// You usually use it like below:
|
|
//
|
|
// if recognizer.IsEndpoint(s) {
|
|
// // do your own stuff after detecting an endpoint
|
|
//
|
|
// recognizer.Reset(s)
|
|
// }
|
|
func (recognizer *OnlineRecognizer) IsEndpoint(s *OnlineStream) bool {
|
|
return C.SherpaOnnxOnlineStreamIsEndpoint(recognizer.impl, s.impl) == 1
|
|
}
|
|
|
|
// After calling this function, the internal neural network model states
|
|
// are reset and IsEndpoint(s) would return false. GetResult(s) would also
|
|
// return an empty string.
|
|
func (recognizer *OnlineRecognizer) Reset(s *OnlineStream) {
|
|
C.SherpaOnnxOnlineStreamReset(recognizer.impl, s.impl)
|
|
}
|
|
|
|
// Decode the stream. Before calling this function, you have to ensure
|
|
// that recognizer.IsReady(s) returns true. Otherwise, you will be SAD.
|
|
//
|
|
// You usually use it like below:
|
|
//
|
|
// for recognizer.IsReady(s) {
|
|
// recognizer.Decode(s)
|
|
// }
|
|
func (recognizer *OnlineRecognizer) Decode(s *OnlineStream) {
|
|
C.SherpaOnnxDecodeOnlineStream(recognizer.impl, s.impl)
|
|
}
|
|
|
|
// Decode multiple streams in parallel, i.e., in batch.
|
|
// You have to ensure that each stream is ready for decoding. Otherwise,
|
|
// you will be SAD.
|
|
func (recognizer *OnlineRecognizer) DecodeStreams(s []*OnlineStream) {
|
|
ss := make([]*C.struct_SherpaOnnxOnlineStream, len(s))
|
|
for i, v := range s {
|
|
ss[i] = v.impl
|
|
}
|
|
|
|
C.SherpaOnnxDecodeMultipleOnlineStreams(recognizer.impl, &ss[0], C.int(len(s)))
|
|
}
|
|
|
|
// Get the current result of stream since the last invoke of Reset()
|
|
func (recognizer *OnlineRecognizer) GetResult(s *OnlineStream) *OnlineRecognizerResult {
|
|
p := C.SherpaOnnxGetOnlineStreamResult(recognizer.impl, s.impl)
|
|
defer C.SherpaOnnxDestroyOnlineRecognizerResult(p)
|
|
result := &OnlineRecognizerResult{}
|
|
result.Text = C.GoString(p.text)
|
|
|
|
return result
|
|
}
|
|
|
|
// OfflineTransducerModelConfig holds the model file paths of an
// offline/non-streaming transducer.
//
// Pre-trained models can be downloaded from
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/index.html
type OfflineTransducerModelConfig struct {
	// Encoder is the path to the encoder model, i.e., encoder.onnx or
	// encoder.int8.onnx.
	Encoder string

	// Decoder is the path to the decoder model.
	Decoder string

	// Joiner is the path to the joiner model.
	Joiner string
}
|
|
|
|
// OfflineParaformerModelConfig holds the model file path of an
// offline/non-streaming paraformer.
//
// Pre-trained models can be downloaded from
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/index.html
type OfflineParaformerModelConfig struct {
	// Model is the path to the model, e.g., model.onnx or model.int8.onnx.
	Model string
}
|
|
|
|
// OfflineNemoEncDecCtcModelConfig holds the model file path of an
// offline/non-streaming NeMo CTC model.
//
// Pre-trained models can be downloaded from
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/index.html
type OfflineNemoEncDecCtcModelConfig struct {
	// Model is the path to the model, e.g., model.onnx or model.int8.onnx.
	Model string
}
|
|
|
|
// OfflineZipformerCtcModelConfig holds the model file path of a
// non-streaming Zipformer CTC model.
//
// Model documentation is available at
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html
type OfflineZipformerCtcModelConfig struct {
	// Model is the path to the model, e.g., model.onnx or model.int8.onnx.
	Model string
}
|
|
|
|
// OfflineDolphinModelConfig holds the model file path that is forwarded to
// the C API as model_config.dolphin.
type OfflineDolphinModelConfig struct {
	// Model is the path to the model, e.g., model.onnx or model.int8.onnx.
	Model string
}
|
|
|
|
// OfflineWhisperModelConfig groups the settings that are forwarded to the
// C API as model_config.whisper.
type OfflineWhisperModelConfig struct {
	// Encoder is passed through as whisper.encoder
	// (presumably the encoder model path; confirm against the C API).
	Encoder string

	// Decoder is passed through as whisper.decoder
	// (presumably the decoder model path; confirm against the C API).
	Decoder string

	// Language is passed through as whisper.language.
	Language string

	// Task is passed through as whisper.task.
	Task string

	// TailPaddings is passed through as whisper.tail_paddings.
	TailPaddings int
}
|
|
|
|
// OfflineFireRedAsrModelConfig groups the settings that are forwarded to
// the C API as model_config.fire_red_asr.
type OfflineFireRedAsrModelConfig struct {
	// Encoder is passed through as fire_red_asr.encoder.
	Encoder string

	// Decoder is passed through as fire_red_asr.decoder.
	Decoder string
}
|
|
|
|
// OfflineMoonshineModelConfig groups the settings that are forwarded to the
// C API as model_config.moonshine.
type OfflineMoonshineModelConfig struct {
	// Preprocessor is passed through as moonshine.preprocessor.
	Preprocessor string

	// Encoder is passed through as moonshine.encoder.
	Encoder string

	// UncachedDecoder is passed through as moonshine.uncached_decoder.
	UncachedDecoder string

	// CachedDecoder is passed through as moonshine.cached_decoder.
	CachedDecoder string
}
|
|
|
|
// OfflineTdnnModelConfig holds the value that is forwarded to the C API as
// model_config.tdnn.
type OfflineTdnnModelConfig struct {
	// Model is passed through as tdnn.model.
	Model string
}
|
|
|
|
// OfflineSenseVoiceModelConfig groups the settings that are forwarded to
// the C API as model_config.sense_voice.
type OfflineSenseVoiceModelConfig struct {
	// Model is passed through as sense_voice.model.
	Model string

	// Language is passed through as sense_voice.language.
	Language string

	// UseInverseTextNormalization is passed through as sense_voice.use_itn.
	UseInverseTextNormalization int
}
|
|
|
|
// OfflineLMConfig is the configuration for the offline LM.
type OfflineLMConfig struct {
	// Model is the path to the model.
	Model string

	// Scale is the scale for the LM score.
	Scale float32
}
|
|
|
|
type OfflineModelConfig struct {
|
|
Transducer OfflineTransducerModelConfig
|
|
Paraformer OfflineParaformerModelConfig
|
|
NemoCTC OfflineNemoEncDecCtcModelConfig
|
|
Whisper OfflineWhisperModelConfig
|
|
Tdnn OfflineTdnnModelConfig
|
|
SenseVoice OfflineSenseVoiceModelConfig
|
|
Moonshine OfflineMoonshineModelConfig
|
|
FireRedAsr OfflineFireRedAsrModelConfig
|
|
Dolphin OfflineDolphinModelConfig
|
|
ZipformerCtc OfflineZipformerCtcModelConfig
|
|
Tokens string // Path to tokens.txt
|
|
|
|
// Number of threads to use for neural network computation
|
|
NumThreads int
|
|
|
|
// 1 to print model meta information while loading
|
|
Debug int
|
|
|
|
// Optional. Valid values: cpu, cuda, coreml
|
|
Provider string
|
|
|
|
// Optional. Specify it for faster model initialization.
|
|
ModelType string
|
|
|
|
ModelingUnit string // Optional. cjkchar, bpe, cjkchar+bpe
|
|
BpeVocab string // Optional.
|
|
TeleSpeechCtc string // Optional.
|
|
}
|
|
|
|
// Configuration for the offline/non-streaming recognizer.
|
|
type OfflineRecognizerConfig struct {
|
|
FeatConfig FeatureConfig
|
|
ModelConfig OfflineModelConfig
|
|
LmConfig OfflineLMConfig
|
|
|
|
// Valid decoding method: greedy_search, modified_beam_search
|
|
DecodingMethod string
|
|
|
|
// Used only when DecodingMethod is modified_beam_search.
|
|
MaxActivePaths int
|
|
HotwordsFile string
|
|
HotwordsScore float32
|
|
BlankPenalty float32
|
|
RuleFsts string
|
|
RuleFars string
|
|
Hr HomophoneReplacerConfig
|
|
}
|
|
|
|
// It wraps a pointer from C
|
|
type OfflineRecognizer struct {
|
|
impl *C.struct_SherpaOnnxOfflineRecognizer
|
|
}
|
|
|
|
// It wraps a pointer from C
|
|
type OfflineStream struct {
|
|
impl *C.struct_SherpaOnnxOfflineStream
|
|
}
|
|
|
|
// OfflineRecognizerResult contains the recognition result of an offline
// stream.
type OfflineRecognizerResult struct {
	// Text is the recognized text.
	Text string

	// Tokens holds one entry per token reported by the C result.
	Tokens []string

	// Timestamps holds one entry per token. It is left nil when the C
	// result carries no timestamps.
	Timestamps []float32

	// Lang, Emotion and Event are copied from the fields of the same name
	// in the C result.
	Lang    string
	Emotion string
	Event   string
}
|
|
|
|
func newCOfflineRecognizerConfig(config *OfflineRecognizerConfig) *C.struct_SherpaOnnxOfflineRecognizerConfig {
|
|
c := C.struct_SherpaOnnxOfflineRecognizerConfig{}
|
|
c.feat_config.sample_rate = C.int(config.FeatConfig.SampleRate)
|
|
c.feat_config.feature_dim = C.int(config.FeatConfig.FeatureDim)
|
|
|
|
c.model_config.transducer.encoder = C.CString(config.ModelConfig.Transducer.Encoder)
|
|
c.model_config.transducer.decoder = C.CString(config.ModelConfig.Transducer.Decoder)
|
|
c.model_config.transducer.joiner = C.CString(config.ModelConfig.Transducer.Joiner)
|
|
|
|
c.model_config.paraformer.model = C.CString(config.ModelConfig.Paraformer.Model)
|
|
|
|
c.model_config.nemo_ctc.model = C.CString(config.ModelConfig.NemoCTC.Model)
|
|
|
|
c.model_config.whisper.encoder = C.CString(config.ModelConfig.Whisper.Encoder)
|
|
c.model_config.whisper.decoder = C.CString(config.ModelConfig.Whisper.Decoder)
|
|
c.model_config.whisper.language = C.CString(config.ModelConfig.Whisper.Language)
|
|
c.model_config.whisper.task = C.CString(config.ModelConfig.Whisper.Task)
|
|
c.model_config.whisper.tail_paddings = C.int(config.ModelConfig.Whisper.TailPaddings)
|
|
|
|
c.model_config.tdnn.model = C.CString(config.ModelConfig.Tdnn.Model)
|
|
|
|
c.model_config.sense_voice.model = C.CString(config.ModelConfig.SenseVoice.Model)
|
|
c.model_config.sense_voice.language = C.CString(config.ModelConfig.SenseVoice.Language)
|
|
c.model_config.sense_voice.use_itn = C.int(config.ModelConfig.SenseVoice.UseInverseTextNormalization)
|
|
|
|
c.model_config.moonshine.preprocessor = C.CString(config.ModelConfig.Moonshine.Preprocessor)
|
|
c.model_config.moonshine.encoder = C.CString(config.ModelConfig.Moonshine.Encoder)
|
|
c.model_config.moonshine.uncached_decoder = C.CString(config.ModelConfig.Moonshine.UncachedDecoder)
|
|
c.model_config.moonshine.cached_decoder = C.CString(config.ModelConfig.Moonshine.CachedDecoder)
|
|
|
|
c.model_config.fire_red_asr.encoder = C.CString(config.ModelConfig.FireRedAsr.Encoder)
|
|
c.model_config.fire_red_asr.decoder = C.CString(config.ModelConfig.FireRedAsr.Decoder)
|
|
|
|
c.model_config.dolphin.model = C.CString(config.ModelConfig.Dolphin.Model)
|
|
c.model_config.zipformer_ctc.model = C.CString(config.ModelConfig.ZipformerCtc.Model)
|
|
|
|
c.model_config.tokens = C.CString(config.ModelConfig.Tokens)
|
|
|
|
c.model_config.num_threads = C.int(config.ModelConfig.NumThreads)
|
|
|
|
c.model_config.debug = C.int(config.ModelConfig.Debug)
|
|
|
|
c.model_config.provider = C.CString(config.ModelConfig.Provider)
|
|
|
|
c.model_config.model_type = C.CString(config.ModelConfig.ModelType)
|
|
|
|
c.model_config.modeling_unit = C.CString(config.ModelConfig.ModelingUnit)
|
|
|
|
c.model_config.bpe_vocab = C.CString(config.ModelConfig.BpeVocab)
|
|
|
|
c.model_config.telespeech_ctc = C.CString(config.ModelConfig.TeleSpeechCtc)
|
|
|
|
c.lm_config.model = C.CString(config.LmConfig.Model)
|
|
c.lm_config.scale = C.float(config.LmConfig.Scale)
|
|
|
|
c.decoding_method = C.CString(config.DecodingMethod)
|
|
|
|
c.max_active_paths = C.int(config.MaxActivePaths)
|
|
|
|
c.hotwords_file = C.CString(config.HotwordsFile)
|
|
c.hotwords_score = C.float(config.HotwordsScore)
|
|
|
|
c.blank_penalty = C.float(config.BlankPenalty)
|
|
|
|
c.rule_fsts = C.CString(config.RuleFsts)
|
|
c.rule_fars = C.CString(config.RuleFars)
|
|
|
|
c.hr.dict_dir = C.CString(config.Hr.DictDir)
|
|
c.hr.lexicon = C.CString(config.Hr.Lexicon)
|
|
c.hr.rule_fsts = C.CString(config.Hr.RuleFsts)
|
|
return &c
|
|
}
|
|
func freeCOfflineRecognizerConfig(c *C.struct_SherpaOnnxOfflineRecognizerConfig) {
|
|
if c.model_config.transducer.encoder != nil {
|
|
C.free(unsafe.Pointer(c.model_config.transducer.encoder))
|
|
c.model_config.transducer.encoder = nil
|
|
}
|
|
if c.model_config.transducer.decoder != nil {
|
|
C.free(unsafe.Pointer(c.model_config.transducer.decoder))
|
|
c.model_config.transducer.decoder = nil
|
|
}
|
|
if c.model_config.transducer.joiner != nil {
|
|
C.free(unsafe.Pointer(c.model_config.transducer.joiner))
|
|
c.model_config.transducer.joiner = nil
|
|
}
|
|
|
|
if c.model_config.paraformer.model != nil {
|
|
C.free(unsafe.Pointer(c.model_config.paraformer.model))
|
|
c.model_config.paraformer.model = nil
|
|
}
|
|
|
|
if c.model_config.nemo_ctc.model != nil {
|
|
C.free(unsafe.Pointer(c.model_config.nemo_ctc.model))
|
|
c.model_config.nemo_ctc.model = nil
|
|
}
|
|
|
|
if c.model_config.whisper.encoder != nil {
|
|
C.free(unsafe.Pointer(c.model_config.whisper.encoder))
|
|
c.model_config.whisper.encoder = nil
|
|
}
|
|
if c.model_config.whisper.decoder != nil {
|
|
C.free(unsafe.Pointer(c.model_config.whisper.decoder))
|
|
c.model_config.whisper.decoder = nil
|
|
}
|
|
if c.model_config.whisper.language != nil {
|
|
C.free(unsafe.Pointer(c.model_config.whisper.language))
|
|
c.model_config.whisper.language = nil
|
|
}
|
|
if c.model_config.whisper.task != nil {
|
|
C.free(unsafe.Pointer(c.model_config.whisper.task))
|
|
c.model_config.whisper.task = nil
|
|
}
|
|
|
|
if c.model_config.tdnn.model != nil {
|
|
C.free(unsafe.Pointer(c.model_config.tdnn.model))
|
|
c.model_config.tdnn.model = nil
|
|
}
|
|
|
|
if c.model_config.sense_voice.model != nil {
|
|
C.free(unsafe.Pointer(c.model_config.sense_voice.model))
|
|
c.model_config.sense_voice.model = nil
|
|
}
|
|
if c.model_config.sense_voice.language != nil {
|
|
C.free(unsafe.Pointer(c.model_config.sense_voice.language))
|
|
c.model_config.sense_voice.language = nil
|
|
}
|
|
|
|
if c.model_config.moonshine.preprocessor != nil {
|
|
C.free(unsafe.Pointer(c.model_config.moonshine.preprocessor))
|
|
c.model_config.moonshine.preprocessor = nil
|
|
}
|
|
if c.model_config.moonshine.encoder != nil {
|
|
C.free(unsafe.Pointer(c.model_config.moonshine.encoder))
|
|
c.model_config.moonshine.encoder = nil
|
|
}
|
|
if c.model_config.moonshine.uncached_decoder != nil {
|
|
C.free(unsafe.Pointer(c.model_config.moonshine.uncached_decoder))
|
|
c.model_config.moonshine.uncached_decoder = nil
|
|
}
|
|
if c.model_config.moonshine.cached_decoder != nil {
|
|
C.free(unsafe.Pointer(c.model_config.moonshine.cached_decoder))
|
|
c.model_config.moonshine.cached_decoder = nil
|
|
}
|
|
|
|
if c.model_config.fire_red_asr.encoder != nil {
|
|
C.free(unsafe.Pointer(c.model_config.fire_red_asr.encoder))
|
|
c.model_config.fire_red_asr.encoder = nil
|
|
}
|
|
|
|
if c.model_config.fire_red_asr.decoder != nil {
|
|
C.free(unsafe.Pointer(c.model_config.fire_red_asr.decoder))
|
|
c.model_config.fire_red_asr.decoder = nil
|
|
}
|
|
|
|
if c.model_config.dolphin.model != nil {
|
|
C.free(unsafe.Pointer(c.model_config.dolphin.model))
|
|
c.model_config.dolphin.model = nil
|
|
}
|
|
|
|
if c.model_config.zipformer_ctc.model != nil {
|
|
C.free(unsafe.Pointer(c.model_config.zipformer_ctc.model))
|
|
c.model_config.zipformer_ctc.model = nil
|
|
}
|
|
|
|
if c.model_config.tokens != nil {
|
|
C.free(unsafe.Pointer(c.model_config.tokens))
|
|
c.model_config.tokens = nil
|
|
}
|
|
if c.model_config.provider != nil {
|
|
C.free(unsafe.Pointer(c.model_config.provider))
|
|
c.model_config.provider = nil
|
|
}
|
|
if c.model_config.model_type != nil {
|
|
C.free(unsafe.Pointer(c.model_config.model_type))
|
|
c.model_config.model_type = nil
|
|
}
|
|
if c.model_config.modeling_unit != nil {
|
|
C.free(unsafe.Pointer(c.model_config.modeling_unit))
|
|
c.model_config.modeling_unit = nil
|
|
}
|
|
if c.model_config.bpe_vocab != nil {
|
|
C.free(unsafe.Pointer(c.model_config.bpe_vocab))
|
|
c.model_config.bpe_vocab = nil
|
|
}
|
|
if c.model_config.telespeech_ctc != nil {
|
|
C.free(unsafe.Pointer(c.model_config.telespeech_ctc))
|
|
c.model_config.telespeech_ctc = nil
|
|
}
|
|
|
|
if c.lm_config.model != nil {
|
|
C.free(unsafe.Pointer(c.lm_config.model))
|
|
c.lm_config.model = nil
|
|
}
|
|
|
|
if c.decoding_method != nil {
|
|
C.free(unsafe.Pointer(c.decoding_method))
|
|
c.decoding_method = nil
|
|
}
|
|
|
|
if c.hotwords_file != nil {
|
|
C.free(unsafe.Pointer(c.hotwords_file))
|
|
c.hotwords_file = nil
|
|
}
|
|
|
|
if c.rule_fsts != nil {
|
|
C.free(unsafe.Pointer(c.rule_fsts))
|
|
c.rule_fsts = nil
|
|
}
|
|
|
|
if c.rule_fars != nil {
|
|
C.free(unsafe.Pointer(c.rule_fars))
|
|
c.rule_fars = nil
|
|
}
|
|
|
|
if c.hr.dict_dir != nil {
|
|
C.free(unsafe.Pointer(c.hr.dict_dir))
|
|
c.hr.dict_dir = nil
|
|
}
|
|
|
|
if c.hr.lexicon != nil {
|
|
C.free(unsafe.Pointer(c.hr.lexicon))
|
|
c.hr.lexicon = nil
|
|
}
|
|
|
|
if c.hr.rule_fsts != nil {
|
|
C.free(unsafe.Pointer(c.hr.rule_fsts))
|
|
c.hr.rule_fsts = nil
|
|
}
|
|
}
|
|
|
|
// Frees the internal pointer of the recognition to avoid memory leak.
|
|
func DeleteOfflineRecognizer(recognizer *OfflineRecognizer) {
|
|
C.SherpaOnnxDestroyOfflineRecognizer(recognizer.impl)
|
|
recognizer.impl = nil
|
|
}
|
|
|
|
// The user is responsible to invoke [DeleteOfflineRecognizer]() to free
|
|
// the returned recognizer to avoid memory leak
|
|
func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer {
|
|
c := newCOfflineRecognizerConfig(config)
|
|
defer freeCOfflineRecognizerConfig(c)
|
|
|
|
impl := C.SherpaOnnxCreateOfflineRecognizer(c)
|
|
if impl == nil {
|
|
return nil
|
|
}
|
|
recognizer := &OfflineRecognizer{}
|
|
recognizer.impl = impl
|
|
|
|
return recognizer
|
|
}
|
|
|
|
// Set new config to replace
|
|
func (r *OfflineRecognizer) SetConfig(config *OfflineRecognizerConfig) {
|
|
c := newCOfflineRecognizerConfig(config)
|
|
defer freeCOfflineRecognizerConfig(c)
|
|
|
|
C.SherpaOnnxOfflineRecognizerSetConfig(r.impl, c)
|
|
}
|
|
|
|
// Frees the internal pointer of the stream to avoid memory leak.
|
|
func DeleteOfflineStream(stream *OfflineStream) {
|
|
C.SherpaOnnxDestroyOfflineStream(stream.impl)
|
|
stream.impl = nil
|
|
}
|
|
|
|
// The user is responsible to invoke [DeleteOfflineStream]() to free
|
|
// the returned stream to avoid memory leak
|
|
func NewOfflineStream(recognizer *OfflineRecognizer) *OfflineStream {
|
|
stream := &OfflineStream{}
|
|
stream.impl = C.SherpaOnnxCreateOfflineStream(recognizer.impl)
|
|
return stream
|
|
}
|
|
|
|
// Input audio samples for the offline stream.
|
|
// Please only call it once. That is, input all samples at once.
|
|
//
|
|
// sampleRate is the sample rate of the input audio samples. If it is different
|
|
// from the value expected by the feature extractor, we will do resampling inside.
|
|
//
|
|
// samples contains the actual audio samples. Each sample is in the range [-1, 1].
|
|
func (s *OfflineStream) AcceptWaveform(sampleRate int, samples []float32) {
|
|
C.SherpaOnnxAcceptWaveformOffline(s.impl, C.int(sampleRate), (*C.float)(&samples[0]), C.int(len(samples)))
|
|
}
|
|
|
|
// Decode the offline stream.
|
|
func (recognizer *OfflineRecognizer) Decode(s *OfflineStream) {
|
|
C.SherpaOnnxDecodeOfflineStream(recognizer.impl, s.impl)
|
|
}
|
|
|
|
// Decode multiple streams in parallel, i.e., in batch.
|
|
func (recognizer *OfflineRecognizer) DecodeStreams(s []*OfflineStream) {
|
|
ss := make([]*C.struct_SherpaOnnxOfflineStream, len(s))
|
|
for i, v := range s {
|
|
ss[i] = v.impl
|
|
}
|
|
|
|
C.SherpaOnnxDecodeMultipleOfflineStreams(recognizer.impl, &ss[0], C.int(len(s)))
|
|
}
|
|
|
|
// Get the recognition result of the offline stream.
|
|
func (s *OfflineStream) GetResult() *OfflineRecognizerResult {
|
|
p := C.SherpaOnnxGetOfflineStreamResult(s.impl)
|
|
defer C.SherpaOnnxDestroyOfflineRecognizerResult(p)
|
|
n := int(p.count)
|
|
if n == 0 {
|
|
return nil
|
|
}
|
|
result := &OfflineRecognizerResult{}
|
|
result.Text = C.GoString(p.text)
|
|
result.Lang = C.GoString(p.lang)
|
|
result.Emotion = C.GoString(p.emotion)
|
|
result.Event = C.GoString(p.event)
|
|
result.Tokens = make([]string, n)
|
|
tokens := unsafe.Slice(p.tokens_arr, n)
|
|
for i := 0; i < n; i++ {
|
|
result.Tokens[i] = C.GoString(tokens[i])
|
|
}
|
|
if p.timestamps == nil {
|
|
return result
|
|
}
|
|
result.Timestamps = make([]float32, n)
|
|
timestamps := unsafe.Slice(p.timestamps, n)
|
|
for i := 0; i < n; i++ {
|
|
result.Timestamps[i] = float32(timestamps[i])
|
|
}
|
|
return result
|
|
}
|
|
|
|
// OfflineTtsVitsModelConfig is the configuration for
// offline/non-streaming text-to-speech (TTS) with VITS models.
//
// Pre-trained models can be downloaded from
// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html
type OfflineTtsVitsModelConfig struct {
	// Model is the path to the VITS onnx model.
	Model string

	// Lexicon is the path to lexicon.txt.
	Lexicon string

	// Tokens is the path to tokens.txt.
	Tokens string

	// DataDir is the path to the espeak-ng-data directory.
	DataDir string

	// NoiseScale is the noise scale for vits models. Please use 0.667 in
	// general.
	NoiseScale float32

	// NoiseScaleW is the noise scale for vits models. Please use 0.8 in
	// general.
	NoiseScaleW float32

	// LengthScale controls the speech speed. Please use 1.0 in general.
	// Smaller -> faster speech speed. Larger -> slower speech speed.
	LengthScale float32

	// DictDir is the path to the dict directory for jieba (used only in
	// Chinese tts).
	DictDir string
}
|
|
|
|
// OfflineTtsMatchaModelConfig configures a MatchaTTS model for offline
// text-to-speech.
type OfflineTtsMatchaModelConfig struct {
	// Path to the acoustic model for MatchaTTS
	AcousticModel string

	// Path to the vocoder model for MatchaTTS
	Vocoder string

	// Path to lexicon.txt
	Lexicon string

	// Path to tokens.txt
	Tokens string

	// Path to espeak-ng-data directory
	DataDir string

	// noise scale for vits models. Please use 0.667 in general
	NoiseScale float32

	// Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed
	LengthScale float32

	// Path to dict directory for jieba (used only in Chinese tts)
	DictDir string
}
|
|
|
|
// OfflineTtsKokoroModelConfig configures a kokoro model for offline
// text-to-speech.
type OfflineTtsKokoroModelConfig struct {
	// Path to the model for kokoro
	Model string

	// Path to the voices.bin for kokoro
	Voices string

	// Path to tokens.txt
	Tokens string

	// Path to espeak-ng-data directory
	DataDir string

	// Path to dict directory
	DictDir string

	// Path to lexicon files
	Lexicon string

	// Example: es for Spanish, fr-fr for French. Can be empty
	Lang string

	// Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed
	LengthScale float32
}
|
|
|
|
type OfflineTtsModelConfig struct {
|
|
Vits OfflineTtsVitsModelConfig
|
|
Matcha OfflineTtsMatchaModelConfig
|
|
Kokoro OfflineTtsKokoroModelConfig
|
|
|
|
// Number of threads to use for neural network computation
|
|
NumThreads int
|
|
|
|
// 1 to print model meta information while loading
|
|
Debug int
|
|
|
|
// Optional. Valid values: cpu, cuda, coreml
|
|
Provider string
|
|
}
|
|
|
|
type OfflineTtsConfig struct {
|
|
Model OfflineTtsModelConfig
|
|
RuleFsts string
|
|
RuleFars string
|
|
MaxNumSentences int
|
|
SilenceScale float32
|
|
}
|
|
|
|
// GeneratedAudio holds single-channel audio samples together with their
// sample rate.
type GeneratedAudio struct {
	// Samples holds normalized samples in the range [-1, 1]
	Samples []float32

	// SampleRate is the sample rate reported alongside Samples.
	SampleRate int
}
|
|
|
|
// The offline tts class. It wraps a pointer from C.
|
|
type OfflineTts struct {
|
|
impl *C.struct_SherpaOnnxOfflineTts
|
|
}
|
|
|
|
// sherpaOnnxGeneratedAudioCallbackWithArg is the Go-side callback type that
// receives a slice of generated samples.
type sherpaOnnxGeneratedAudioCallbackWithArg func(samples []float32)
|
|
|
|
//export _cgoGeneratedAudioCallback
|
|
func _cgoGeneratedAudioCallback(samples *C.float, n C.int32_t, arg unsafe.Pointer) C.int32_t {
|
|
h := *(*cgo.Handle)(arg)
|
|
val := h.Value().(sherpaOnnxGeneratedAudioCallbackWithArg)
|
|
all := make([]float32, n)
|
|
arr := unsafe.Slice(samples, n)
|
|
for i := 0; i < int(n); i++ {
|
|
all[i] = float32(arr[i])
|
|
}
|
|
val(all)
|
|
return 1
|
|
}
|
|
|
|
// sherpaOnnxGeneratedAudioProgressCallbackWithArg is the Go-side callback
// type that receives a slice of generated samples plus a float32 value p
// forwarded from the C progress callback.
type sherpaOnnxGeneratedAudioProgressCallbackWithArg func(samples []float32, p float32)
|
|
|
|
//export _cgoGeneratedAudioProgressCallback
|
|
func _cgoGeneratedAudioProgressCallback(samples *C.float, n C.int32_t, p C.float, arg unsafe.Pointer) C.int32_t {
|
|
h := *(*cgo.Handle)(arg)
|
|
val := h.Value().(sherpaOnnxGeneratedAudioProgressCallbackWithArg)
|
|
all := make([]float32, n)
|
|
arr := unsafe.Slice(samples, n)
|
|
for i := 0; i < int(n); i++ {
|
|
all[i] = float32(arr[i])
|
|
}
|
|
val(all, float32(p))
|
|
return 1
|
|
}
|
|
|
|
// Free the internal pointer inside the tts to avoid memory leak.
|
|
func DeleteOfflineTts(tts *OfflineTts) {
|
|
C.SherpaOnnxDestroyOfflineTts(tts.impl)
|
|
tts.impl = nil
|
|
}
|
|
|
|
// The user is responsible to invoke [DeleteOfflineTts]() to free
|
|
// the returned tts to avoid memory leak
|
|
func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
|
|
c := C.struct_SherpaOnnxOfflineTtsConfig{}
|
|
|
|
c.rule_fsts = C.CString(config.RuleFsts)
|
|
defer C.free(unsafe.Pointer(c.rule_fsts))
|
|
|
|
c.rule_fars = C.CString(config.RuleFars)
|
|
defer C.free(unsafe.Pointer(c.rule_fars))
|
|
|
|
c.max_num_sentences = C.int(config.MaxNumSentences)
|
|
c.silence_scale = C.float(config.SilenceScale)
|
|
|
|
// vits
|
|
c.model.vits.model = C.CString(config.Model.Vits.Model)
|
|
defer C.free(unsafe.Pointer(c.model.vits.model))
|
|
|
|
c.model.vits.lexicon = C.CString(config.Model.Vits.Lexicon)
|
|
defer C.free(unsafe.Pointer(c.model.vits.lexicon))
|
|
|
|
c.model.vits.tokens = C.CString(config.Model.Vits.Tokens)
|
|
defer C.free(unsafe.Pointer(c.model.vits.tokens))
|
|
|
|
c.model.vits.data_dir = C.CString(config.Model.Vits.DataDir)
|
|
defer C.free(unsafe.Pointer(c.model.vits.data_dir))
|
|
|
|
c.model.vits.noise_scale = C.float(config.Model.Vits.NoiseScale)
|
|
c.model.vits.noise_scale_w = C.float(config.Model.Vits.NoiseScaleW)
|
|
c.model.vits.length_scale = C.float(config.Model.Vits.LengthScale)
|
|
|
|
c.model.vits.dict_dir = C.CString(config.Model.Vits.DictDir)
|
|
defer C.free(unsafe.Pointer(c.model.vits.dict_dir))
|
|
|
|
// matcha
|
|
c.model.matcha.acoustic_model = C.CString(config.Model.Matcha.AcousticModel)
|
|
defer C.free(unsafe.Pointer(c.model.matcha.acoustic_model))
|
|
|
|
c.model.matcha.vocoder = C.CString(config.Model.Matcha.Vocoder)
|
|
defer C.free(unsafe.Pointer(c.model.matcha.vocoder))
|
|
|
|
c.model.matcha.lexicon = C.CString(config.Model.Matcha.Lexicon)
|
|
defer C.free(unsafe.Pointer(c.model.matcha.lexicon))
|
|
|
|
c.model.matcha.tokens = C.CString(config.Model.Matcha.Tokens)
|
|
defer C.free(unsafe.Pointer(c.model.matcha.tokens))
|
|
|
|
c.model.matcha.data_dir = C.CString(config.Model.Matcha.DataDir)
|
|
defer C.free(unsafe.Pointer(c.model.matcha.data_dir))
|
|
|
|
c.model.matcha.noise_scale = C.float(config.Model.Matcha.NoiseScale)
|
|
c.model.matcha.length_scale = C.float(config.Model.Matcha.LengthScale)
|
|
|
|
c.model.matcha.dict_dir = C.CString(config.Model.Matcha.DictDir)
|
|
defer C.free(unsafe.Pointer(c.model.matcha.dict_dir))
|
|
|
|
// kokoro
|
|
c.model.kokoro.model = C.CString(config.Model.Kokoro.Model)
|
|
defer C.free(unsafe.Pointer(c.model.kokoro.model))
|
|
|
|
c.model.kokoro.voices = C.CString(config.Model.Kokoro.Voices)
|
|
defer C.free(unsafe.Pointer(c.model.kokoro.voices))
|
|
|
|
c.model.kokoro.tokens = C.CString(config.Model.Kokoro.Tokens)
|
|
defer C.free(unsafe.Pointer(c.model.kokoro.tokens))
|
|
|
|
c.model.kokoro.data_dir = C.CString(config.Model.Kokoro.DataDir)
|
|
defer C.free(unsafe.Pointer(c.model.kokoro.data_dir))
|
|
|
|
c.model.kokoro.dict_dir = C.CString(config.Model.Kokoro.DictDir)
|
|
defer C.free(unsafe.Pointer(c.model.kokoro.dict_dir))
|
|
|
|
c.model.kokoro.lexicon = C.CString(config.Model.Kokoro.Lexicon)
|
|
defer C.free(unsafe.Pointer(c.model.kokoro.lexicon))
|
|
|
|
c.model.kokoro.lang = C.CString(config.Model.Kokoro.Lang)
|
|
defer C.free(unsafe.Pointer(c.model.kokoro.lang))
|
|
|
|
c.model.kokoro.length_scale = C.float(config.Model.Kokoro.LengthScale)
|
|
|
|
c.model.num_threads = C.int(config.Model.NumThreads)
|
|
c.model.debug = C.int(config.Model.Debug)
|
|
|
|
c.model.provider = C.CString(config.Model.Provider)
|
|
defer C.free(unsafe.Pointer(c.model.provider))
|
|
|
|
impl := C.SherpaOnnxCreateOfflineTts(&c)
|
|
if impl == nil {
|
|
return nil
|
|
}
|
|
tts := &OfflineTts{}
|
|
tts.impl = impl
|
|
return tts
|
|
}
|
|
|
|
func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedAudio {
|
|
s := C.CString(text)
|
|
defer C.free(unsafe.Pointer(s))
|
|
|
|
audio := C.SherpaOnnxOfflineTtsGenerate(tts.impl, s, C.int(sid), C.float(speed))
|
|
defer C.SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio)
|
|
|
|
ans := &GeneratedAudio{}
|
|
ans.SampleRate = int(audio.sample_rate)
|
|
n := int(audio.n)
|
|
ans.Samples = make([]float32, n)
|
|
|
|
// see https://stackoverflow.com/questions/48756732/what-does-1-30c-yourtype-do-exactly-in-cgo
|
|
// :n:n means 0:n:n, means low:high:capacity
|
|
samples := unsafe.Slice(audio.samples, n)
|
|
for i := 0; i < n; i++ {
|
|
ans.Samples[i] = float32(samples[i])
|
|
}
|
|
|
|
return ans
|
|
}
|
|
|
|
func (tts *OfflineTts) GenerateWithCallback(text string, sid int, speed float32, cb sherpaOnnxGeneratedAudioCallbackWithArg) {
|
|
s := C.CString(text)
|
|
defer C.free(unsafe.Pointer(s))
|
|
|
|
h := cgo.NewHandle(cb)
|
|
defer h.Delete()
|
|
audio := C.SherpaOnnxOfflineTtsGenerateWithCallbackWithArg(tts.impl, s, C.int(sid), C.float(speed), C.SherpaOnnxGeneratedAudioCallbackWithArg(C._cgoGeneratedAudioCallback), unsafe.Pointer(&h))
|
|
defer C.SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio)
|
|
}
|
|
|
|
func (tts *OfflineTts) GenerateWithProgressCallback(text string, sid int, speed float32, cb sherpaOnnxGeneratedAudioProgressCallbackWithArg) {
|
|
s := C.CString(text)
|
|
defer C.free(unsafe.Pointer(s))
|
|
|
|
h := cgo.NewHandle(cb)
|
|
defer h.Delete()
|
|
audio := C.SherpaOnnxOfflineTtsGenerateWithProgressCallbackWithArg(tts.impl, s, C.int(sid), C.float(speed), C.SherpaOnnxGeneratedAudioProgressCallbackWithArg(C._cgoGeneratedAudioProgressCallback), unsafe.Pointer(&h))
|
|
defer C.SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio)
|
|
}
|
|
|
|
func (audio *GeneratedAudio) Save(filename string) bool {
|
|
s := C.CString(filename)
|
|
defer C.free(unsafe.Pointer(s))
|
|
|
|
ok := int(C.SherpaOnnxWriteWave((*C.float)(&audio.Samples[0]), C.int(len(audio.Samples)), C.int(audio.SampleRate), s))
|
|
|
|
return ok == 1
|
|
}
|
|
|
|
// ============================================================
|
|
// For VAD
|
|
// ============================================================
|
|
// SileroVadModelConfig holds the silero VAD settings that
// NewVoiceActivityDetector copies into the C configuration.
type SileroVadModelConfig struct {
	// Model is handed to the C API as silero_vad.model
	// (presumably the path to the model file).
	Model string

	// Threshold is handed to the C API as silero_vad.threshold.
	Threshold float32

	// MinSilenceDuration is handed to the C API as
	// silero_vad.min_silence_duration.
	MinSilenceDuration float32

	// MinSpeechDuration is handed to the C API as
	// silero_vad.min_speech_duration.
	MinSpeechDuration float32

	// WindowSize is handed to the C API as silero_vad.window_size.
	WindowSize int

	// MaxSpeechDuration is handed to the C API as
	// silero_vad.max_speech_duration.
	MaxSpeechDuration float32
}
|
|
|
|
type VadModelConfig struct {
|
|
SileroVad SileroVadModelConfig
|
|
SampleRate int
|
|
NumThreads int
|
|
Provider string
|
|
Debug int
|
|
}
|
|
|
|
type CircularBuffer struct {
|
|
impl *C.struct_SherpaOnnxCircularBuffer
|
|
}
|
|
|
|
func DeleteCircularBuffer(buffer *CircularBuffer) {
|
|
C.SherpaOnnxDestroyCircularBuffer(buffer.impl)
|
|
buffer.impl = nil
|
|
}
|
|
|
|
func NewCircularBuffer(capacity int) *CircularBuffer {
|
|
circularBuffer := &CircularBuffer{}
|
|
circularBuffer.impl = C.SherpaOnnxCreateCircularBuffer(C.int(capacity))
|
|
return circularBuffer
|
|
}
|
|
|
|
func (buffer *CircularBuffer) Push(samples []float32) {
|
|
C.SherpaOnnxCircularBufferPush(buffer.impl, (*C.float)(&samples[0]), C.int(len(samples)))
|
|
}
|
|
|
|
func (buffer *CircularBuffer) Get(start int, n int) []float32 {
|
|
samples := C.SherpaOnnxCircularBufferGet(buffer.impl, C.int(start), C.int(n))
|
|
defer C.SherpaOnnxCircularBufferFree(samples)
|
|
|
|
result := make([]float32, n)
|
|
|
|
p := unsafe.Slice(samples, n)
|
|
for i := 0; i < n; i++ {
|
|
result[i] = float32(p[i])
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
func (buffer *CircularBuffer) Pop(n int) {
|
|
C.SherpaOnnxCircularBufferPop(buffer.impl, C.int(n))
|
|
}
|
|
|
|
func (buffer *CircularBuffer) Size() int {
|
|
return int(C.SherpaOnnxCircularBufferSize(buffer.impl))
|
|
}
|
|
|
|
func (buffer *CircularBuffer) Head() int {
|
|
return int(C.SherpaOnnxCircularBufferHead(buffer.impl))
|
|
}
|
|
|
|
func (buffer *CircularBuffer) Reset() {
|
|
C.SherpaOnnxCircularBufferReset(buffer.impl)
|
|
}
|
|
|
|
// SpeechSegment is a Go-owned copy of a segment returned by the voice
// activity detector.
type SpeechSegment struct {
	// Start is the segment's start value as reported by the C API
	// (presumably a sample index; verify against the C header).
	Start int

	// Samples holds the segment's audio samples.
	Samples []float32
}
|
|
|
|
type VoiceActivityDetector struct {
|
|
impl *C.struct_SherpaOnnxVoiceActivityDetector
|
|
}
|
|
|
|
func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float32) *VoiceActivityDetector {
|
|
c := C.struct_SherpaOnnxVadModelConfig{}
|
|
|
|
c.silero_vad.model = C.CString(config.SileroVad.Model)
|
|
defer C.free(unsafe.Pointer(c.silero_vad.model))
|
|
|
|
c.silero_vad.threshold = C.float(config.SileroVad.Threshold)
|
|
c.silero_vad.min_silence_duration = C.float(config.SileroVad.MinSilenceDuration)
|
|
c.silero_vad.min_speech_duration = C.float(config.SileroVad.MinSpeechDuration)
|
|
c.silero_vad.window_size = C.int(config.SileroVad.WindowSize)
|
|
c.silero_vad.max_speech_duration = C.float(config.SileroVad.MaxSpeechDuration)
|
|
|
|
c.sample_rate = C.int(config.SampleRate)
|
|
c.num_threads = C.int(config.NumThreads)
|
|
c.provider = C.CString(config.Provider)
|
|
defer C.free(unsafe.Pointer(c.provider))
|
|
|
|
c.debug = C.int(config.Debug)
|
|
|
|
impl := C.SherpaOnnxCreateVoiceActivityDetector(&c, C.float(bufferSizeInSeconds))
|
|
if impl == nil {
|
|
return nil
|
|
}
|
|
vad := &VoiceActivityDetector{}
|
|
vad.impl = impl
|
|
return vad
|
|
}
|
|
|
|
func DeleteVoiceActivityDetector(vad *VoiceActivityDetector) {
|
|
C.SherpaOnnxDestroyVoiceActivityDetector(vad.impl)
|
|
vad.impl = nil
|
|
}
|
|
|
|
func (vad *VoiceActivityDetector) AcceptWaveform(samples []float32) {
|
|
C.SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad.impl, (*C.float)(&samples[0]), C.int(len(samples)))
|
|
}
|
|
|
|
func (vad *VoiceActivityDetector) IsEmpty() bool {
|
|
return int(C.SherpaOnnxVoiceActivityDetectorEmpty(vad.impl)) == 1
|
|
}
|
|
|
|
func (vad *VoiceActivityDetector) IsSpeech() bool {
|
|
return int(C.SherpaOnnxVoiceActivityDetectorDetected(vad.impl)) == 1
|
|
}
|
|
|
|
func (vad *VoiceActivityDetector) Pop() {
|
|
C.SherpaOnnxVoiceActivityDetectorPop(vad.impl)
|
|
}
|
|
|
|
func (vad *VoiceActivityDetector) Clear() {
|
|
C.SherpaOnnxVoiceActivityDetectorClear(vad.impl)
|
|
}
|
|
|
|
func (vad *VoiceActivityDetector) Front() *SpeechSegment {
|
|
f := C.SherpaOnnxVoiceActivityDetectorFront(vad.impl)
|
|
defer C.SherpaOnnxDestroySpeechSegment(f)
|
|
|
|
ans := &SpeechSegment{}
|
|
ans.Start = int(f.start)
|
|
|
|
n := int(f.n)
|
|
ans.Samples = make([]float32, n)
|
|
|
|
samples := unsafe.Slice(f.samples, n)
|
|
|
|
for i := 0; i < n; i++ {
|
|
ans.Samples[i] = float32(samples[i])
|
|
}
|
|
|
|
return ans
|
|
}
|
|
|
|
func (vad *VoiceActivityDetector) Reset() {
|
|
C.SherpaOnnxVoiceActivityDetectorReset(vad.impl)
|
|
}
|
|
|
|
func (vad *VoiceActivityDetector) Flush() {
|
|
C.SherpaOnnxVoiceActivityDetectorFlush(vad.impl)
|
|
}
|
|
|
|
// Spoken language identification
|
|
|
|
// SpokenLanguageIdentificationWhisperConfig holds the whisper settings that
// NewSpokenLanguageIdentification copies into the C configuration.
type SpokenLanguageIdentificationWhisperConfig struct {
	// Encoder is handed to the C API as whisper.encoder
	// (presumably the path to the encoder model).
	Encoder string

	// Decoder is handed to the C API as whisper.decoder
	// (presumably the path to the decoder model).
	Decoder string

	// TailPaddings is handed to the C API as whisper.tail_paddings.
	TailPaddings int
}
|
|
|
|
type SpokenLanguageIdentificationConfig struct {
|
|
Whisper SpokenLanguageIdentificationWhisperConfig
|
|
NumThreads int
|
|
Debug int
|
|
Provider string
|
|
}
|
|
|
|
type SpokenLanguageIdentification struct {
|
|
impl *C.struct_SherpaOnnxSpokenLanguageIdentification
|
|
}
|
|
|
|
// SpokenLanguageIdentificationResult is the result of spoken language
// identification.
type SpokenLanguageIdentificationResult struct {
	// Lang is the language string copied from the C result.
	Lang string
}
|
|
|
|
func NewSpokenLanguageIdentification(config *SpokenLanguageIdentificationConfig) *SpokenLanguageIdentification {
|
|
c := C.struct_SherpaOnnxSpokenLanguageIdentificationConfig{}
|
|
|
|
c.whisper.encoder = C.CString(config.Whisper.Encoder)
|
|
defer C.free(unsafe.Pointer(c.whisper.encoder))
|
|
|
|
c.whisper.decoder = C.CString(config.Whisper.Decoder)
|
|
defer C.free(unsafe.Pointer(c.whisper.decoder))
|
|
|
|
c.whisper.tail_paddings = C.int(config.Whisper.TailPaddings)
|
|
|
|
c.num_threads = C.int(config.NumThreads)
|
|
c.debug = C.int(config.Debug)
|
|
|
|
c.provider = C.CString(config.Provider)
|
|
defer C.free(unsafe.Pointer(c.provider))
|
|
|
|
slid := &SpokenLanguageIdentification{}
|
|
slid.impl = C.SherpaOnnxCreateSpokenLanguageIdentification(&c)
|
|
|
|
return slid
|
|
}
|
|
|
|
func DeleteSpokenLanguageIdentification(slid *SpokenLanguageIdentification) {
|
|
C.SherpaOnnxDestroySpokenLanguageIdentification(slid.impl)
|
|
slid.impl = nil
|
|
}
|
|
|
|
// The user has to invoke DeleteOfflineStream() to free the returned value
|
|
// to avoid memory leak
|
|
func (slid *SpokenLanguageIdentification) CreateStream() *OfflineStream {
|
|
stream := &OfflineStream{}
|
|
stream.impl = C.SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream(slid.impl)
|
|
return stream
|
|
}
|
|
|
|
func (slid *SpokenLanguageIdentification) Compute(stream *OfflineStream) *SpokenLanguageIdentificationResult {
|
|
r := C.SherpaOnnxSpokenLanguageIdentificationCompute(slid.impl, stream.impl)
|
|
// defer C.SherpaOnnxDestroySpokenLanguageIdentificationResult(r)
|
|
|
|
ans := &SpokenLanguageIdentificationResult{}
|
|
ans.Lang = C.GoString(r.lang)
|
|
|
|
return ans
|
|
}
|
|
|
|
// ============================================================
|
|
// For speaker embedding extraction
|
|
// ============================================================
|
|
|
|
// SpeakerEmbeddingExtractorConfig is the configuration accepted by
// NewSpeakerEmbeddingExtractor.
type SpeakerEmbeddingExtractorConfig struct {
	// Model is handed to the C API as model
	// (presumably the path to the model file).
	Model string

	// NumThreads is handed to the C API as num_threads.
	NumThreads int

	// Debug is handed to the C API as debug.
	Debug int

	// Provider is handed to the C API as provider.
	Provider string
}
|
|
|
|
type SpeakerEmbeddingExtractor struct {
|
|
impl *C.struct_SherpaOnnxSpeakerEmbeddingExtractor
|
|
}
|
|
|
|
// The user has to invoke [DeleteSpeakerEmbeddingExtractor]() to free the returned value
|
|
// to avoid memory leak
|
|
func NewSpeakerEmbeddingExtractor(config *SpeakerEmbeddingExtractorConfig) *SpeakerEmbeddingExtractor {
|
|
c := C.struct_SherpaOnnxSpeakerEmbeddingExtractorConfig{}
|
|
|
|
c.model = C.CString(config.Model)
|
|
defer C.free(unsafe.Pointer(c.model))
|
|
|
|
c.num_threads = C.int(config.NumThreads)
|
|
c.debug = C.int(config.Debug)
|
|
|
|
c.provider = C.CString(config.Provider)
|
|
defer C.free(unsafe.Pointer(c.provider))
|
|
|
|
impl := C.SherpaOnnxCreateSpeakerEmbeddingExtractor(&c)
|
|
if impl == nil {
|
|
return nil
|
|
}
|
|
ex := &SpeakerEmbeddingExtractor{}
|
|
ex.impl = impl
|
|
return ex
|
|
}
|
|
|
|
func DeleteSpeakerEmbeddingExtractor(ex *SpeakerEmbeddingExtractor) {
|
|
C.SherpaOnnxDestroySpeakerEmbeddingExtractor(ex.impl)
|
|
ex.impl = nil
|
|
}
|
|
|
|
func (ex *SpeakerEmbeddingExtractor) Dim() int {
|
|
return int(C.SherpaOnnxSpeakerEmbeddingExtractorDim(ex.impl))
|
|
}
|
|
|
|
// The user is responsible to invoke [DeleteOnlineStream]() to free
|
|
// the returned stream to avoid memory leak
|
|
func (ex *SpeakerEmbeddingExtractor) CreateStream() *OnlineStream {
|
|
stream := &OnlineStream{}
|
|
stream.impl = C.SherpaOnnxSpeakerEmbeddingExtractorCreateStream(ex.impl)
|
|
return stream
|
|
}
|
|
|
|
func (ex *SpeakerEmbeddingExtractor) IsReady(stream *OnlineStream) bool {
|
|
return int(C.SherpaOnnxSpeakerEmbeddingExtractorIsReady(ex.impl, stream.impl)) == 1
|
|
}
|
|
|
|
func (ex *SpeakerEmbeddingExtractor) Compute(stream *OnlineStream) []float32 {
|
|
embedding := C.SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding(ex.impl, stream.impl)
|
|
defer C.SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(embedding)
|
|
|
|
n := ex.Dim()
|
|
ans := make([]float32, n)
|
|
|
|
// see https://stackoverflow.com/questions/48756732/what-does-1-30c-yourtype-do-exactly-in-cgo
|
|
// :n:n means 0:n:n, means low:high:capacity
|
|
c := unsafe.Slice(embedding, n)
|
|
|
|
for i := 0; i < n; i++ {
|
|
ans[i] = float32(c[i])
|
|
}
|
|
|
|
return ans
|
|
}
|
|
|
|
type SpeakerEmbeddingManager struct {
|
|
impl *C.struct_SherpaOnnxSpeakerEmbeddingManager
|
|
}
|
|
|
|
// The user has to invoke [DeleteSpeakerEmbeddingManager]() to free the returned
|
|
// value to avoid memory leak
|
|
func NewSpeakerEmbeddingManager(dim int) *SpeakerEmbeddingManager {
|
|
impl := C.SherpaOnnxCreateSpeakerEmbeddingManager(C.int(dim))
|
|
if impl == nil {
|
|
return nil
|
|
}
|
|
m := &SpeakerEmbeddingManager{}
|
|
m.impl = impl
|
|
return m
|
|
}
|
|
|
|
func DeleteSpeakerEmbeddingManager(m *SpeakerEmbeddingManager) {
|
|
C.SherpaOnnxDestroySpeakerEmbeddingManager(m.impl)
|
|
m.impl = nil
|
|
}
|
|
|
|
func (m *SpeakerEmbeddingManager) Register(name string, embedding []float32) bool {
|
|
s := C.CString(name)
|
|
defer C.free(unsafe.Pointer(s))
|
|
|
|
return C.int(C.SherpaOnnxSpeakerEmbeddingManagerAdd(m.impl, s, (*C.float)(&embedding[0]))) == 1
|
|
}
|
|
|
|
func (m *SpeakerEmbeddingManager) RegisterV(name string, embeddings [][]float32) bool {
|
|
s := C.CString(name)
|
|
defer C.free(unsafe.Pointer(s))
|
|
|
|
if len(embeddings) == 0 {
|
|
return false
|
|
}
|
|
|
|
dim := len(embeddings[0])
|
|
v := make([]float32, 0, dim*len(embeddings))
|
|
for _, embedding := range embeddings {
|
|
v = append(v, embedding...)
|
|
}
|
|
|
|
return C.int(C.SherpaOnnxSpeakerEmbeddingManagerAddListFlattened(m.impl, s, (*C.float)(&v[0]), C.int(len(embeddings)))) == 1
|
|
}
|
|
|
|
func (m *SpeakerEmbeddingManager) Remove(name string) bool {
|
|
s := C.CString(name)
|
|
defer C.free(unsafe.Pointer(s))
|
|
|
|
return C.int(C.SherpaOnnxSpeakerEmbeddingManagerRemove(m.impl, s)) == 1
|
|
}
|
|
|
|
func (m *SpeakerEmbeddingManager) Search(embedding []float32, threshold float32) string {
|
|
var s string
|
|
|
|
name := C.SherpaOnnxSpeakerEmbeddingManagerSearch(m.impl, (*C.float)(&embedding[0]), C.float(threshold))
|
|
defer C.SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name)
|
|
|
|
if name != nil {
|
|
s = C.GoString(name)
|
|
}
|
|
|
|
return s
|
|
}
|
|
|
|
func (m *SpeakerEmbeddingManager) Verify(name string, embedding []float32, threshold float32) bool {
|
|
s := C.CString(name)
|
|
defer C.free(unsafe.Pointer(s))
|
|
|
|
return C.int(C.SherpaOnnxSpeakerEmbeddingManagerVerify(m.impl, s, (*C.float)(&embedding[0]), C.float(threshold))) == 1
|
|
}
|
|
|
|
func (m *SpeakerEmbeddingManager) Contains(name string) bool {
|
|
s := C.CString(name)
|
|
defer C.free(unsafe.Pointer(s))
|
|
|
|
return C.int(C.SherpaOnnxSpeakerEmbeddingManagerContains(m.impl, s)) == 1
|
|
}
|
|
|
|
func (m *SpeakerEmbeddingManager) NumSpeakers() int {
|
|
return int(C.SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(m.impl))
|
|
}
|
|
|
|
func (m *SpeakerEmbeddingManager) AllSpeakers() []string {
|
|
all_speakers := C.SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(m.impl)
|
|
defer C.SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(all_speakers)
|
|
|
|
n := m.NumSpeakers()
|
|
if n == 0 {
|
|
return nil
|
|
}
|
|
|
|
// https://stackoverflow.com/questions/62012070/convert-array-of-strings-from-cgo-in-go
|
|
p := unsafe.Slice(all_speakers, n)
|
|
|
|
ans := make([]string, n)
|
|
|
|
for i := 0; i < n; i++ {
|
|
ans[i] = C.GoString(p[i])
|
|
}
|
|
|
|
return ans
|
|
}
|
|
|
|
// Wave
|
|
|
|
// single channel wave
|
|
type Wave = GeneratedAudio
|
|
|
|
func ReadWave(filename string) *Wave {
|
|
s := C.CString(filename)
|
|
defer C.free(unsafe.Pointer(s))
|
|
|
|
w := C.SherpaOnnxReadWave(s)
|
|
defer C.SherpaOnnxFreeWave(w)
|
|
|
|
if w == nil {
|
|
return nil
|
|
}
|
|
|
|
n := int(w.num_samples)
|
|
if n == 0 {
|
|
return nil
|
|
}
|
|
|
|
ans := &Wave{}
|
|
ans.SampleRate = int(w.sample_rate)
|
|
samples := unsafe.Slice(w.samples, n)
|
|
|
|
ans.Samples = make([]float32, n)
|
|
|
|
for i := 0; i < n; i++ {
|
|
ans.Samples[i] = float32(samples[i])
|
|
}
|
|
|
|
return ans
|
|
}
|
|
|
|
// ============================================================
|
|
// For offline speaker diarization
|
|
// ============================================================
|
|
// OfflineSpeakerSegmentationPyannoteModelConfig holds the pyannote
// segmentation model setting used for offline speaker diarization.
type OfflineSpeakerSegmentationPyannoteModelConfig struct {
	// Model is handed to the C API as segmentation.pyannote.model
	// (presumably the path to the model file).
	Model string
}
|
|
|
|
type OfflineSpeakerSegmentationModelConfig struct {
|
|
Pyannote OfflineSpeakerSegmentationPyannoteModelConfig
|
|
NumThreads int
|
|
Debug int
|
|
Provider string
|
|
}
|
|
|
|
// FastClusteringConfig holds the clustering settings used for offline
// speaker diarization.
type FastClusteringConfig struct {
	// NumClusters is handed to the C API as clustering.num_clusters.
	NumClusters int

	// Threshold is handed to the C API as clustering.threshold.
	Threshold float32
}
|
|
|
|
type OfflineSpeakerDiarizationConfig struct {
|
|
Segmentation OfflineSpeakerSegmentationModelConfig
|
|
Embedding SpeakerEmbeddingExtractorConfig
|
|
Clustering FastClusteringConfig
|
|
MinDurationOn float32
|
|
MinDurationOff float32
|
|
}
|
|
|
|
type OfflineSpeakerDiarization struct {
|
|
impl *C.struct_SherpaOnnxOfflineSpeakerDiarization
|
|
}
|
|
|
|
func DeleteOfflineSpeakerDiarization(sd *OfflineSpeakerDiarization) {
|
|
C.SherpaOnnxDestroyOfflineSpeakerDiarization(sd.impl)
|
|
sd.impl = nil
|
|
}
|
|
|
|
func NewOfflineSpeakerDiarization(config *OfflineSpeakerDiarizationConfig) *OfflineSpeakerDiarization {
|
|
c := C.struct_SherpaOnnxOfflineSpeakerDiarizationConfig{}
|
|
c.segmentation.pyannote.model = C.CString(config.Segmentation.Pyannote.Model)
|
|
defer C.free(unsafe.Pointer(c.segmentation.pyannote.model))
|
|
|
|
c.segmentation.num_threads = C.int(config.Segmentation.NumThreads)
|
|
|
|
c.segmentation.debug = C.int(config.Segmentation.Debug)
|
|
|
|
c.segmentation.provider = C.CString(config.Segmentation.Provider)
|
|
defer C.free(unsafe.Pointer(c.segmentation.provider))
|
|
|
|
c.embedding.model = C.CString(config.Embedding.Model)
|
|
defer C.free(unsafe.Pointer(c.embedding.model))
|
|
|
|
c.embedding.num_threads = C.int(config.Embedding.NumThreads)
|
|
|
|
c.embedding.debug = C.int(config.Embedding.Debug)
|
|
|
|
c.embedding.provider = C.CString(config.Embedding.Provider)
|
|
defer C.free(unsafe.Pointer(c.embedding.provider))
|
|
|
|
c.clustering.num_clusters = C.int(config.Clustering.NumClusters)
|
|
c.clustering.threshold = C.float(config.Clustering.Threshold)
|
|
c.min_duration_on = C.float(config.MinDurationOn)
|
|
c.min_duration_off = C.float(config.MinDurationOff)
|
|
|
|
p := C.SherpaOnnxCreateOfflineSpeakerDiarization(&c)
|
|
|
|
if p == nil {
|
|
return nil
|
|
}
|
|
|
|
sd := &OfflineSpeakerDiarization{}
|
|
sd.impl = p
|
|
|
|
return sd
|
|
}
|
|
|
|
func (sd *OfflineSpeakerDiarization) SampleRate() int {
|
|
return int(C.SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(sd.impl))
|
|
}
|
|
|
|
// only config.Clustering is used. All other fields are ignored
|
|
func (sd *OfflineSpeakerDiarization) SetConfig(config *OfflineSpeakerDiarizationConfig) {
|
|
c := C.struct_SherpaOnnxOfflineSpeakerDiarizationConfig{}
|
|
|
|
c.clustering.num_clusters = C.int(config.Clustering.NumClusters)
|
|
c.clustering.threshold = C.float(config.Clustering.Threshold)
|
|
|
|
C.SherpaOnnxOfflineSpeakerDiarizationSetConfig(sd.impl, &c)
|
|
}
|
|
|
|
// OfflineSpeakerDiarizationSegment is one segment of a diarization result.
type OfflineSpeakerDiarizationSegment struct {
	// Start is the segment's start value copied from the C result
	// (presumably in seconds; verify against the C header).
	Start float32

	// End is the segment's end value copied from the C result.
	End float32

	// Speaker is the speaker value copied from the C result.
	Speaker int
}
|
|
|
|
func (sd *OfflineSpeakerDiarization) Process(samples []float32) []OfflineSpeakerDiarizationSegment {
|
|
r := C.SherpaOnnxOfflineSpeakerDiarizationProcess(sd.impl, (*C.float)(&samples[0]), C.int(len(samples)))
|
|
defer C.SherpaOnnxOfflineSpeakerDiarizationDestroyResult(r)
|
|
|
|
n := int(C.SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(r))
|
|
|
|
if n == 0 {
|
|
return nil
|
|
}
|
|
|
|
s := C.SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(r)
|
|
defer C.SherpaOnnxOfflineSpeakerDiarizationDestroySegment(s)
|
|
|
|
ans := make([]OfflineSpeakerDiarizationSegment, n)
|
|
|
|
p := unsafe.Slice(s, n)
|
|
|
|
for i := 0; i < n; i++ {
|
|
ans[i].Start = float32(p[i].start)
|
|
ans[i].End = float32(p[i].end)
|
|
ans[i].Speaker = int(p[i].speaker)
|
|
}
|
|
|
|
return ans
|
|
}
|
|
|
|
// ============================================================
|
|
// For punctuation
|
|
// ============================================================
|
|
type OfflinePunctuationModelConfig struct {
|
|
CtTransformer string
|
|
NumThreads C.int
|
|
Debug C.int // true to print debug information of the model
|
|
Provider string
|
|
}
|
|
|
|
type OfflinePunctuationConfig struct {
|
|
Model OfflinePunctuationModelConfig
|
|
}
|
|
|
|
type OfflinePunctuation struct {
|
|
impl *C.struct_SherpaOnnxOfflinePunctuation
|
|
}
|
|
|
|
func NewOfflinePunctuation(config *OfflinePunctuationConfig) *OfflinePunctuation {
|
|
cfg := C.struct_SherpaOnnxOfflinePunctuationConfig{}
|
|
cfg.model.ct_transformer = C.CString(config.Model.CtTransformer)
|
|
defer C.free(unsafe.Pointer(cfg.model.ct_transformer))
|
|
|
|
cfg.model.num_threads = config.Model.NumThreads
|
|
cfg.model.debug = config.Model.Debug
|
|
cfg.model.provider = C.CString(config.Model.Provider)
|
|
defer C.free(unsafe.Pointer(cfg.model.provider))
|
|
|
|
impl := C.SherpaOnnxCreateOfflinePunctuation(&cfg)
|
|
if impl == nil {
|
|
return nil
|
|
}
|
|
punc := &OfflinePunctuation{}
|
|
punc.impl = impl
|
|
return punc
|
|
}
|
|
|
|
func DeleteOfflinePunc(punc *OfflinePunctuation) {
|
|
C.SherpaOnnxDestroyOfflinePunctuation(punc.impl)
|
|
punc.impl = nil
|
|
}
|
|
|
|
func (punc *OfflinePunctuation) AddPunct(text string) string {
|
|
p := C.SherpaOfflinePunctuationAddPunct(punc.impl, C.CString(text))
|
|
defer C.SherpaOfflinePunctuationFreeText(p)
|
|
|
|
text_with_punct := C.GoString(p)
|
|
|
|
return text_with_punct
|
|
}
|
|
|
|
// Configuration for the online/streaming recognizer.
|
|
type KeywordSpotterConfig struct {
|
|
FeatConfig FeatureConfig
|
|
ModelConfig OnlineModelConfig
|
|
MaxActivePaths int
|
|
KeywordsFile string
|
|
KeywordsScore float32
|
|
KeywordsThreshold float32
|
|
KeywordsBuf string
|
|
KeywordsBufSize int
|
|
}
|
|
|
|
// KeywordSpotterResult is the result returned by the keyword spotter.
type KeywordSpotterResult struct {
	// Keyword is the keyword string copied from the C result.
	Keyword string
}
|
|
|
|
type KeywordSpotter struct {
|
|
impl *C.struct_SherpaOnnxKeywordSpotter
|
|
}
|
|
|
|
// Free the internal pointer inside the recognizer to avoid memory leak.
|
|
func DeleteKeywordSpotter(spotter *KeywordSpotter) {
|
|
C.SherpaOnnxDestroyKeywordSpotter(spotter.impl)
|
|
spotter.impl = nil
|
|
}
|
|
|
|
// The user is responsible to invoke [DeleteKeywordSpotter]() to free
|
|
// the returned spotter to avoid memory leak
|
|
func NewKeywordSpotter(config *KeywordSpotterConfig) *KeywordSpotter {
|
|
c := C.struct_SherpaOnnxKeywordSpotterConfig{}
|
|
c.feat_config.sample_rate = C.int(config.FeatConfig.SampleRate)
|
|
c.feat_config.feature_dim = C.int(config.FeatConfig.FeatureDim)
|
|
|
|
c.model_config.transducer.encoder = C.CString(config.ModelConfig.Transducer.Encoder)
|
|
defer C.free(unsafe.Pointer(c.model_config.transducer.encoder))
|
|
|
|
c.model_config.transducer.decoder = C.CString(config.ModelConfig.Transducer.Decoder)
|
|
defer C.free(unsafe.Pointer(c.model_config.transducer.decoder))
|
|
|
|
c.model_config.transducer.joiner = C.CString(config.ModelConfig.Transducer.Joiner)
|
|
defer C.free(unsafe.Pointer(c.model_config.transducer.joiner))
|
|
|
|
c.model_config.paraformer.encoder = C.CString(config.ModelConfig.Paraformer.Encoder)
|
|
defer C.free(unsafe.Pointer(c.model_config.paraformer.encoder))
|
|
|
|
c.model_config.paraformer.decoder = C.CString(config.ModelConfig.Paraformer.Decoder)
|
|
defer C.free(unsafe.Pointer(c.model_config.paraformer.decoder))
|
|
|
|
c.model_config.zipformer2_ctc.model = C.CString(config.ModelConfig.Zipformer2Ctc.Model)
|
|
defer C.free(unsafe.Pointer(c.model_config.zipformer2_ctc.model))
|
|
|
|
c.model_config.tokens = C.CString(config.ModelConfig.Tokens)
|
|
defer C.free(unsafe.Pointer(c.model_config.tokens))
|
|
|
|
c.model_config.num_threads = C.int(config.ModelConfig.NumThreads)
|
|
|
|
c.model_config.provider = C.CString(config.ModelConfig.Provider)
|
|
defer C.free(unsafe.Pointer(c.model_config.provider))
|
|
|
|
c.model_config.debug = C.int(config.ModelConfig.Debug)
|
|
|
|
c.model_config.model_type = C.CString(config.ModelConfig.ModelType)
|
|
defer C.free(unsafe.Pointer(c.model_config.model_type))
|
|
|
|
c.model_config.modeling_unit = C.CString(config.ModelConfig.ModelingUnit)
|
|
defer C.free(unsafe.Pointer(c.model_config.modeling_unit))
|
|
|
|
c.model_config.bpe_vocab = C.CString(config.ModelConfig.BpeVocab)
|
|
defer C.free(unsafe.Pointer(c.model_config.bpe_vocab))
|
|
|
|
c.model_config.tokens_buf = C.CString(config.ModelConfig.TokensBuf)
|
|
defer C.free(unsafe.Pointer(c.model_config.tokens_buf))
|
|
|
|
c.model_config.tokens_buf_size = C.int(config.ModelConfig.TokensBufSize)
|
|
|
|
c.max_active_paths = C.int(config.MaxActivePaths)
|
|
|
|
c.keywords_file = C.CString(config.KeywordsFile)
|
|
defer C.free(unsafe.Pointer(c.keywords_file))
|
|
|
|
c.keywords_score = C.float(config.KeywordsScore)
|
|
|
|
c.keywords_threshold = C.float(config.KeywordsThreshold)
|
|
|
|
c.keywords_buf = C.CString(config.KeywordsBuf)
|
|
defer C.free(unsafe.Pointer(c.keywords_buf))
|
|
|
|
c.keywords_buf_size = C.int(config.KeywordsBufSize)
|
|
|
|
impl := C.SherpaOnnxCreateKeywordSpotter(&c)
|
|
if impl == nil {
|
|
return nil
|
|
}
|
|
spotter := &KeywordSpotter{}
|
|
spotter.impl = impl
|
|
return spotter
|
|
}
|
|
|
|
// The user is responsible to invoke [DeleteOnlineStream]() to free
|
|
// the returned stream to avoid memory leak
|
|
func NewKeywordStream(spotter *KeywordSpotter) *OnlineStream {
|
|
stream := &OnlineStream{}
|
|
stream.impl = C.SherpaOnnxCreateKeywordStream(spotter.impl)
|
|
return stream
|
|
}
|
|
|
|
// The user is responsible to invoke [DeleteOnlineStream]() to free
|
|
// the returned stream to avoid memory leak
|
|
func NewKeywordStreamWithKeywords(spotter *KeywordSpotter, keywords string) *OnlineStream {
|
|
stream := &OnlineStream{}
|
|
|
|
s := C.CString(keywords)
|
|
defer C.free(unsafe.Pointer(s))
|
|
|
|
stream.impl = C.SherpaOnnxCreateKeywordStreamWithKeywords(spotter.impl, s)
|
|
return stream
|
|
}
|
|
|
|
// Check whether the stream has enough feature frames for decoding.
|
|
// Return true if this stream is ready for decoding. Return false otherwise.
|
|
//
|
|
// You will usually use it like below:
|
|
//
|
|
// for spotter.IsReady(s) {
|
|
// spotter.Decode(s)
|
|
// }
|
|
func (spotter *KeywordSpotter) IsReady(s *OnlineStream) bool {
|
|
return C.SherpaOnnxIsKeywordStreamReady(spotter.impl, s.impl) == 1
|
|
}
|
|
|
|
// Decode the stream. Before calling this function, you have to ensure
|
|
// that spotter.IsReady(s) returns true. Otherwise, you will be SAD.
|
|
//
|
|
// You usually use it like below:
|
|
//
|
|
// for spotter.IsReady(s) {
|
|
// spotter.Decode(s)
|
|
// }
|
|
func (spotter *KeywordSpotter) Decode(s *OnlineStream) {
|
|
C.SherpaOnnxDecodeKeywordStream(spotter.impl, s.impl)
|
|
}
|
|
|
|
// You MUST call it right after detecting a keyword
|
|
func (spotter *KeywordSpotter) Reset(s *OnlineStream) {
|
|
C.SherpaOnnxResetKeywordStream(spotter.impl, s.impl)
|
|
}
|
|
|
|
// Get the current result of stream since the last invoke of Reset()
|
|
func (spotter *KeywordSpotter) GetResult(s *OnlineStream) *KeywordSpotterResult {
|
|
p := C.SherpaOnnxGetKeywordResult(spotter.impl, s.impl)
|
|
defer C.SherpaOnnxDestroyKeywordResult(p)
|
|
result := &KeywordSpotterResult{}
|
|
result.Keyword = C.GoString(p.keyword)
|
|
return result
|
|
}
|
|
|
|
// OfflineZipformerAudioTaggingModelConfig is the Zipformer part of the
// audio tagging model configuration.
type OfflineZipformerAudioTaggingModelConfig struct {
	// Model is forwarded by NewAudioTagging to the C field
	// model.zipformer.model.
	Model string
}
|
|
|
|
type AudioTaggingModelConfig struct {
|
|
Zipformer OfflineZipformerAudioTaggingModelConfig
|
|
Ced string
|
|
NumThreads int32
|
|
Debug int32
|
|
Provider string
|
|
}
|
|
|
|
type AudioTaggingConfig struct {
|
|
Model AudioTaggingModelConfig
|
|
Labels string
|
|
TopK int32
|
|
}
|
|
|
|
type AudioTagging struct {
|
|
impl *C.struct_SherpaOnnxAudioTagging
|
|
}
|
|
|
|
// AudioEvent is one entry of the result returned by AudioTagging.Compute.
// Each field is copied from the same-named field of the C audio event.
type AudioEvent struct {
	// Name is a Go copy of the C event's name string.
	Name string
	// Index is the C event's index converted to int.
	Index int
	// Prob is the C event's prob converted to float32.
	Prob float32
}
|
|
|
|
func DeleteAudioTagging(tagging *AudioTagging) {
|
|
C.SherpaOnnxDestroyAudioTagging(tagging.impl)
|
|
tagging.impl = nil
|
|
}
|
|
|
|
// The user is responsible to invoke [DeleteAudioTagging]() to free
|
|
// the returned tagger to avoid memory leak
|
|
func NewAudioTagging(config *AudioTaggingConfig) *AudioTagging {
|
|
c := C.struct_SherpaOnnxAudioTaggingConfig{}
|
|
|
|
c.model.zipformer.model = C.CString(config.Model.Zipformer.Model)
|
|
defer C.free(unsafe.Pointer(c.model.zipformer.model))
|
|
|
|
c.model.ced = C.CString(config.Model.Ced)
|
|
defer C.free(unsafe.Pointer(c.model.ced))
|
|
|
|
c.model.num_threads = C.int(config.Model.NumThreads)
|
|
|
|
c.model.provider = C.CString(config.Model.Provider)
|
|
defer C.free(unsafe.Pointer(c.model.provider))
|
|
|
|
c.model.debug = C.int(config.Model.Debug)
|
|
|
|
c.labels = C.CString(config.Labels)
|
|
defer C.free(unsafe.Pointer(c.labels))
|
|
|
|
c.top_k = C.int(config.TopK)
|
|
|
|
impl := C.SherpaOnnxCreateAudioTagging(&c)
|
|
if impl == nil {
|
|
return nil
|
|
}
|
|
tagging := &AudioTagging{}
|
|
tagging.impl = impl
|
|
return tagging
|
|
}
|
|
|
|
// The user is responsible to invoke [DeleteOfflineStream]() to free
|
|
// the returned stream to avoid memory leak
|
|
func NewAudioTaggingStream(tagging *AudioTagging) *OfflineStream {
|
|
stream := &OfflineStream{}
|
|
stream.impl = C.SherpaOnnxAudioTaggingCreateOfflineStream(tagging.impl)
|
|
return stream
|
|
}
|
|
|
|
func (tagging *AudioTagging) Compute(s *OfflineStream, topK int32) []AudioEvent {
|
|
r := C.SherpaOnnxAudioTaggingCompute(tagging.impl, s.impl, C.int(topK))
|
|
defer C.SherpaOnnxAudioTaggingFreeResults(r)
|
|
result := make([]AudioEvent, 0)
|
|
|
|
p := (*[1 << 25]*C.struct_SherpaOnnxAudioEvent)(unsafe.Pointer(r))
|
|
i := 0
|
|
for {
|
|
if p[i] == nil {
|
|
break
|
|
}
|
|
result = append(result, AudioEvent{
|
|
Name: C.GoString(p[i].name),
|
|
Index: int(p[i].index),
|
|
Prob: float32(p[i].prob),
|
|
})
|
|
i += 1
|
|
}
|
|
return result
|
|
}
|
|
|
|
// OfflineSpeechDenoiserGtcrnModelConfig is the Gtcrn part of the speech
// denoiser model configuration.
type OfflineSpeechDenoiserGtcrnModelConfig struct {
	// Model is forwarded by NewOfflineSpeechDenoiser to the C field
	// model.gtcrn.model.
	Model string
}
|
|
|
|
type OfflineSpeechDenoiserModelConfig struct {
|
|
Gtcrn OfflineSpeechDenoiserGtcrnModelConfig
|
|
NumThreads int32
|
|
Debug int32
|
|
Provider string
|
|
}
|
|
|
|
type OfflineSpeechDenoiserConfig struct {
|
|
Model OfflineSpeechDenoiserModelConfig
|
|
}
|
|
|
|
type OfflineSpeechDenoiser struct {
|
|
impl *C.struct_SherpaOnnxOfflineSpeechDenoiser
|
|
}
|
|
|
|
// DenoisedAudio is the result of OfflineSpeechDenoiser.Run. Its data is
// held in Go memory.
type DenoisedAudio struct {
	// Samples holds normalized samples in the range [-1, 1].
	Samples []float32

	// SampleRate is copied from the C result's sample_rate field by Run
	// and is passed to SherpaOnnxWriteWave by Save.
	SampleRate int
}
|
|
|
|
// Free the internal pointer inside the OfflineSpeechDenoiser to avoid memory leak.
|
|
func DeleteOfflineSpeechDenoiser(sd *OfflineSpeechDenoiser) {
|
|
C.SherpaOnnxDestroyOfflineSpeechDenoiser(sd.impl)
|
|
sd.impl = nil
|
|
}
|
|
|
|
// The user is responsible to invoke [DeleteOfflineSpeechDenoiser]() to free
|
|
// the returned tts to avoid memory leak
|
|
func NewOfflineSpeechDenoiser(config *OfflineSpeechDenoiserConfig) *OfflineSpeechDenoiser {
|
|
c := C.struct_SherpaOnnxOfflineSpeechDenoiserConfig{}
|
|
c.model.gtcrn.model = C.CString(config.Model.Gtcrn.Model)
|
|
defer C.free(unsafe.Pointer(c.model.gtcrn.model))
|
|
|
|
c.model.num_threads = C.int(config.Model.NumThreads)
|
|
c.model.debug = C.int(config.Model.Debug)
|
|
|
|
c.model.provider = C.CString(config.Model.Provider)
|
|
defer C.free(unsafe.Pointer(c.model.provider))
|
|
|
|
impl := C.SherpaOnnxCreateOfflineSpeechDenoiser(&c)
|
|
if impl == nil {
|
|
return nil
|
|
}
|
|
|
|
sd := &OfflineSpeechDenoiser{}
|
|
sd.impl = impl
|
|
return sd
|
|
}
|
|
|
|
func (sd *OfflineSpeechDenoiser) Run(samples []float32, sampleRate int) *DenoisedAudio {
|
|
audio := C.SherpaOnnxOfflineSpeechDenoiserRun(sd.impl, (*C.float)(&samples[0]), C.int(len(samples)), C.int(sampleRate))
|
|
defer C.SherpaOnnxDestroyDenoisedAudio(audio)
|
|
|
|
ans := &DenoisedAudio{}
|
|
ans.SampleRate = int(audio.sample_rate)
|
|
n := int(audio.n)
|
|
ans.Samples = make([]float32, n)
|
|
|
|
denoisedSamples := unsafe.Slice(audio.samples, n)
|
|
for i := 0; i < n; i++ {
|
|
ans.Samples[i] = float32(denoisedSamples[i])
|
|
}
|
|
|
|
return ans
|
|
}
|
|
|
|
func (audio *DenoisedAudio) Save(filename string) bool {
|
|
s := C.CString(filename)
|
|
defer C.free(unsafe.Pointer(s))
|
|
|
|
ok := int(C.SherpaOnnxWriteWave((*C.float)(&audio.Samples[0]), C.int(len(audio.Samples)), C.int(audio.SampleRate), s))
|
|
|
|
return ok == 1
|
|
}
|
|
|
|
func (sd *OfflineSpeechDenoiser) SampleRate() int {
|
|
return int(C.SherpaOnnxOfflineSpeechDenoiserGetSampleRate(sd.impl))
|
|
}
|
|
|
|
func GetVersion() string {
|
|
return C.GoString(C.SherpaOnnxGetVersionStr())
|
|
}
|
|
|
|
func GetGitSha1() string {
|
|
return C.GoString(C.SherpaOnnxGetGitSha1())
|
|
}
|
|
|
|
func GetGitDate() string {
|
|
return C.GoString(C.SherpaOnnxGetGitDate())
|
|
}
|