/*
Speech recognition with [Next-gen Kaldi].

[sherpa-onnx] is an open-source speech recognition framework for [Next-gen Kaldi].
It depends only on [onnxruntime], supporting both streaming and non-streaming
speech recognition. It does not need to access the network during recognition
and everything runs locally. It supports a variety of platforms, such as
Linux (x86_64, aarch64, arm), Windows (x86_64, x86), macOS (x86_64, arm64), etc.

Usage examples:

 1. Real-time speech recognition from a microphone

    Please see
    https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/real-time-speech-recognition-from-microphone

 2. Decode files using a non-streaming model

    Please see
    https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/non-streaming-decode-files

 3. Decode files using a streaming model

    Please see
    https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/streaming-decode-files

 4. Convert text to speech using a non-streaming model

    Please see
    https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/non-streaming-tts

[sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx
[onnxruntime]: https://github.com/microsoft/onnxruntime
[Next-gen Kaldi]: https://github.com/k2-fsa/
*/
package sherpa_onnx

// #include <stdlib.h>
// #include "c-api.h"
import "C"

import "unsafe"

// Configuration for online/streaming transducer models.
//
// Please refer to
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html
// to download pre-trained models.
type OnlineTransducerModelConfig struct {
	Encoder string // Path to the encoder model, e.g., encoder.onnx or encoder.int8.onnx
	Decoder string // Path to the decoder model.
	Joiner  string // Path to the joiner model.
}

// Configuration for online/streaming paraformer models.
//
// Please refer to
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/index.html
// to download pre-trained models.
type OnlineParaformerModelConfig struct {
	Encoder string // Path to the encoder model, e.g., encoder.onnx or encoder.int8.onnx
	Decoder string // Path to the decoder model.
}

// Configuration for online/streaming Zipformer2 CTC models.
//
// Please refer to
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-ctc/index.html
// to download pre-trained models.
type OnlineZipformer2CtcModelConfig struct {
	Model string // Path to the onnx model
}

// Configuration for online/streaming models.
//
// Please refer to
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/index.html
// to download pre-trained models.
type OnlineModelConfig struct {
	Transducer    OnlineTransducerModelConfig
	Paraformer    OnlineParaformerModelConfig
	Zipformer2Ctc OnlineZipformer2CtcModelConfig
	Tokens        string // Path to tokens.txt
	NumThreads    int    // Number of threads to use for neural network computation
	Provider      string // Optional. Valid values are: cpu, cuda, coreml
	Debug         int    // 1 to show model meta information while loading it.
	ModelType     string // Optional. You can specify it for faster model initialization
}

// Configuration for the feature extractor.
type FeatureConfig struct {
	// Sample rate expected by the model. It is 16000 for all
	// pre-trained models provided by us.
	SampleRate int

	// Feature dimension expected by the model. It is 80 for all
	// pre-trained models provided by us.
	FeatureDim int
}
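
// The following is an illustrative sketch, not part of the C API wrapper:
// it shows how OnlineModelConfig is typically filled in for a streaming
// paraformer model. Normally only one of the Transducer, Paraformer, or
// Zipformer2Ctc fields is populated; the file names below are placeholders
// for files from a downloaded pre-trained model.
func exampleOnlineParaformerModelConfig() OnlineModelConfig {
	return OnlineModelConfig{
		Paraformer: OnlineParaformerModelConfig{
			Encoder: "encoder.int8.onnx", // placeholder path
			Decoder: "decoder.int8.onnx", // placeholder path
		},
		Tokens:     "tokens.txt", // placeholder path
		NumThreads: 1,
		Provider:   "cpu",
	}
}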
// Configuration for the online/streaming recognizer.
type OnlineRecognizerConfig struct {
	FeatConfig  FeatureConfig
	ModelConfig OnlineModelConfig

	// Valid decoding methods: greedy_search, modified_beam_search
	DecodingMethod string

	// Used only when DecodingMethod is modified_beam_search. It specifies
	// the maximum number of paths to keep during the search.
	MaxActivePaths int

	EnableEndpoint int // 1 to enable endpoint detection.

	// Please see
	// https://k2-fsa.github.io/sherpa/ncnn/endpoint.html
	// for the meaning of Rule1MinTrailingSilence, Rule2MinTrailingSilence
	// and Rule3MinUtteranceLength.
	Rule1MinTrailingSilence float32
	Rule2MinTrailingSilence float32
	Rule3MinUtteranceLength float32
}

// It contains the recognition result for an online stream.
type OnlineRecognizerResult struct {
	Text string
}

// The online recognizer class. It wraps a pointer from C.
type OnlineRecognizer struct {
	impl *C.struct_SherpaOnnxOnlineRecognizer
}

// The online stream class. It wraps a pointer from C.
type OnlineStream struct {
	impl *C.struct_SherpaOnnxOnlineStream
}

// Free the internal pointer inside the recognizer to avoid memory leak.
func DeleteOnlineRecognizer(recognizer *OnlineRecognizer) {
	C.DestroyOnlineRecognizer(recognizer.impl)
	recognizer.impl = nil
}

// The user is responsible for invoking [DeleteOnlineRecognizer]() to free
// the returned recognizer to avoid memory leak.
func NewOnlineRecognizer(config *OnlineRecognizerConfig) *OnlineRecognizer {
	c := C.struct_SherpaOnnxOnlineRecognizerConfig{}
	c.feat_config.sample_rate = C.int(config.FeatConfig.SampleRate)
	c.feat_config.feature_dim = C.int(config.FeatConfig.FeatureDim)

	c.model_config.transducer.encoder = C.CString(config.ModelConfig.Transducer.Encoder)
	defer C.free(unsafe.Pointer(c.model_config.transducer.encoder))

	c.model_config.transducer.decoder = C.CString(config.ModelConfig.Transducer.Decoder)
	defer C.free(unsafe.Pointer(c.model_config.transducer.decoder))

	c.model_config.transducer.joiner = C.CString(config.ModelConfig.Transducer.Joiner)
	defer C.free(unsafe.Pointer(c.model_config.transducer.joiner))

	c.model_config.paraformer.encoder = C.CString(config.ModelConfig.Paraformer.Encoder)
	defer C.free(unsafe.Pointer(c.model_config.paraformer.encoder))

	c.model_config.paraformer.decoder = C.CString(config.ModelConfig.Paraformer.Decoder)
	defer C.free(unsafe.Pointer(c.model_config.paraformer.decoder))

	c.model_config.zipformer2_ctc.model = C.CString(config.ModelConfig.Zipformer2Ctc.Model)
	defer C.free(unsafe.Pointer(c.model_config.zipformer2_ctc.model))

	c.model_config.tokens = C.CString(config.ModelConfig.Tokens)
	defer C.free(unsafe.Pointer(c.model_config.tokens))

	c.model_config.num_threads = C.int(config.ModelConfig.NumThreads)

	c.model_config.provider = C.CString(config.ModelConfig.Provider)
	defer C.free(unsafe.Pointer(c.model_config.provider))

	c.model_config.debug = C.int(config.ModelConfig.Debug)

	c.model_config.model_type = C.CString(config.ModelConfig.ModelType)
	defer C.free(unsafe.Pointer(c.model_config.model_type))

	c.decoding_method = C.CString(config.DecodingMethod)
	defer C.free(unsafe.Pointer(c.decoding_method))

	c.max_active_paths = C.int(config.MaxActivePaths)
	c.enable_endpoint = C.int(config.EnableEndpoint)
	c.rule1_min_trailing_silence = C.float(config.Rule1MinTrailingSilence)
	c.rule2_min_trailing_silence = C.float(config.Rule2MinTrailingSilence)
	c.rule3_min_utterance_length = C.float(config.Rule3MinUtteranceLength)

	recognizer := &OnlineRecognizer{}
	recognizer.impl = C.CreateOnlineRecognizer(&c)

	return recognizer
}
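
// An illustrative sketch (not part of the C API wrapper) of constructing a
// streaming recognizer from a transducer model. All file names are
// placeholders; the endpoint rule values mirror those commonly used in the
// sherpa-onnx examples.
func exampleNewOnlineRecognizer() *OnlineRecognizer {
	config := OnlineRecognizerConfig{}
	config.FeatConfig = FeatureConfig{SampleRate: 16000, FeatureDim: 80}
	config.ModelConfig.Transducer.Encoder = "encoder.onnx" // placeholder path
	config.ModelConfig.Transducer.Decoder = "decoder.onnx" // placeholder path
	config.ModelConfig.Transducer.Joiner = "joiner.onnx"   // placeholder path
	config.ModelConfig.Tokens = "tokens.txt"               // placeholder path
	config.ModelConfig.NumThreads = 1
	config.DecodingMethod = "greedy_search"
	config.EnableEndpoint = 1
	config.Rule1MinTrailingSilence = 2.4
	config.Rule2MinTrailingSilence = 1.2
	config.Rule3MinUtteranceLength = 20

	// The caller must eventually call DeleteOnlineRecognizer() on the
	// returned recognizer.
	return NewOnlineRecognizer(&config)
}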
// Delete the internal pointer inside the stream to avoid memory leak.
func DeleteOnlineStream(stream *OnlineStream) {
	C.DestroyOnlineStream(stream.impl)
	stream.impl = nil
}

// The user is responsible for invoking [DeleteOnlineStream]() to free
// the returned stream to avoid memory leak.
func NewOnlineStream(recognizer *OnlineRecognizer) *OnlineStream {
	stream := &OnlineStream{}
	stream.impl = C.CreateOnlineStream(recognizer.impl)
	return stream
}

// Input audio samples for the stream.
//
// sampleRate is the actual sample rate of the input audio samples. If it
// is different from the sample rate expected by the feature extractor, we will
// do resampling inside.
//
// samples contains audio samples. Each sample is in the range [-1, 1].
func (s *OnlineStream) AcceptWaveform(sampleRate int, samples []float32) {
	C.AcceptWaveform(s.impl, C.int(sampleRate), (*C.float)(&samples[0]), C.int(len(samples)))
}

// Signal that there will be no incoming audio samples.
// After calling this function, you cannot call [OnlineStream.AcceptWaveform] any longer.
//
// The main purpose of this function is to flush the remaining audio samples
// buffered inside for feature extraction.
func (s *OnlineStream) InputFinished() {
	C.InputFinished(s.impl)
}

// Check whether the stream has enough feature frames for decoding.
// Return true if this stream is ready for decoding. Return false otherwise.
//
// You will usually use it like below:
//
//	for recognizer.IsReady(s) {
//		recognizer.Decode(s)
//	}
func (recognizer *OnlineRecognizer) IsReady(s *OnlineStream) bool {
	return C.IsOnlineStreamReady(recognizer.impl, s.impl) == 1
}

// Return true if an endpoint is detected.
//
// You usually use it like below:
//
//	if recognizer.IsEndpoint(s) {
//		// do your own stuff after detecting an endpoint
//
//		recognizer.Reset(s)
//	}
func (recognizer *OnlineRecognizer) IsEndpoint(s *OnlineStream) bool {
	return C.IsEndpoint(recognizer.impl, s.impl) == 1
}

// After calling this function, the internal neural network model states
// are reset and IsEndpoint(s) would return false. GetResult(s) would also
// return an empty string.
func (recognizer *OnlineRecognizer) Reset(s *OnlineStream) {
	C.Reset(recognizer.impl, s.impl)
}

// Decode the stream. Before calling this function, you have to ensure
// that recognizer.IsReady(s) returns true. Otherwise, you will be SAD.
//
// You usually use it like below:
//
//	for recognizer.IsReady(s) {
//		recognizer.Decode(s)
//	}
func (recognizer *OnlineRecognizer) Decode(s *OnlineStream) {
	C.DecodeOnlineStream(recognizer.impl, s.impl)
}

// Decode multiple streams in parallel, i.e., in batch.
// You have to ensure that each stream is ready for decoding. Otherwise,
// you will be SAD.
func (recognizer *OnlineRecognizer) DecodeStreams(s []*OnlineStream) {
	ss := make([]*C.struct_SherpaOnnxOnlineStream, len(s))
	for i, v := range s {
		ss[i] = v.impl
	}

	C.DecodeMultipleOnlineStreams(recognizer.impl, &ss[0], C.int(len(s)))
}

// Get the current result of the stream since the last invocation of Reset().
func (recognizer *OnlineRecognizer) GetResult(s *OnlineStream) *OnlineRecognizerResult {
	p := C.GetOnlineStreamResult(recognizer.impl, s.impl)
	defer C.DestroyOnlineRecognizerResult(p)

	result := &OnlineRecognizerResult{}
	result.Text = C.GoString(p.text)

	return result
}
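
// An illustrative sketch (not part of the C API wrapper) of the full
// streaming decoding loop: feed audio, flush, decode while frames are
// available, and read back the text. samples is assumed to hold audio in
// the range [-1, 1] at the given sample rate.
func exampleStreamingDecode(recognizer *OnlineRecognizer, sampleRate int, samples []float32) string {
	stream := NewOnlineStream(recognizer)
	defer DeleteOnlineStream(stream)

	stream.AcceptWaveform(sampleRate, samples)
	stream.InputFinished() // no more audio will arrive; flush buffered samples

	for recognizer.IsReady(stream) {
		recognizer.Decode(stream)
	}

	return recognizer.GetResult(stream).Text
}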
// Configuration for offline/non-streaming transducer models.
//
// Please refer to
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/index.html
// to download pre-trained models.
type OfflineTransducerModelConfig struct {
	Encoder string // Path to the encoder model, e.g., encoder.onnx or encoder.int8.onnx
	Decoder string // Path to the decoder model
	Joiner  string // Path to the joiner model
}

// Configuration for offline/non-streaming paraformer models.
//
// Please refer to
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/index.html
// to download pre-trained models.
type OfflineParaformerModelConfig struct {
	Model string // Path to the model, e.g., model.onnx or model.int8.onnx
}

// Configuration for offline/non-streaming NeMo CTC models.
//
// Please refer to
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/index.html
// to download pre-trained models.
type OfflineNemoEncDecCtcModelConfig struct {
	Model string // Path to the model, e.g., model.onnx or model.int8.onnx
}

// Configuration for offline/non-streaming Whisper models.
type OfflineWhisperModelConfig struct {
	Encoder string // Path to the encoder model
	Decoder string // Path to the decoder model
}

// Configuration for offline/non-streaming TDNN models.
type OfflineTdnnModelConfig struct {
	Model string // Path to the onnx model
}

// Configuration for the offline LM.
type OfflineLMConfig struct {
	Model string  // Path to the model
	Scale float32 // Scale for the LM score
}

// Configuration for offline/non-streaming models.
type OfflineModelConfig struct {
	Transducer OfflineTransducerModelConfig
	Paraformer OfflineParaformerModelConfig
	NemoCTC    OfflineNemoEncDecCtcModelConfig
	Whisper    OfflineWhisperModelConfig
	Tdnn       OfflineTdnnModelConfig
	Tokens     string // Path to tokens.txt

	// Number of threads to use for neural network computation
	NumThreads int

	// 1 to print model meta information while loading
	Debug int

	// Optional. Valid values: cpu, cuda, coreml
	Provider string

	// Optional. Specify it for faster model initialization.
	ModelType string
}

// Configuration for the offline/non-streaming recognizer.
type OfflineRecognizerConfig struct {
	FeatConfig  FeatureConfig
	ModelConfig OfflineModelConfig
	LmConfig    OfflineLMConfig

	// Valid decoding methods: greedy_search, modified_beam_search
	DecodingMethod string

	// Used only when DecodingMethod is modified_beam_search.
	MaxActivePaths int
}

// The offline recognizer class. It wraps a pointer from C.
type OfflineRecognizer struct {
	impl *C.struct_SherpaOnnxOfflineRecognizer
}

// The offline stream class. It wraps a pointer from C.
type OfflineStream struct {
	impl *C.struct_SherpaOnnxOfflineStream
}

// It contains the recognition result of an offline stream.
type OfflineRecognizerResult struct {
	Text string
}
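
// An illustrative sketch (not part of the C API wrapper) showing how the
// offline configuration is typically filled in for a Whisper model. As with
// OnlineModelConfig, normally only one of the model fields (Transducer,
// Paraformer, NemoCTC, Whisper, Tdnn) is populated. File names are
// placeholders for files from a downloaded pre-trained model.
func exampleOfflineWhisperConfig() OfflineRecognizerConfig {
	config := OfflineRecognizerConfig{}
	config.FeatConfig = FeatureConfig{SampleRate: 16000, FeatureDim: 80}
	config.ModelConfig.Whisper.Encoder = "tiny.en-encoder.onnx" // placeholder path
	config.ModelConfig.Whisper.Decoder = "tiny.en-decoder.onnx" // placeholder path
	config.ModelConfig.Tokens = "tokens.txt"                    // placeholder path
	config.ModelConfig.NumThreads = 1
	config.ModelConfig.ModelType = "whisper" // optional; speeds up initialization
	config.DecodingMethod = "greedy_search"
	return config
}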
// Free the internal pointer inside the recognizer to avoid memory leak.
func DeleteOfflineRecognizer(recognizer *OfflineRecognizer) {
	C.DestroyOfflineRecognizer(recognizer.impl)
	recognizer.impl = nil
}

// The user is responsible for invoking [DeleteOfflineRecognizer]() to free
// the returned recognizer to avoid memory leak.
func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer {
	c := C.struct_SherpaOnnxOfflineRecognizerConfig{}
	c.feat_config.sample_rate = C.int(config.FeatConfig.SampleRate)
	c.feat_config.feature_dim = C.int(config.FeatConfig.FeatureDim)

	c.model_config.transducer.encoder = C.CString(config.ModelConfig.Transducer.Encoder)
	defer C.free(unsafe.Pointer(c.model_config.transducer.encoder))

	c.model_config.transducer.decoder = C.CString(config.ModelConfig.Transducer.Decoder)
	defer C.free(unsafe.Pointer(c.model_config.transducer.decoder))

	c.model_config.transducer.joiner = C.CString(config.ModelConfig.Transducer.Joiner)
	defer C.free(unsafe.Pointer(c.model_config.transducer.joiner))

	c.model_config.paraformer.model = C.CString(config.ModelConfig.Paraformer.Model)
	defer C.free(unsafe.Pointer(c.model_config.paraformer.model))

	c.model_config.nemo_ctc.model = C.CString(config.ModelConfig.NemoCTC.Model)
	defer C.free(unsafe.Pointer(c.model_config.nemo_ctc.model))

	c.model_config.whisper.encoder = C.CString(config.ModelConfig.Whisper.Encoder)
	defer C.free(unsafe.Pointer(c.model_config.whisper.encoder))

	c.model_config.whisper.decoder = C.CString(config.ModelConfig.Whisper.Decoder)
	defer C.free(unsafe.Pointer(c.model_config.whisper.decoder))

	c.model_config.tdnn.model = C.CString(config.ModelConfig.Tdnn.Model)
	defer C.free(unsafe.Pointer(c.model_config.tdnn.model))

	c.model_config.tokens = C.CString(config.ModelConfig.Tokens)
	defer C.free(unsafe.Pointer(c.model_config.tokens))

	c.model_config.num_threads = C.int(config.ModelConfig.NumThreads)
	c.model_config.debug = C.int(config.ModelConfig.Debug)

	c.model_config.provider = C.CString(config.ModelConfig.Provider)
	defer C.free(unsafe.Pointer(c.model_config.provider))

	c.model_config.model_type = C.CString(config.ModelConfig.ModelType)
	defer C.free(unsafe.Pointer(c.model_config.model_type))

	c.lm_config.model = C.CString(config.LmConfig.Model)
	defer C.free(unsafe.Pointer(c.lm_config.model))

	c.lm_config.scale = C.float(config.LmConfig.Scale)

	c.decoding_method = C.CString(config.DecodingMethod)
	defer C.free(unsafe.Pointer(c.decoding_method))

	c.max_active_paths = C.int(config.MaxActivePaths)

	recognizer := &OfflineRecognizer{}
	recognizer.impl = C.CreateOfflineRecognizer(&c)

	return recognizer
}

// Free the internal pointer inside the stream to avoid memory leak.
func DeleteOfflineStream(stream *OfflineStream) {
	C.DestroyOfflineStream(stream.impl)
	stream.impl = nil
}

// The user is responsible for invoking [DeleteOfflineStream]() to free
// the returned stream to avoid memory leak.
func NewOfflineStream(recognizer *OfflineRecognizer) *OfflineStream {
	stream := &OfflineStream{}
	stream.impl = C.CreateOfflineStream(recognizer.impl)
	return stream
}

// Input audio samples for the offline stream.
// Please call it only once. That is, input all samples at once.
//
// sampleRate is the sample rate of the input audio samples. If it is different
// from the value expected by the feature extractor, we will do resampling inside.
//
// samples contains the actual audio samples. Each sample is in the range [-1, 1].
func (s *OfflineStream) AcceptWaveform(sampleRate int, samples []float32) {
	C.AcceptWaveformOffline(s.impl, C.int(sampleRate), (*C.float)(&samples[0]), C.int(len(samples)))
}
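
// An illustrative sketch (not part of the C API wrapper) of decoding a single
// utterance with a non-streaming recognizer: the whole waveform is passed in
// one AcceptWaveform call, then the stream is decoded and the text read back.
func exampleOfflineDecode(recognizer *OfflineRecognizer, sampleRate int, samples []float32) string {
	stream := NewOfflineStream(recognizer)
	defer DeleteOfflineStream(stream)

	stream.AcceptWaveform(sampleRate, samples)
	recognizer.Decode(stream)

	return stream.GetResult().Text
}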
// Decode the offline stream.
func (recognizer *OfflineRecognizer) Decode(s *OfflineStream) {
	C.DecodeOfflineStream(recognizer.impl, s.impl)
}

// Decode multiple streams in parallel, i.e., in batch.
func (recognizer *OfflineRecognizer) DecodeStreams(s []*OfflineStream) {
	ss := make([]*C.struct_SherpaOnnxOfflineStream, len(s))
	for i, v := range s {
		ss[i] = v.impl
	}

	C.DecodeMultipleOfflineStreams(recognizer.impl, &ss[0], C.int(len(s)))
}

// Get the recognition result of the offline stream.
func (s *OfflineStream) GetResult() *OfflineRecognizerResult {
	p := C.GetOfflineStreamResult(s.impl)
	defer C.DestroyOfflineRecognizerResult(p)

	result := &OfflineRecognizerResult{}
	result.Text = C.GoString(p.text)

	return result
}

// Configuration for offline/non-streaming text-to-speech (TTS) with VITS models.
//
// Please refer to
// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html
// to download pre-trained models.
type OfflineTtsVitsModelConfig struct {
	Model       string  // Path to the VITS onnx model
	Lexicon     string  // Path to lexicon.txt
	Tokens      string  // Path to tokens.txt
	DataDir     string  // Path to the directory containing espeak-ng data files
	NoiseScale  float32 // Noise scale for VITS models. Please use 0.667 in general
	NoiseScaleW float32 // Noise scale for VITS models. Please use 0.8 in general
	LengthScale float32 // Please use 1.0 in general. Smaller -> faster speech speed. Larger -> slower speech speed
}

// Configuration for offline/non-streaming TTS models.
type OfflineTtsModelConfig struct {
	Vits OfflineTtsVitsModelConfig

	// Number of threads to use for neural network computation
	NumThreads int

	// 1 to print model meta information while loading
	Debug int

	// Optional. Valid values: cpu, cuda, coreml
	Provider string
}

// Configuration for the offline/non-streaming TTS engine.
type OfflineTtsConfig struct {
	Model           OfflineTtsModelConfig
	RuleFsts        string
	MaxNumSentences int
}

// GeneratedAudio holds the audio produced by the TTS engine.
type GeneratedAudio struct {
	// Normalized samples in the range [-1, 1]
	Samples []float32

	SampleRate int
}

// The offline TTS class. It wraps a pointer from C.
type OfflineTts struct {
	impl *C.struct_SherpaOnnxOfflineTts
}
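
// An illustrative sketch (not part of the C API wrapper) of a VITS TTS
// configuration. File names are placeholders; the noise/length scale values
// follow the recommendations in the field comments above.
func exampleOfflineTtsConfig() OfflineTtsConfig {
	config := OfflineTtsConfig{}
	config.Model.Vits.Model = "model.onnx"    // placeholder path
	config.Model.Vits.Lexicon = "lexicon.txt" // placeholder path
	config.Model.Vits.Tokens = "tokens.txt"   // placeholder path
	config.Model.Vits.NoiseScale = 0.667
	config.Model.Vits.NoiseScaleW = 0.8
	config.Model.Vits.LengthScale = 1.0
	config.Model.NumThreads = 1
	config.Model.Provider = "cpu"
	return config
}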
// Free the internal pointer inside the tts to avoid memory leak.
func DeleteOfflineTts(tts *OfflineTts) {
	C.SherpaOnnxDestroyOfflineTts(tts.impl)
	tts.impl = nil
}

// The user is responsible for invoking [DeleteOfflineTts]() to free
// the returned tts to avoid memory leak.
func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
	c := C.struct_SherpaOnnxOfflineTtsConfig{}

	c.rule_fsts = C.CString(config.RuleFsts)
	defer C.free(unsafe.Pointer(c.rule_fsts))

	c.max_num_sentences = C.int(config.MaxNumSentences)

	c.model.vits.model = C.CString(config.Model.Vits.Model)
	defer C.free(unsafe.Pointer(c.model.vits.model))

	c.model.vits.lexicon = C.CString(config.Model.Vits.Lexicon)
	defer C.free(unsafe.Pointer(c.model.vits.lexicon))

	c.model.vits.tokens = C.CString(config.Model.Vits.Tokens)
	defer C.free(unsafe.Pointer(c.model.vits.tokens))

	c.model.vits.data_dir = C.CString(config.Model.Vits.DataDir)
	defer C.free(unsafe.Pointer(c.model.vits.data_dir))

	c.model.vits.noise_scale = C.float(config.Model.Vits.NoiseScale)
	c.model.vits.noise_scale_w = C.float(config.Model.Vits.NoiseScaleW)
	c.model.vits.length_scale = C.float(config.Model.Vits.LengthScale)

	c.model.num_threads = C.int(config.Model.NumThreads)
	c.model.debug = C.int(config.Model.Debug)

	c.model.provider = C.CString(config.Model.Provider)
	defer C.free(unsafe.Pointer(c.model.provider))

	tts := &OfflineTts{}
	tts.impl = C.SherpaOnnxCreateOfflineTts(&c)

	return tts
}

// Generate converts text to speech. sid is the speaker ID; use 0 for
// single-speaker models. speed controls the speech speed; 1.0 means normal
// speed and larger values mean faster speech.
func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedAudio {
	s := C.CString(text)
	defer C.free(unsafe.Pointer(s))

	audio := C.SherpaOnnxOfflineTtsGenerate(tts.impl, s, C.int(sid), C.float(speed))
	defer C.SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio)

	ans := &GeneratedAudio{}
	ans.SampleRate = int(audio.sample_rate)

	n := int(audio.n)
	ans.Samples = make([]float32, n)

	// Copy the samples out of the C buffer element by element. A plain copy()
	// does not compile here since the element types differ ([]C.float vs []float32).
	samples := (*[1 << 28]C.float)(unsafe.Pointer(audio.samples))[:n:n]
	for i := 0; i < n; i++ {
		ans.Samples[i] = float32(samples[i])
	}

	return ans
}

// Save writes the generated audio to filename as a wave file.
// It returns the status code from SherpaOnnxWriteWave.
func (audio *GeneratedAudio) Save(filename string) int {
	s := C.CString(filename)
	defer C.free(unsafe.Pointer(s))

	ok := int(C.SherpaOnnxWriteWave((*C.float)(&audio.Samples[0]), C.int(len(audio.Samples)),
		C.int(audio.SampleRate), s))

	return ok
}
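
// An illustrative end-to-end sketch (not part of the C API wrapper):
// synthesize a sentence with speaker 0 at normal speed and write the result
// to a wave file. The text and output file name are placeholders.
func exampleTtsGenerateAndSave(tts *OfflineTts) int {
	audio := tts.Generate("Hello from sherpa-onnx.", 0, 1.0)
	return audio.Save("generated.wav") // status code from SherpaOnnxWriteWave
}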