diff --git a/go-api-examples/real-time-speech-recognition-from-microphone/go.mod b/go-api-examples/real-time-speech-recognition-from-microphone/go.mod index 58f35f46..5d7446ad 100644 --- a/go-api-examples/real-time-speech-recognition-from-microphone/go.mod +++ b/go-api-examples/real-time-speech-recognition-from-microphone/go.mod @@ -1,7 +1,3 @@ module real-time-speech-recognition-from-microphone go 1.17 - -require ( - github.com/csukuangfj/portaudio-go v1.0.3 -) diff --git a/go-api-examples/real-time-speech-recognition-from-microphone/main.go b/go-api-examples/real-time-speech-recognition-from-microphone/main.go index 5cbd919f..4110c2aa 100644 --- a/go-api-examples/real-time-speech-recognition-from-microphone/main.go +++ b/go-api-examples/real-time-speech-recognition-from-microphone/main.go @@ -2,34 +2,14 @@ package main import ( "fmt" - portaudio "github.com/csukuangfj/portaudio-go" + "github.com/gen2brain/malgo" sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx" flag "github.com/spf13/pflag" "log" "strings" ) -func main() { - err := portaudio.Initialize() - if err != nil { - log.Fatalf("Unable to initialize portaudio: %v\n", err) - } - defer portaudio.Terminate() - - default_device, err := portaudio.DefaultInputDevice() - if err != nil { - log.Fatal("Failed to get default input device: %v\n", err) - } - fmt.Printf("Select default input device: %s\n", default_device.Name) - param := portaudio.StreamParameters{} - param.Input.Device = default_device - param.Input.Channels = 1 - param.Input.Latency = default_device.DefaultLowInputLatency - - param.SampleRate = 16000 - param.FramesPerBuffer = 0 - param.Flags = portaudio.ClipOff - +func initRecognizer() *sherpa.OnlineRecognizer { config := sherpa.OnlineRecognizerConfig{} config.FeatConfig = sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80} @@ -55,37 +35,48 @@ func main() { log.Println("Initializing recognizer (may take several seconds)") recognizer := sherpa.NewOnlineRecognizer(&config) log.Println("Recognizer created!") + return recognizer +} + +func main() { + ctx, err := malgo.InitContext(nil, malgo.ContextConfig{}, func(message string) { + fmt.Printf("LOG <%v>", message) + }) + chk(err) + + defer func() { + _ = ctx.Uninit() + ctx.Free() + }() + + deviceConfig := malgo.DefaultDeviceConfig(malgo.Duplex) + deviceConfig.Capture.Format = malgo.FormatS16 + deviceConfig.Capture.Channels = 1 + deviceConfig.Playback.Format = malgo.FormatS16 + deviceConfig.Playback.Channels = 1 + deviceConfig.SampleRate = 16000 + deviceConfig.Alsa.NoMMap = 1 + + recognizer := initRecognizer() defer sherpa.DeleteOnlineRecognizer(recognizer) stream := sherpa.NewOnlineStream(recognizer) defer sherpa.DeleteOnlineStream(stream) - // you can choose another value for 0.1 if you want - samplesPerCall := int32(param.SampleRate * 0.1) // 0.1 second - - samples := make([]float32, samplesPerCall) - s, err := portaudio.OpenStream(param, samples) - if err != nil { - log.Fatalf("Failed to open the stream") - } - defer s.Close() - chk(s.Start()) - var last_text string segment_idx := 0 - fmt.Println("Started! Please speak") - - for { - chk(s.Read()) - stream.AcceptWaveform(int(param.SampleRate), samples) + onRecvFrames := func(_, pSample []byte, framecount uint32) { + samples := samplesInt16ToFloat(pSample) + stream.AcceptWaveform(16000, samples) + // Please use a separate goroutine for decoding in your app for recognizer.IsReady(stream) { recognizer.Decode(stream) } - text := recognizer.GetResult(stream).Text + if len(text) != 0 && last_text != text { last_text = strings.ToLower(text) fmt.Printf("\r%d: %s", segment_idx, last_text) @@ -100,7 +91,18 @@ func main() { } } - chk(s.Stop()) + captureCallbacks := malgo.DeviceCallbacks{ + Data: onRecvFrames, + } + + device, err := malgo.InitDevice(ctx.Context, deviceConfig, captureCallbacks) + chk(err) + + err = device.Start() + chk(err) + fmt.Println("Started. Please speak. Press ctrl + C to exit") + fmt.Scanln() + device.Uninit() } func chk(err error) { @@ -108,3 +110,16 @@ func chk(err error) { panic(err) } } + +func samplesInt16ToFloat(inSamples []byte) []float32 { + numSamples := len(inSamples) / 2 + outSamples := make([]float32, numSamples) + + for i := 0; i != numSamples; i++ { + // Decode two bytes into an int16 using bit manipulation + s16 := int16(inSamples[2*i]) | int16(inSamples[2*i+1])<<8 + outSamples[i] = float32(s16) / 32768 + } + + return outSamples +} diff --git a/nodejs-addon-examples/test_asr_non_streaming_whisper.js b/nodejs-addon-examples/test_asr_non_streaming_whisper.js index 411bb455..da8a32bf 100644 --- a/nodejs-addon-examples/test_asr_non_streaming_whisper.js +++ b/nodejs-addon-examples/test_asr_non_streaming_whisper.js @@ -1,6 +1,6 @@ // Copyright (c) 2024 Xiaomi Corporation const sherpa_onnx = require('sherpa-onnx-node'); -console.log(`verison : ${sherpa_onnx.version}`); +console.log(`version : ${sherpa_onnx.version}`); console.log(`git sha1: ${sherpa_onnx.gitSha1}`); console.log(`git date: ${sherpa_onnx.gitDate}`); diff --git a/nodejs-examples/test-offline-whisper.js b/nodejs-examples/test-offline-whisper.js index a685f50d..4702ae3c 100644 --- a/nodejs-examples/test-offline-whisper.js +++ b/nodejs-examples/test-offline-whisper.js @@ -1,7 +1,7 @@ // Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) // const sherpa_onnx = require('sherpa-onnx'); -console.log(`verison : ${sherpa_onnx.version}`); +console.log(`version : ${sherpa_onnx.version}`); console.log(`git sha1: ${sherpa_onnx.gitSha1}`); console.log(`git date: ${sherpa_onnx.gitDate}`);