diff --git a/.github/workflows/test-go.yaml b/.github/workflows/test-go.yaml index 1e8ad9c8..9c995111 100644 --- a/.github/workflows/test-go.yaml +++ b/.github/workflows/test-go.yaml @@ -134,6 +134,53 @@ jobs: name: ${{ matrix.os }}-libs path: to-upload/ + - name: Test non-streaming decoding files + shell: bash + run: | + cd scripts/go/_internal/non-streaming-decode-files/ + ls -lh + go mod tidy + cat go.mod + go build + ls -lh + + echo "Test Moonshine" + ./run-moonshine.sh + rm -rf sherpa-onnx-* + + echo "Test SenseVoice ctc" + ./run-sense-voice-small.sh + rm -rf sherpa-onnx-sense-* + + echo "Test telespeech ctc" + ./run-telespeech-ctc.sh + rm -rf sherpa-onnx-telespeech-ctc-* + + echo "Test transducer" + ./run-transducer.sh + rm -rf sherpa-onnx-zipformer-en-2023-06-26 + + echo "Test transducer" + ./run-transducer.sh + rm -rf sherpa-onnx-zipformer-en-2023-06-26 + + echo "Test paraformer" + ./run-paraformer.sh + ./run-paraformer-itn.sh + rm -rf sherpa-onnx-paraformer-zh-2023-09-14 + + echo "Test NeMo CTC" + ./run-nemo-ctc.sh + rm -rf sherpa-onnx-nemo-ctc-en-conformer-medium + + echo "Test Whisper tiny.en" + ./run-whisper.sh + rm -rf sherpa-onnx-whisper-tiny.en + + echo "Test Tdnn yesno" + ./run-tdnn-yesno.sh + rm -rf sherpa-onnx-tdnn-yesno + - name: Test adding punctuation shell: bash run: | @@ -193,49 +240,6 @@ jobs: name: tts-waves-${{ matrix.os }} path: tts-waves - - name: Test non-streaming decoding files - shell: bash - run: | - cd scripts/go/_internal/non-streaming-decode-files/ - ls -lh - go mod tidy - cat go.mod - go build - ls -lh - - echo "Test SenseVoice ctc" - ./run-sense-voice-small.sh - rm -rf sherpa-onnx-sense-* - - echo "Test telespeech ctc" - ./run-telespeech-ctc.sh - rm -rf sherpa-onnx-telespeech-ctc-* - - echo "Test transducer" - ./run-transducer.sh - rm -rf sherpa-onnx-zipformer-en-2023-06-26 - - echo "Test transducer" - ./run-transducer.sh - rm -rf sherpa-onnx-zipformer-en-2023-06-26 - - echo "Test paraformer" - ./run-paraformer.sh - 
./run-paraformer-itn.sh - rm -rf sherpa-onnx-paraformer-zh-2023-09-14 - - echo "Test NeMo CTC" - ./run-nemo-ctc.sh - rm -rf sherpa-onnx-nemo-ctc-en-conformer-medium - - echo "Test Whisper tiny.en" - ./run-whisper.sh - rm -rf sherpa-onnx-whisper-tiny.en - - echo "Test Tdnn yesno" - ./run-tdnn-yesno.sh - rm -rf sherpa-onnx-tdnn-yesno - - name: Test streaming decoding files shell: bash run: | diff --git a/go-api-examples/README.md b/go-api-examples/README.md index 91f2c76e..e16dab69 100644 --- a/go-api-examples/README.md +++ b/go-api-examples/README.md @@ -6,28 +6,41 @@ Please refer to the documentation https://k2-fsa.github.io/sherpa/onnx/go-api/index.html for details. +- [./add-punctuation](./add-punctuation) It shows how to use + a punctuation model to add punctuations to text + - [./non-streaming-decode-files](./non-streaming-decode-files) It shows how to use a non-streaming ASR model to decode files +- [./non-streaming-speaker-diarization](./non-streaming-speaker-diarization) It shows how to use + a speaker segmentation model and a speaker embedding model for speaker diarization. + - [./non-streaming-tts](./non-streaming-tts) It shows how to use a non-streaming TTS model to convert text to speech - [./real-time-speech-recognition-from-microphone](./real-time-speech-recognition-from-microphone) It shows how to use a streaming ASR model to recognize speech from a microphone in real-time +- [./speaker-identification](./speaker-identification) It shows how to use a speaker + embedding model for speaker identification. + +- [./streaming-decode-files](./streaming-decode-files) It shows how to use a streaming + model for streaming speech recognition + +- [./streaming-hlg-decoding](./streaming-hlg-decoding) It shows how to use a streaming + model for streaming speech recognition with HLG decoding + - [./vad](./vad) It shows how to use silero VAD with Golang. 
-- [./vad-asr-whisper](./vad-asr-whisper) It shows how to use silero VAD + Whisper +- [./vad-asr-paraformer](./vad-asr-paraformer) It shows how to use silero VAD + Paraformer for speech recognition. -- [./vad-asr-paraformer](./vad-asr-paraformer) It shows how to use silero VAD + Paraformer +- [./vad-asr-whisper](./vad-asr-whisper) It shows how to use silero VAD + Whisper for speech recognition. + +- [./vad-speaker-identification](./vad-speaker-identification) It shows how to use Go API for VAD + speaker identification. - [./vad-spoken-language-identification](./vad-spoken-language-identification) It shows how to use silero VAD + Whisper for spoken language identification. -- [./speaker-identification](./speaker-identification) It shows how to use Go API for speaker identification. - -- [./vad-speaker-identification](./vad-speaker-identification) It shows how to use Go API for VAD + speaker identification. - [sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx diff --git a/go-api-examples/non-streaming-decode-files/main.go b/go-api-examples/non-streaming-decode-files/main.go index 5373dcf2..92b23dc1 100644 --- a/go-api-examples/non-streaming-decode-files/main.go +++ b/go-api-examples/non-streaming-decode-files/main.go @@ -34,6 +34,11 @@ func main() { flag.StringVar(&config.ModelConfig.Whisper.Task, "whisper-task", "transcribe", "transcribe or translate") flag.IntVar(&config.ModelConfig.Whisper.TailPaddings, "whisper-tail-paddings", -1, "tail paddings for whisper") + flag.StringVar(&config.ModelConfig.Moonshine.Preprocessor, "moonshine-preprocessor", "", "Path to the moonshine preprocessor model") + flag.StringVar(&config.ModelConfig.Moonshine.Encoder, "moonshine-encoder", "", "Path to the moonshine encoder model") + flag.StringVar(&config.ModelConfig.Moonshine.UncachedDecoder, "moonshine-uncached-decoder", "", "Path to the moonshine uncached decoder model") + flag.StringVar(&config.ModelConfig.Moonshine.CachedDecoder, "moonshine-cached-decoder", "", "Path to the 
moonshine cached decoder model") + flag.StringVar(&config.ModelConfig.Tdnn.Model, "tdnn-model", "", "Path to the tdnn model") flag.StringVar(&config.ModelConfig.SenseVoice.Model, "sense-voice-model", "", "Path to the SenseVoice model") @@ -85,12 +90,8 @@ func main() { log.Println("Emotion: " + result.Emotion) log.Println("Lang: " + result.Lang) log.Println("Event: " + result.Event) - for _, v := range result.Timestamps { - log.Printf("Timestamp: %+v\n", v) - } - for _, v := range result.Tokens { - log.Println("Token: " + v) - } + log.Printf("Timestamp: %v\n", result.Timestamps) + log.Printf("Tokens: %v\n", result.Tokens) log.Printf("Wave duration: %v seconds", float32(len(samples))/float32(sampleRate)) } diff --git a/go-api-examples/non-streaming-decode-files/run-moonshine.sh b/go-api-examples/non-streaming-decode-files/run-moonshine.sh new file mode 100755 index 00000000..409101e4 --- /dev/null +++ b/go-api-examples/non-streaming-decode-files/run-moonshine.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! 
-f ./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 +fi + +go mod tidy +go build + +./non-streaming-decode-files \ + --moonshine-preprocessor=./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx \ + --moonshine-encoder=./sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx \ + --moonshine-uncached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx \ + --moonshine-cached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx \ + --tokens=./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt \ + ./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav + diff --git a/scripts/go/_internal/non-streaming-decode-files/run-moonshine.sh b/scripts/go/_internal/non-streaming-decode-files/run-moonshine.sh new file mode 120000 index 00000000..95064f1a --- /dev/null +++ b/scripts/go/_internal/non-streaming-decode-files/run-moonshine.sh @@ -0,0 +1 @@ +../../../../go-api-examples/non-streaming-decode-files/run-moonshine.sh \ No newline at end of file diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go index 30ca31dc..cde6513d 100644 --- a/scripts/go/sherpa_onnx.go +++ b/scripts/go/sherpa_onnx.go @@ -382,6 +382,13 @@ type OfflineWhisperModelConfig struct { TailPaddings int } +type OfflineMoonshineModelConfig struct { + Preprocessor string + Encoder string + UncachedDecoder string + CachedDecoder string +} + type OfflineTdnnModelConfig struct { Model string } @@ -405,6 +412,7 @@ type OfflineModelConfig struct { Whisper OfflineWhisperModelConfig Tdnn OfflineTdnnModelConfig SenseVoice OfflineSenseVoiceModelConfig + Moonshine OfflineMoonshineModelConfig Tokens string // Path to tokens.txt // Number of threads to use for neural network computation @@ -515,6 +523,18 @@ func NewOfflineRecognizer(config 
*OfflineRecognizerConfig) *OfflineRecognizer { c.model_config.sense_voice.use_itn = C.int(config.ModelConfig.SenseVoice.UseInverseTextNormalization) + c.model_config.moonshine.preprocessor = C.CString(config.ModelConfig.Moonshine.Preprocessor) + defer C.free(unsafe.Pointer(c.model_config.moonshine.preprocessor)) + + c.model_config.moonshine.encoder = C.CString(config.ModelConfig.Moonshine.Encoder) + defer C.free(unsafe.Pointer(c.model_config.moonshine.encoder)) + + c.model_config.moonshine.uncached_decoder = C.CString(config.ModelConfig.Moonshine.UncachedDecoder) + defer C.free(unsafe.Pointer(c.model_config.moonshine.uncached_decoder)) + + c.model_config.moonshine.cached_decoder = C.CString(config.ModelConfig.Moonshine.CachedDecoder) + defer C.free(unsafe.Pointer(c.model_config.moonshine.cached_decoder)) + c.model_config.tokens = C.CString(config.ModelConfig.Tokens) defer C.free(unsafe.Pointer(c.model_config.tokens))