Add Go API for Moonshine models (#1479)

This commit is contained in:
Fangjun Kuang
2024-10-27 09:39:09 +08:00
committed by GitHub
parent 052b8645ba
commit 3d3edabb5f
6 changed files with 115 additions and 55 deletions

View File

@@ -6,28 +6,41 @@ Please refer to the documentation
https://k2-fsa.github.io/sherpa/onnx/go-api/index.html
for details.
- [./add-punctuation](./add-punctuation) It shows how to use
a punctuation model to add punctuations to text
- [./non-streaming-decode-files](./non-streaming-decode-files) It shows how to use
a non-streaming ASR model to decode files
- [./non-streaming-speaker-diarization](./non-streaming-speaker-diarization) It shows how to use
a speaker segmentation model and a speaker embedding model for speaker diarization.
- [./non-streaming-tts](./non-streaming-tts) It shows how to use a non-streaming TTS
model to convert text to speech
- [./real-time-speech-recognition-from-microphone](./real-time-speech-recognition-from-microphone)
It shows how to use a streaming ASR model to recognize speech from a microphone in real-time
- [./speaker-identification](./speaker-identification) It shows how to use a speaker
embedding model for speaker identification.
- [./streaming-decode-files](./streaming-decode-files) It shows how to use a streaming
model for streaming speech recognition
- [./streaming-hlg-decoding](./streaming-hlg-decoding) It shows how to use a streaming
model for streaming speech recognition with HLG decoding
- [./vad](./vad) It shows how to use silero VAD with Golang.
- [./vad-asr-whisper](./vad-asr-whisper) It shows how to use silero VAD + Whisper
for speech recognition.
- [./vad-asr-paraformer](./vad-asr-paraformer) It shows how to use silero VAD + Paraformer
for speech recognition.
- [./vad-asr-paraformer](./vad-asr-paraformer) It shows how to use silero VAD + Paraformer
for speech recognition.
- [./vad-asr-whisper](./vad-asr-whisper) It shows how to use silero VAD + Whisper
for speech recognition.
- [./vad-speaker-identification](./vad-speaker-identification) It shows how to use Go API for VAD + speaker identification.
- [./vad-spoken-language-identification](./vad-spoken-language-identification) It shows how to use silero VAD + Whisper
for spoken language identification.
- [./speaker-identification](./speaker-identification) It shows how to use Go API for speaker identification.
- [./vad-speaker-identification](./vad-speaker-identification) It shows how to use Go API for VAD + speaker identification.
[sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx

View File

@@ -34,6 +34,11 @@ func main() {
flag.StringVar(&config.ModelConfig.Whisper.Task, "whisper-task", "transcribe", "transcribe or translate")
flag.IntVar(&config.ModelConfig.Whisper.TailPaddings, "whisper-tail-paddings", -1, "tail paddings for whisper")
flag.StringVar(&config.ModelConfig.Moonshine.Preprocessor, "moonshine-preprocessor", "", "Path to the moonshine preprocessor model")
flag.StringVar(&config.ModelConfig.Moonshine.Encoder, "moonshine-encoder", "", "Path to the moonshine encoder model")
flag.StringVar(&config.ModelConfig.Moonshine.UncachedDecoder, "moonshine-uncached-decoder", "", "Path to the moonshine uncached decoder model")
flag.StringVar(&config.ModelConfig.Moonshine.CachedDecoder, "moonshine-cached-decoder", "", "Path to the moonshine cached decoder model")
flag.StringVar(&config.ModelConfig.Tdnn.Model, "tdnn-model", "", "Path to the tdnn model")
flag.StringVar(&config.ModelConfig.SenseVoice.Model, "sense-voice-model", "", "Path to the SenseVoice model")
@@ -85,12 +90,8 @@ func main() {
log.Println("Emotion: " + result.Emotion)
log.Println("Lang: " + result.Lang)
log.Println("Event: " + result.Event)
for _, v := range result.Timestamps {
log.Printf("Timestamp: %+v\n", v)
}
for _, v := range result.Tokens {
log.Println("Token: " + v)
}
log.Printf("Timestamp: %v\n", result.Timestamps)
log.Printf("Tokens: %v\n", result.Tokens)
log.Printf("Wave duration: %v seconds", float32(len(samples))/float32(sampleRate))
}

View File

@@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Download the Moonshine tiny English int8 ASR model (if it is not already
# present), build the Go example, and decode a test wave file with it.
set -ex

model_dir=sherpa-onnx-moonshine-tiny-en-int8

# tokens.txt is the last-extracted sentinel: if it exists, assume the model
# archive was already downloaded and unpacked.
if [ ! -f ./$model_dir/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$model_dir.tar.bz2
  tar xvf $model_dir.tar.bz2
  rm $model_dir.tar.bz2
fi

go mod tidy
go build

# Run the non-streaming decoder with the four Moonshine model components.
./non-streaming-decode-files \
  --moonshine-preprocessor=./$model_dir/preprocess.onnx \
  --moonshine-encoder=./$model_dir/encode.int8.onnx \
  --moonshine-uncached-decoder=./$model_dir/uncached_decode.int8.onnx \
  --moonshine-cached-decoder=./$model_dir/cached_decode.int8.onnx \
  --tokens=./$model_dir/tokens.txt \
  ./$model_dir/test_wavs/0.wav