diff --git a/.github/scripts/test-dot-net.sh b/.github/scripts/test-dot-net.sh index c5c6d5a4..b757781c 100755 --- a/.github/scripts/test-dot-net.sh +++ b/.github/scripts/test-dot-net.sh @@ -2,7 +2,10 @@ cd dotnet-examples/ -cd spoken-language-identification +cd streaming-hlg-decoding/ +./run.sh + +cd ../spoken-language-identification ./run.sh cd ../online-decode-files diff --git a/.github/scripts/test-nodejs-npm.sh b/.github/scripts/test-nodejs-npm.sh index c205d388..1531aff2 100755 --- a/.github/scripts/test-nodejs-npm.sh +++ b/.github/scripts/test-nodejs-npm.sh @@ -58,6 +58,13 @@ rm sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2 node ./test-online-zipformer2-ctc.js rm -rf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13 + +curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 +tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 +rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 +node ./test-online-zipformer2-ctc-hlg.js +rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18 + # offline tts curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 diff --git a/.github/scripts/test-swift.sh b/.github/scripts/test-swift.sh index ec276c41..536c04c4 100755 --- a/.github/scripts/test-swift.sh +++ b/.github/scripts/test-swift.sh @@ -7,6 +7,10 @@ echo "pwd: $PWD" cd swift-api-examples ls -lh +./run-streaming-hlg-decode-file.sh +rm ./streaming-hlg-decode-file +rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18 + ./run-spoken-language-identification.sh rm -rf sherpa-onnx-whisper* @@ -31,4 +35,5 @@ sed -i.bak '20d' ./decode-file.swift ./run-decode-file-non-streaming.sh + ls -lh diff --git a/.github/workflows/test-dot-net.yaml b/.github/workflows/test-dot-net.yaml index aa8e7b1e..243b4f1a 100644 --- a/.github/workflows/test-dot-net.yaml +++ 
b/.github/workflows/test-dot-net.yaml @@ -178,6 +178,7 @@ jobs: cp -v scripts/dotnet/examples/online-decode-files.csproj dotnet-examples/online-decode-files/ cp -v scripts/dotnet/examples/speech-recognition-from-microphone.csproj dotnet-examples/speech-recognition-from-microphone/ cp -v scripts/dotnet/examples/spoken-language-identification.csproj dotnet-examples/spoken-language-identification/ + cp -v scripts/dotnet/examples/streaming-hlg-decoding.csproj dotnet-examples/streaming-hlg-decoding ls -lh /tmp diff --git a/.github/workflows/test-go-package.yaml b/.github/workflows/test-go-package.yaml index d761be4f..27132950 100644 --- a/.github/workflows/test-go-package.yaml +++ b/.github/workflows/test-go-package.yaml @@ -66,12 +66,77 @@ jobs: run: | gcc --version - - name: Test speaker identification + - name: Test streaming HLG decoding (Linux/macOS) + if: matrix.os != 'windows-latest' + shell: bash + run: | + cd go-api-examples/streaming-hlg-decoding/ + ./run.sh + + - name: Test speaker identification (Linux/macOS) + if: matrix.os != 'windows-latest' shell: bash run: | cd go-api-examples/speaker-identification ./run.sh + - name: Test speaker identification (Win64) + if: matrix.os == 'windows-latest' && matrix.arch == 'x64' + shell: bash + run: | + cd go-api-examples/speaker-identification + go mod tidy + cat go.mod + go build + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx + git clone https://github.com/csukuangfj/sr-data + ls -lh + echo $PWD + ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/ + ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/* + cp -v /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/sherpa-onnx-go-windows*/lib/x86_64-pc-windows-gnu/*.dll . 
+ ls -lh + go mod tidy + go build + go run ./main.go + + - name: Test speaker identification (Win32) + if: matrix.os == 'windows-latest' && matrix.arch == 'x86' + shell: bash + run: | + cd go-api-examples/speaker-identification + go mod tidy + cat go.mod + ls -lh + + go env GOARCH + go env + echo "------------------------------" + go env -w GOARCH=386 + go env -w CGO_ENABLED=1 + go env + + go clean + go build + + echo $PWD + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx + git clone https://github.com/csukuangfj/sr-data + ls -lh + echo $PWD + ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/ + ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/* + cp -v /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/sherpa-onnx-go-windows*/lib/i686-pc-windows-gnu/*.dll . + ls -lh + go mod tidy + go build + go run ./main.go + + rm -rf sr-data + rm -rf *.onnx + - name: Test non-streaming TTS (Linux/macOS) if: matrix.os != 'windows-latest' shell: bash diff --git a/.github/workflows/test-go.yaml b/.github/workflows/test-go.yaml index 298403ec..17af77e6 100644 --- a/.github/workflows/test-go.yaml +++ b/.github/workflows/test-go.yaml @@ -74,6 +74,12 @@ jobs: go mod tidy go build + - name: Test streaming HLG decoding + shell: bash + run: | + cd scripts/go/_internal/streaming-hlg-decoding/ + ./run.sh + - name: Test speaker identification shell: bash run: | diff --git a/c-api-examples/CMakeLists.txt b/c-api-examples/CMakeLists.txt index 06956324..4c3669d1 100644 --- a/c-api-examples/CMakeLists.txt +++ b/c-api-examples/CMakeLists.txt @@ -15,6 +15,9 @@ target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api) add_executable(speaker-identification-c-api speaker-identification-c-api.c) target_link_libraries(speaker-identification-c-api sherpa-onnx-c-api) +add_executable(streaming-hlg-decode-file-c-api streaming-hlg-decode-file-c-api.c) 
+target_link_libraries(streaming-hlg-decode-file-c-api sherpa-onnx-c-api) + +if(SHERPA_ONNX_HAS_ALSA) add_subdirectory(./asr-microphone-example) elseif((UNIX AND NOT APPLE) OR LINUX) diff --git a/c-api-examples/streaming-hlg-decode-file-c-api.c b/c-api-examples/streaming-hlg-decode-file-c-api.c new file mode 100644 index 00000000..83422def --- /dev/null +++ b/c-api-examples/streaming-hlg-decode-file-c-api.c @@ -0,0 +1,130 @@ +// c-api-examples/streaming-hlg-decode-file-c-api.c +// +// Copyright (c) 2024 Xiaomi Corporation +/* +We use the following model as an example + +// clang-format off + +Download the model from +https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 + +tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 +rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 + +build/bin/streaming-hlg-decode-file-c-api + +(The above model is from https://github.com/k2-fsa/icefall/pull/1557) +*/ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "sherpa-onnx/c-api/c-api.h" + +int32_t main() { + // clang-format off + // + // Please download the model from + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 + const char *model = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx"; + const char *tokens = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt"; + const char *graph = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst"; + const char *wav_filename = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav"; + // clang-format on + + SherpaOnnxOnlineRecognizerConfig config; + + memset(&config, 0, sizeof(config)); + config.feat_config.sample_rate = 16000; + config.feat_config.feature_dim = 80; + config.model_config.zipformer2_ctc.model = model; + config.model_config.tokens = tokens; + 
config.model_config.num_threads = 1; + config.model_config.provider = "cpu"; + config.model_config.debug = 0; + config.ctc_fst_decoder_config.graph = graph; + const SherpaOnnxOnlineRecognizer *recognizer = + CreateOnlineRecognizer(&config); + if (!recognizer) { + fprintf(stderr, "Failed to create recognizer"); + exit(-1); + } + + const SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer); + + const SherpaOnnxDisplay *display = CreateDisplay(50); + int32_t segment_id = 0; + + const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename); + if (wave == NULL) { + fprintf(stderr, "Failed to read %s\n", wav_filename); + exit(-1); + } + +// simulate streaming. You can choose an arbitrary N +#define N 3200 + + int16_t buffer[N]; + float samples[N]; + fprintf(stderr, "sample rate: %d, num samples: %d, duration: %.2f s\n", + wave->sample_rate, wave->num_samples, + (float)wave->num_samples / wave->sample_rate); + + int32_t k = 0; + while (k < wave->num_samples) { + int32_t start = k; + int32_t end = + (start + N > wave->num_samples) ? 
wave->num_samples : (start + N); + k += N; + + AcceptWaveform(stream, wave->sample_rate, wave->samples + start, + end - start); + while (IsOnlineStreamReady(recognizer, stream)) { + DecodeOnlineStream(recognizer, stream); + } + + const SherpaOnnxOnlineRecognizerResult *r = + GetOnlineStreamResult(recognizer, stream); + + if (strlen(r->text)) { + SherpaOnnxPrint(display, segment_id, r->text); + } + + if (IsEndpoint(recognizer, stream)) { + if (strlen(r->text)) { + ++segment_id; + } + Reset(recognizer, stream); + } + + DestroyOnlineRecognizerResult(r); + } + + // add some tail padding + float tail_paddings[4800] = {0}; // 0.3 seconds at 16 kHz sample rate + AcceptWaveform(stream, wave->sample_rate, tail_paddings, 4800); + + SherpaOnnxFreeWave(wave); + + InputFinished(stream); + while (IsOnlineStreamReady(recognizer, stream)) { + DecodeOnlineStream(recognizer, stream); + } + + const SherpaOnnxOnlineRecognizerResult *r = + GetOnlineStreamResult(recognizer, stream); + + if (strlen(r->text)) { + SherpaOnnxPrint(display, segment_id, r->text); + } + + DestroyOnlineRecognizerResult(r); + + DestroyDisplay(display); + DestroyOnlineStream(stream); + DestroyOnlineRecognizer(recognizer); + fprintf(stderr, "\n"); + + return 0; +} diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index fe2992ed..ae22bfab 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -5,7 +5,7 @@ function(download_onnxruntime) message(STATUS "CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}") message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") if(SHERPA_ONNX_ENABLE_WASM) - include(onnxruntime-wasm-simd) + include(onnxruntime-wasm-simd) elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL riscv64) if(BUILD_SHARED_LIBS) include(onnxruntime-linux-riscv64) diff --git a/dotnet-examples/sherpa-onnx.sln b/dotnet-examples/sherpa-onnx.sln index 6c469ba3..ff514df3 100644 --- a/dotnet-examples/sherpa-onnx.sln +++ b/dotnet-examples/sherpa-onnx.sln @@ -15,6 +15,8 @@ 
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-tts-play", "offline EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "spoken-language-identification", "spoken-language-identification\spoken-language-identification.csproj", "{3D7CF3D6-AC45-4D50-9619-5687B1443E94}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "streaming-hlg-decoding", "streaming-hlg-decoding\streaming-hlg-decoding.csproj", "{C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -48,5 +50,9 @@ Global {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Debug|Any CPU.Build.0 = Debug|Any CPU {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Release|Any CPU.ActiveCfg = Release|Any CPU {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Release|Any CPU.Build.0 = Release|Any CPU + {C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Debug|Any CPU.Build.0 = Debug|Any CPU + {C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Release|Any CPU.ActiveCfg = Release|Any CPU + {C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection EndGlobal diff --git a/dotnet-examples/streaming-hlg-decoding/Program.cs b/dotnet-examples/streaming-hlg-decoding/Program.cs new file mode 100644 index 00000000..6ac7c8c9 --- /dev/null +++ b/dotnet-examples/streaming-hlg-decoding/Program.cs @@ -0,0 +1,66 @@ +// Copyright (c) 2024 Xiaomi Corporation +// +// This file shows how to do streaming HLG decoding. +// +// 1. Download the model for testing +// +// curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 +// tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 +// rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 +// +// 2. 
Now run it +// +// dotnet run + +using SherpaOnnx; +using System.Collections.Generic; +using System; + +class StreamingHlgDecodingDemo +{ + + static void Main(string[] args) + { + var config = new OnlineRecognizerConfig(); + config.FeatConfig.SampleRate = 16000; + config.FeatConfig.FeatureDim = 80; + config.ModelConfig.Zipformer2Ctc.Model = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx"; + + config.ModelConfig.Tokens = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt"; + config.ModelConfig.Provider = "cpu"; + config.ModelConfig.NumThreads = 1; + config.ModelConfig.Debug = 0; + config.CtcFstDecoderConfig.Graph = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst"; + + OnlineRecognizer recognizer = new OnlineRecognizer(config); + + var filename = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav"; + + WaveReader waveReader = new WaveReader(filename); + OnlineStream s = recognizer.CreateStream(); + s.AcceptWaveform(waveReader.SampleRate, waveReader.Samples); + + float[] tailPadding = new float[(int)(waveReader.SampleRate * 0.3)]; + s.AcceptWaveform(waveReader.SampleRate, tailPadding); + s.InputFinished(); + + while (recognizer.IsReady(s)) + { + recognizer.Decode(s); + } + + OnlineRecognizerResult r = recognizer.GetResult(s); + var text = r.Text; + var tokens = r.Tokens; + Console.WriteLine("--------------------"); + Console.WriteLine(filename); + Console.WriteLine("text: {0}", text); + Console.WriteLine("tokens: [{0}]", string.Join(", ", tokens)); + Console.Write("timestamps: ["); + r.Timestamps.ToList().ForEach(i => Console.Write(String.Format("{0:0.00}", i) + ", ")); + Console.WriteLine("]"); + Console.WriteLine("--------------------"); + } +} + + diff --git a/dotnet-examples/streaming-hlg-decoding/WaveReader.cs b/dotnet-examples/streaming-hlg-decoding/WaveReader.cs new file mode 120000 index 00000000..bedfc634 --- /dev/null +++ 
b/dotnet-examples/streaming-hlg-decoding/WaveReader.cs @@ -0,0 +1 @@ +../online-decode-files/WaveReader.cs \ No newline at end of file diff --git a/dotnet-examples/streaming-hlg-decoding/run.sh b/dotnet-examples/streaming-hlg-decoding/run.sh new file mode 100755 index 00000000..2e031974 --- /dev/null +++ b/dotnet-examples/streaming-hlg-decoding/run.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! -f ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 + rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 +fi + +dotnet run -c Release diff --git a/dotnet-examples/streaming-hlg-decoding/streaming-hlg-decoding.csproj b/dotnet-examples/streaming-hlg-decoding/streaming-hlg-decoding.csproj new file mode 100644 index 00000000..6030ec85 --- /dev/null +++ b/dotnet-examples/streaming-hlg-decoding/streaming-hlg-decoding.csproj @@ -0,0 +1,15 @@ + + + + Exe + net6.0 + streaming_hlg_decoding + enable + enable + + + + + + + diff --git a/go-api-examples/streaming-hlg-decoding/go.mod b/go-api-examples/streaming-hlg-decoding/go.mod new file mode 100644 index 00000000..1b9b9893 --- /dev/null +++ b/go-api-examples/streaming-hlg-decoding/go.mod @@ -0,0 +1,3 @@ +module streaming-hlg-decoding + +go 1.12 diff --git a/go-api-examples/streaming-hlg-decoding/main.go b/go-api-examples/streaming-hlg-decoding/main.go new file mode 100644 index 00000000..8c0a9700 --- /dev/null +++ b/go-api-examples/streaming-hlg-decoding/main.go @@ -0,0 +1,109 @@ +package main + +import ( + "bytes" + "encoding/binary" + sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx" + "github.com/youpy/go-wav" + "log" + "os" + "strings" +) + +func main() { + log.SetFlags(log.LstdFlags | log.Lmicroseconds) + + config := sherpa.OnlineRecognizerConfig{} + config.FeatConfig = 
sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80} + + // please download model files from + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 + config.ModelConfig.Zipformer2Ctc.Model = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx" + config.ModelConfig.Tokens = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt" + + config.ModelConfig.NumThreads = 1 + config.ModelConfig.Debug = 0 + config.ModelConfig.Provider = "cpu" + config.CtcFstDecoderConfig.Graph = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst" + + wav_filename := "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav" + + samples, sampleRate := readWave(wav_filename) + + log.Println("Initializing recognizer (may take several seconds)") + recognizer := sherpa.NewOnlineRecognizer(&config) + log.Println("Recognizer created!") + defer sherpa.DeleteOnlineRecognizer(recognizer) + + log.Println("Start decoding!") + stream := sherpa.NewOnlineStream(recognizer) + defer sherpa.DeleteOnlineStream(stream) + + stream.AcceptWaveform(sampleRate, samples) + + tailPadding := make([]float32, int(float32(sampleRate)*0.3)) + stream.AcceptWaveform(sampleRate, tailPadding) + + for recognizer.IsReady(stream) { + recognizer.Decode(stream) + } + log.Println("Decoding done!") + result := recognizer.GetResult(stream) + log.Println(strings.ToLower(result.Text)) + log.Printf("Wave duration: %v seconds", float32(len(samples))/float32(sampleRate)) +} + +func readWave(filename string) (samples []float32, sampleRate int) { + file, _ := os.Open(filename) + defer file.Close() + + reader := wav.NewReader(file) + format, err := reader.Format() + if err != nil { + log.Fatalf("Failed to read wave format") + } + + if format.AudioFormat != 1 { + log.Fatalf("Support only PCM format. 
Given: %v\n", format.AudioFormat) + } + + if format.NumChannels != 1 { + log.Fatalf("Support only 1 channel wave file. Given: %v\n", format.NumChannels) + } + + if format.BitsPerSample != 16 { + log.Fatalf("Support only 16-bit per sample. Given: %v\n", format.BitsPerSample) + } + + reader.Duration() // so that it initializes reader.Size + + buf := make([]byte, reader.Size) + n, err := reader.Read(buf) + if n != int(reader.Size) { + log.Fatalf("Failed to read %v bytes. Returned %v bytes\n", reader.Size, n) + } + + samples = samplesInt16ToFloat(buf) + sampleRate = int(format.SampleRate) + + return +} + +func samplesInt16ToFloat(inSamples []byte) []float32 { + numSamples := len(inSamples) / 2 + outSamples := make([]float32, numSamples) + + for i := 0; i != numSamples; i++ { + s := inSamples[i*2 : (i+1)*2] + + var s16 int16 + buf := bytes.NewReader(s) + err := binary.Read(buf, binary.LittleEndian, &s16) + if err != nil { + log.Fatal("Failed to parse 16-bit sample") + } + outSamples[i] = float32(s16) / 32768 + } + + return outSamples +} diff --git a/go-api-examples/streaming-hlg-decoding/run.sh b/go-api-examples/streaming-hlg-decoding/run.sh new file mode 100755 index 00000000..fb7549c5 --- /dev/null +++ b/go-api-examples/streaming-hlg-decoding/run.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! 
-f ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 + rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 +fi + +go mod tidy +go build +ls -lh +./streaming-hlg-decoding diff --git a/nodejs-examples/README.md b/nodejs-examples/README.md index f2dc14c9..9c13bee5 100644 --- a/nodejs-examples/README.md +++ b/nodejs-examples/README.md @@ -174,3 +174,16 @@ wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherp tar xvf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2 node ./test-online-zipformer2-ctc.js ``` + +## ./test-online-zipformer2-ctc-hlg.js +[./test-online-zipformer2-ctc-hlg.js](./test-online-zipformer2-ctc-hlg.js) demonstrates +how to decode a file using a streaming zipformer2 CTC model with HLG. In the code +we use [sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18](https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2). 
+ +You can use the following command to run it: + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 +tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 +node ./test-online-zipformer2-ctc-hlg.js +``` diff --git a/nodejs-examples/test-online-paraformer-microphone.js b/nodejs-examples/test-online-paraformer-microphone.js index 4b76f4cd..591b4dbb 100644 --- a/nodejs-examples/test-online-paraformer-microphone.js +++ b/nodejs-examples/test-online-paraformer-microphone.js @@ -50,6 +50,10 @@ function createOnlineRecognizer() { rule3MinUtteranceLength: 20, hotwordsFile: '', hotwordsScore: 1.5, + ctcFstDecoderConfig: { + graph: '', + maxActive: 3000, + } }; return sherpa_onnx.createOnlineRecognizer(recognizerConfig); diff --git a/nodejs-examples/test-online-paraformer.js b/nodejs-examples/test-online-paraformer.js index 09982988..01b8feeb 100644 --- a/nodejs-examples/test-online-paraformer.js +++ b/nodejs-examples/test-online-paraformer.js @@ -51,6 +51,10 @@ function createOnlineRecognizer() { rule3MinUtteranceLength: 20, hotwordsFile: '', hotwordsScore: 1.5, + ctcFstDecoderConfig: { + graph: '', + maxActive: 3000, + } }; return sherpa_onnx.createOnlineRecognizer(recognizerConfig); diff --git a/nodejs-examples/test-online-transducer-microphone.js b/nodejs-examples/test-online-transducer-microphone.js index 9fa7c92c..6312b567 100644 --- a/nodejs-examples/test-online-transducer-microphone.js +++ b/nodejs-examples/test-online-transducer-microphone.js @@ -52,6 +52,10 @@ function createOnlineRecognizer() { rule3MinUtteranceLength: 20, hotwordsFile: '', hotwordsScore: 1.5, + ctcFstDecoderConfig: { + graph: '', + maxActive: 3000, + } }; return sherpa_onnx.createOnlineRecognizer(recognizerConfig); diff --git a/nodejs-examples/test-online-transducer.js b/nodejs-examples/test-online-transducer.js index 4293cbc9..e4bb46d2 100644 --- a/nodejs-examples/test-online-transducer.js 
+++ b/nodejs-examples/test-online-transducer.js @@ -53,6 +53,10 @@ function createOnlineRecognizer() { rule3MinUtteranceLength: 20, hotwordsFile: '', hotwordsScore: 1.5, + ctcFstDecoderConfig: { + graph: '', + maxActive: 3000, + } }; return sherpa_onnx.createOnlineRecognizer(recognizerConfig); diff --git a/nodejs-examples/test-online-zipformer2-ctc-hlg.js b/nodejs-examples/test-online-zipformer2-ctc-hlg.js new file mode 100644 index 00000000..1bf99992 --- /dev/null +++ b/nodejs-examples/test-online-zipformer2-ctc-hlg.js @@ -0,0 +1,125 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) +// +const fs = require('fs'); +const {Readable} = require('stream'); +const wav = require('wav'); + +const sherpa_onnx = require('sherpa-onnx'); + +function createOnlineRecognizer() { + let onlineTransducerModelConfig = { + encoder: '', + decoder: '', + joiner: '', + }; + + let onlineParaformerModelConfig = { + encoder: '', + decoder: '', + }; + + let onlineZipformer2CtcModelConfig = { + model: + './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx', + }; + + let onlineModelConfig = { + transducer: onlineTransducerModelConfig, + paraformer: onlineParaformerModelConfig, + zipformer2Ctc: onlineZipformer2CtcModelConfig, + tokens: './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt', + numThreads: 1, + provider: 'cpu', + debug: 0, + modelType: '', + }; + + let featureConfig = { + sampleRate: 16000, + featureDim: 80, + }; + + let recognizerConfig = { + featConfig: featureConfig, + modelConfig: onlineModelConfig, + decodingMethod: 'greedy_search', + maxActivePaths: 4, + enableEndpoint: 1, + rule1MinTrailingSilence: 2.4, + rule2MinTrailingSilence: 1.2, + rule3MinUtteranceLength: 20, + hotwordsFile: '', + hotwordsScore: 1.5, + ctcFstDecoderConfig: { + graph: './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst', + maxActive: 3000, + } + }; + + return 
sherpa_onnx.createOnlineRecognizer(recognizerConfig); +} + +const recognizer = createOnlineRecognizer(); +const stream = recognizer.createStream(); + +const waveFilename = + './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav'; + +const reader = new wav.Reader(); +const readable = new Readable().wrap(reader); + +function decode(samples) { + stream.acceptWaveform(gSampleRate, samples); + + while (recognizer.isReady(stream)) { + recognizer.decode(stream); + } + const text = recognizer.getResult(stream); + console.log(text); +} + +let gSampleRate = 16000; + +reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { + gSampleRate = sampleRate; + + if (audioFormat != 1) { + throw new Error(`Only support PCM format. Given ${audioFormat}`); + } + + if (channels != 1) { + throw new Error(`Only a single channel. Given ${channel}`); + } + + if (bitDepth != 16) { + throw new Error(`Only support 16-bit samples. Given ${bitDepth}`); + } +}); + +fs.createReadStream(waveFilename, {'highWaterMark': 4096}) + .pipe(reader) + .on('finish', function(err) { + // tail padding + const floatSamples = + new Float32Array(recognizer.config.featConfig.sampleRate * 0.5); + decode(floatSamples); + stream.free(); + recognizer.free(); + }); + +readable.on('readable', function() { + let chunk; + while ((chunk = readable.read()) != null) { + const int16Samples = new Int16Array( + chunk.buffer, chunk.byteOffset, + chunk.length / Int16Array.BYTES_PER_ELEMENT); + + const floatSamples = new Float32Array(int16Samples.length); + + for (let i = 0; i < floatSamples.length; i++) { + floatSamples[i] = int16Samples[i] / 32768.0; + } + + decode(floatSamples); + } +}); diff --git a/nodejs-examples/test-online-zipformer2-ctc.js b/nodejs-examples/test-online-zipformer2-ctc.js index 4f3506a2..2e85d69a 100644 --- a/nodejs-examples/test-online-zipformer2-ctc.js +++ b/nodejs-examples/test-online-zipformer2-ctc.js @@ -51,6 +51,10 @@ function createOnlineRecognizer() { 
rule3MinUtteranceLength: 20, hotwordsFile: '', hotwordsScore: 1.5, + ctcFstDecoderConfig: { + graph: '', + maxActive: 3000, + } }; return sherpa_onnx.createOnlineRecognizer(recognizerConfig); diff --git a/scripts/dotnet/examples/streaming-hlg-decoding.csproj b/scripts/dotnet/examples/streaming-hlg-decoding.csproj new file mode 100644 index 00000000..4b982c31 --- /dev/null +++ b/scripts/dotnet/examples/streaming-hlg-decoding.csproj @@ -0,0 +1,19 @@ + + + + Exe + net6.0 + streaming_hlg_decoding + enable + enable + + + + /tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json + + + + + + + diff --git a/scripts/dotnet/online.cs b/scripts/dotnet/online.cs index 09b827ad..a9dd95de 100644 --- a/scripts/dotnet/online.cs +++ b/scripts/dotnet/online.cs @@ -116,6 +116,21 @@ namespace SherpaOnnx public int FeatureDim; } + [StructLayout(LayoutKind.Sequential)] + public struct OnlineCtcFstDecoderConfig + { + public OnlineCtcFstDecoderConfig() + { + Graph = ""; + MaxActive = 3000; + } + + [MarshalAs(UnmanagedType.LPStr)] + public string Graph; + + public int MaxActive; + } + [StructLayout(LayoutKind.Sequential)] public struct OnlineRecognizerConfig { @@ -131,6 +146,7 @@ namespace SherpaOnnx Rule3MinUtteranceLength = 20.0F; HotwordsFile = ""; HotwordsScore = 1.5F; + CtcFstDecoderConfig = new OnlineCtcFstDecoderConfig(); } public FeatureConfig FeatConfig; public OnlineModelConfig ModelConfig; @@ -167,6 +183,8 @@ namespace SherpaOnnx /// Bonus score for each token in hotwords. 
public float HotwordsScore; + + public OnlineCtcFstDecoderConfig CtcFstDecoderConfig; } public class OnlineRecognizerResult diff --git a/scripts/go/_internal/streaming-hlg-decoding/.gitignore b/scripts/go/_internal/streaming-hlg-decoding/.gitignore new file mode 100644 index 00000000..4bc5d691 --- /dev/null +++ b/scripts/go/_internal/streaming-hlg-decoding/.gitignore @@ -0,0 +1 @@ +streaming-hlg-decoding diff --git a/scripts/go/_internal/streaming-hlg-decoding/go.mod b/scripts/go/_internal/streaming-hlg-decoding/go.mod new file mode 100644 index 00000000..55c0c92a --- /dev/null +++ b/scripts/go/_internal/streaming-hlg-decoding/go.mod @@ -0,0 +1,5 @@ +module streaming-hlg-decoding + +go 1.12 + +replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../ diff --git a/scripts/go/_internal/streaming-hlg-decoding/main.go b/scripts/go/_internal/streaming-hlg-decoding/main.go new file mode 120000 index 00000000..0b7bc3b9 --- /dev/null +++ b/scripts/go/_internal/streaming-hlg-decoding/main.go @@ -0,0 +1 @@ +../../../../go-api-examples/streaming-hlg-decoding/main.go \ No newline at end of file diff --git a/scripts/go/_internal/streaming-hlg-decoding/run.sh b/scripts/go/_internal/streaming-hlg-decoding/run.sh new file mode 120000 index 00000000..89440471 --- /dev/null +++ b/scripts/go/_internal/streaming-hlg-decoding/run.sh @@ -0,0 +1 @@ +../../../../go-api-examples/streaming-hlg-decoding/run.sh \ No newline at end of file diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go index 1b4c60ab..361d9775 100644 --- a/scripts/go/sherpa_onnx.go +++ b/scripts/go/sherpa_onnx.go @@ -99,6 +99,11 @@ type FeatureConfig struct { FeatureDim int } +type OnlineCtcFstDecoderConfig struct { + Graph string + MaxActive int +} + // Configuration for the online/streaming recognizer. 
type OnlineRecognizerConfig struct { FeatConfig FeatureConfig @@ -120,6 +125,7 @@ type OnlineRecognizerConfig struct { Rule1MinTrailingSilence float32 Rule2MinTrailingSilence float32 Rule3MinUtteranceLength float32 + CtcFstDecoderConfig OnlineCtcFstDecoderConfig } // It contains the recognition result for a online stream. @@ -190,6 +196,10 @@ func NewOnlineRecognizer(config *OnlineRecognizerConfig) *OnlineRecognizer { c.rule2_min_trailing_silence = C.float(config.Rule2MinTrailingSilence) c.rule3_min_utterance_length = C.float(config.Rule3MinUtteranceLength) + c.ctc_fst_decoder_config.graph = C.CString(config.CtcFstDecoderConfig.Graph) + defer C.free(unsafe.Pointer(c.ctc_fst_decoder_config.graph)) + c.ctc_fst_decoder_config.max_active = C.int(config.CtcFstDecoderConfig.MaxActive) + recognizer := &OnlineRecognizer{} recognizer.impl = C.CreateOnlineRecognizer(&c) diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index 685091c1..8baecd06 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -99,6 +99,11 @@ SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer( recognizer_config.hotwords_score = SHERPA_ONNX_OR(config->hotwords_score, 1.5); + recognizer_config.ctc_fst_decoder_config.graph = + SHERPA_ONNX_OR(config->ctc_fst_decoder_config.graph, ""); + recognizer_config.ctc_fst_decoder_config.max_active = + SHERPA_ONNX_OR(config->ctc_fst_decoder_config.max_active, 3000); + if (config->model_config.debug) { SHERPA_ONNX_LOGE("%s\n", recognizer_config.ToString().c_str()); } diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 66c33bf2..55ad4663 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -96,6 +96,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxFeatureConfig { int32_t feature_dim; } SherpaOnnxFeatureConfig; +SHERPA_ONNX_API typedef struct SherpaOnnxOnlineCtcFstDecoderConfig { + const char *graph; + int32_t max_active; +} SherpaOnnxOnlineCtcFstDecoderConfig; + SHERPA_ONNX_API typedef struct 
SherpaOnnxOnlineRecognizerConfig { SherpaOnnxFeatureConfig feat_config; SherpaOnnxOnlineModelConfig model_config; @@ -131,6 +136,8 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerConfig { /// Bonus score for each token in hotwords. float hotwords_score; + + SherpaOnnxOnlineCtcFstDecoderConfig ctc_fst_decoder_config; } SherpaOnnxOnlineRecognizerConfig; SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerResult { diff --git a/swift-api-examples/.gitignore b/swift-api-examples/.gitignore index 4b76201d..f4290242 100644 --- a/swift-api-examples/.gitignore +++ b/swift-api-examples/.gitignore @@ -7,3 +7,4 @@ vits-vctk sherpa-onnx-paraformer-zh-2023-09-14 !*.sh *.bak +streaming-hlg-decode-file diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift index c93fbf37..b463c866 100644 --- a/swift-api-examples/SherpaOnnx.swift +++ b/swift-api-examples/SherpaOnnx.swift @@ -111,6 +111,15 @@ func sherpaOnnxFeatureConfig( feature_dim: Int32(featureDim)) } +func sherpaOnnxOnlineCtcFstDecoderConfig( + graph: String = "", + maxActive: Int = 3000 +) -> SherpaOnnxOnlineCtcFstDecoderConfig { + return SherpaOnnxOnlineCtcFstDecoderConfig( + graph: toCPointer(graph), + max_active: Int32(maxActive)) +} + func sherpaOnnxOnlineRecognizerConfig( featConfig: SherpaOnnxFeatureConfig, modelConfig: SherpaOnnxOnlineModelConfig, @@ -121,7 +130,8 @@ func sherpaOnnxOnlineRecognizerConfig( decodingMethod: String = "greedy_search", maxActivePaths: Int = 4, hotwordsFile: String = "", - hotwordsScore: Float = 1.5 + hotwordsScore: Float = 1.5, + ctcFstDecoderConfig: SherpaOnnxOnlineCtcFstDecoderConfig = sherpaOnnxOnlineCtcFstDecoderConfig() ) -> SherpaOnnxOnlineRecognizerConfig { return SherpaOnnxOnlineRecognizerConfig( feat_config: featConfig, @@ -133,7 +143,9 @@ func sherpaOnnxOnlineRecognizerConfig( rule2_min_trailing_silence: rule2MinTrailingSilence, rule3_min_utterance_length: rule3MinUtteranceLength, hotwords_file: toCPointer(hotwordsFile), - 
hotwords_score: hotwordsScore) + hotwords_score: hotwordsScore, + ctc_fst_decoder_config: ctcFstDecoderConfig + ) } /// Wrapper for recognition result. diff --git a/swift-api-examples/run-streaming-hlg-decode-file.sh b/swift-api-examples/run-streaming-hlg-decode-file.sh new file mode 100755 index 00000000..5b641b8b --- /dev/null +++ b/swift-api-examples/run-streaming-hlg-decode-file.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! -d ../build-swift-macos ]; then + echo "Please run ../build-swift-macos.sh first!" + exit 1 +fi + +if [ ! -f ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst ]; then + echo "Downloading the pre-trained model for testing." + + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 + rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 +fi + +if [ ! -e ./streaming-hlg-decode-file ]; then + # Note: We use -lc++ to link against libc++ instead of libstdc++ + swiftc \ + -lc++ \ + -I ../build-swift-macos/install/include \ + -import-objc-header ./SherpaOnnx-Bridging-Header.h \ + ./streaming-hlg-decode-file.swift ./SherpaOnnx.swift \ + -L ../build-swift-macos/install/lib/ \ + -l sherpa-onnx \ + -l onnxruntime \ + -o streaming-hlg-decode-file + + strip ./streaming-hlg-decode-file +else + echo "./streaming-hlg-decode-file exists - skip building" +fi + +export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH +./streaming-hlg-decode-file diff --git a/swift-api-examples/streaming-hlg-decode-file.swift b/swift-api-examples/streaming-hlg-decode-file.swift new file mode 100644 index 00000000..e57d118e --- /dev/null +++ b/swift-api-examples/streaming-hlg-decode-file.swift @@ -0,0 +1,79 @@ +import AVFoundation + +extension AudioBuffer { + func array() -> [Float] { + return Array(UnsafeBufferPointer(self)) + } +} + +extension AVAudioPCMBuffer { + 
func array() -> [Float] { + return self.audioBufferList.pointee.mBuffers.array() + } +} + +func run() { + let filePath = + "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav" + let model = + "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx" + let tokens = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt" + let zipfomer2CtcModelConfig = sherpaOnnxOnlineZipformer2CtcModelConfig( + model: model + ) + + let modelConfig = sherpaOnnxOnlineModelConfig( + tokens: tokens, + zipformer2Ctc: zipfomer2CtcModelConfig + ) + + let featConfig = sherpaOnnxFeatureConfig( + sampleRate: 16000, + featureDim: 80 + ) + + let ctcFstDecoderConfig = sherpaOnnxOnlineCtcFstDecoderConfig( + graph: "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst", + maxActive: 3000 + ) + + var config = sherpaOnnxOnlineRecognizerConfig( + featConfig: featConfig, + modelConfig: modelConfig, + ctcFstDecoderConfig: ctcFstDecoderConfig + ) + + let recognizer = SherpaOnnxRecognizer(config: &config) + + let fileURL: NSURL = NSURL(fileURLWithPath: filePath) + let audioFile = try! AVAudioFile(forReading: fileURL as URL) + + let audioFormat = audioFile.processingFormat + assert(audioFormat.channelCount == 1) + assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32) + + let audioFrameCount = UInt32(audioFile.length) + let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount) + + try! audioFile.read(into: audioFileBuffer!) + let array: [Float]! 
= audioFileBuffer?.array() + recognizer.acceptWaveform(samples: array, sampleRate: Int(audioFormat.sampleRate)) + + let tailPadding = [Float](repeating: 0.0, count: 3200) + recognizer.acceptWaveform(samples: tailPadding, sampleRate: Int(audioFormat.sampleRate)) + + recognizer.inputFinished() + while recognizer.isReady() { + recognizer.decode() + } + + let result = recognizer.getResult() + print("\nresult is:\n\(result.text)") +} + +@main +struct App { + static func main() { + run() + } +} diff --git a/wasm/asr/sherpa-onnx-asr.js b/wasm/asr/sherpa-onnx-asr.js index 55c8f2d9..e61757cf 100644 --- a/wasm/asr/sherpa-onnx-asr.js +++ b/wasm/asr/sherpa-onnx-asr.js @@ -43,6 +43,10 @@ function freeConfig(config, Module) { freeConfig(config.lm, Module) } + if ('ctcFstDecoder' in config) { + freeConfig(config.ctcFstDecoder, Module) + } + Module._free(config.ptr); } @@ -193,11 +197,26 @@ function initSherpaOnnxFeatureConfig(config, Module) { return {ptr: ptr, len: len}; } +function initSherpaOnnxOnlineCtcFstDecoderConfig(config, Module) { + const len = 2 * 4; + const ptr = Module._malloc(len); + + const graphLen = Module.lengthBytesUTF8(config.graph) + 1; + const buffer = Module._malloc(graphLen); + Module.stringToUTF8(config.graph, buffer, graphLen); + + Module.setValue(ptr, buffer, 'i8*'); + Module.setValue(ptr + 4, config.maxActive, 'i32'); + return {ptr: ptr, len: len, buffer: buffer}; +} + function initSherpaOnnxOnlineRecognizerConfig(config, Module) { const feat = initSherpaOnnxFeatureConfig(config.featConfig, Module); const model = initSherpaOnnxOnlineModelConfig(config.modelConfig, Module); + const ctcFstDecoder = initSherpaOnnxOnlineCtcFstDecoderConfig( + config.ctcFstDecoderConfig, Module) - const len = feat.len + model.len + 8 * 4; + const len = feat.len + model.len + 8 * 4 + ctcFstDecoder.len; const ptr = Module._malloc(len); let offset = 0; @@ -243,8 +262,11 @@ function initSherpaOnnxOnlineRecognizerConfig(config, Module) { Module.setValue(ptr + offset, 
config.hotwordsScore, 'float'); offset += 4; + Module._CopyHeap(ctcFstDecoder.ptr, ctcFstDecoder.len, ptr + offset); + return { - buffer: buffer, ptr: ptr, len: len, feat: feat, model: model + buffer: buffer, ptr: ptr, len: len, feat: feat, model: model, + ctcFstDecoder: ctcFstDecoder } } @@ -313,6 +335,10 @@ function createOnlineRecognizer(Module, myConfig) { rule3MinUtteranceLength: 20, hotwordsFile: '', hotwordsScore: 1.5, + ctcFstDecoderConfig: { + graph: '', + maxActive: 3000, + } }; if (myConfig) { recognizerConfig = myConfig; diff --git a/wasm/asr/sherpa-onnx-wasm-main-asr.cc b/wasm/asr/sherpa-onnx-wasm-main-asr.cc index 951391e1..70d13f1c 100644 --- a/wasm/asr/sherpa-onnx-wasm-main-asr.cc +++ b/wasm/asr/sherpa-onnx-wasm-main-asr.cc @@ -22,9 +22,11 @@ static_assert(sizeof(SherpaOnnxOnlineModelConfig) == sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) + 5 * 4, ""); static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); +static_assert(sizeof(SherpaOnnxOnlineCtcFstDecoderConfig) == 2 * 4, ""); static_assert(sizeof(SherpaOnnxOnlineRecognizerConfig) == sizeof(SherpaOnnxFeatureConfig) + - sizeof(SherpaOnnxOnlineModelConfig) + 8 * 4, + sizeof(SherpaOnnxOnlineModelConfig) + 8 * 4 + + sizeof(SherpaOnnxOnlineCtcFstDecoderConfig), ""); void MyPrint(SherpaOnnxOnlineRecognizerConfig *config) { @@ -67,6 +69,11 @@ void MyPrint(SherpaOnnxOnlineRecognizerConfig *config) { config->rule3_min_utterance_length); fprintf(stdout, "hotwords_file: %s\n", config->hotwords_file); fprintf(stdout, "hotwords_score: %.2f\n", config->hotwords_score); + + fprintf(stdout, "----------ctc fst decoder config----------\n"); + fprintf(stdout, "graph: %s\n", config->ctc_fst_decoder_config.graph); + fprintf(stdout, "max_active: %d\n", + config->ctc_fst_decoder_config.max_active); } void CopyHeap(const char *src, int32_t num_bytes, char *dst) {