Add C API for streaming HLG decoding (#734)
This commit is contained in:
5
.github/scripts/test-dot-net.sh
vendored
5
.github/scripts/test-dot-net.sh
vendored
@@ -2,7 +2,10 @@
|
||||
|
||||
cd dotnet-examples/
|
||||
|
||||
cd spoken-language-identification
|
||||
cd streaming-hlg-decoding/
|
||||
./run.sh
|
||||
|
||||
cd ../spoken-language-identification
|
||||
./run.sh
|
||||
|
||||
cd ../online-decode-files
|
||||
|
||||
7
.github/scripts/test-nodejs-npm.sh
vendored
7
.github/scripts/test-nodejs-npm.sh
vendored
@@ -58,6 +58,13 @@ rm sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
|
||||
node ./test-online-zipformer2-ctc.js
|
||||
rm -rf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13
|
||||
|
||||
|
||||
curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
node ./test-online-zipformer2-ctc-hlg.js
|
||||
rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18
|
||||
|
||||
# offline tts
|
||||
|
||||
curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
|
||||
|
||||
5
.github/scripts/test-swift.sh
vendored
5
.github/scripts/test-swift.sh
vendored
@@ -7,6 +7,10 @@ echo "pwd: $PWD"
|
||||
cd swift-api-examples
|
||||
ls -lh
|
||||
|
||||
./run-streaming-hlg-decode-file.sh
|
||||
rm ./streaming-hlg-decode-file
|
||||
rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18
|
||||
|
||||
./run-spoken-language-identification.sh
|
||||
rm -rf sherpa-onnx-whisper*
|
||||
|
||||
@@ -31,4 +35,5 @@ sed -i.bak '20d' ./decode-file.swift
|
||||
|
||||
./run-decode-file-non-streaming.sh
|
||||
|
||||
|
||||
ls -lh
|
||||
|
||||
1
.github/workflows/test-dot-net.yaml
vendored
1
.github/workflows/test-dot-net.yaml
vendored
@@ -178,6 +178,7 @@ jobs:
|
||||
cp -v scripts/dotnet/examples/online-decode-files.csproj dotnet-examples/online-decode-files/
|
||||
cp -v scripts/dotnet/examples/speech-recognition-from-microphone.csproj dotnet-examples/speech-recognition-from-microphone/
|
||||
cp -v scripts/dotnet/examples/spoken-language-identification.csproj dotnet-examples/spoken-language-identification/
|
||||
cp -v scripts/dotnet/examples/streaming-hlg-decoding.csproj dotnet-examples/streaming-hlg-decoding
|
||||
|
||||
ls -lh /tmp
|
||||
|
||||
|
||||
67
.github/workflows/test-go-package.yaml
vendored
67
.github/workflows/test-go-package.yaml
vendored
@@ -66,12 +66,77 @@ jobs:
|
||||
run: |
|
||||
gcc --version
|
||||
|
||||
- name: Test speaker identification
|
||||
- name: Test streaming HLG decoding (Linux/macOS)
|
||||
if: matrix.os != 'windows-latest'
|
||||
shell: bash
|
||||
run: |
|
||||
cd go-api-examples/streaming-hlg-decoding/
|
||||
./run.sh
|
||||
|
||||
- name: Test speaker identification (Linux/macOS)
|
||||
if: matrix.os != 'windows-latest'
|
||||
shell: bash
|
||||
run: |
|
||||
cd go-api-examples/speaker-identification
|
||||
./run.sh
|
||||
|
||||
- name: Test speaker identification (Win64)
|
||||
if: matrix.os == 'windows-latest' && matrix.arch == 'x64'
|
||||
shell: bash
|
||||
run: |
|
||||
cd go-api-examples/speaker-identification
|
||||
go mod tidy
|
||||
cat go.mod
|
||||
go build
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx
|
||||
git clone https://github.com/csukuangfj/sr-data
|
||||
ls -lh
|
||||
echo $PWD
|
||||
ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/
|
||||
ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/*
|
||||
cp -v /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/sherpa-onnx-go-windows*/lib/x86_64-pc-windows-gnu/*.dll .
|
||||
ls -lh
|
||||
go mod tidy
|
||||
go build
|
||||
go run ./main.go
|
||||
|
||||
- name: Test speaker identification (Win32)
|
||||
if: matrix.os == 'windows-latest' && matrix.arch == 'x86'
|
||||
shell: bash
|
||||
run: |
|
||||
cd go-api-examples/speaker-identification
|
||||
go mod tidy
|
||||
cat go.mod
|
||||
ls -lh
|
||||
|
||||
go env GOARCH
|
||||
go env
|
||||
echo "------------------------------"
|
||||
go env -w GOARCH=386
|
||||
go env -w CGO_ENABLED=1
|
||||
go env
|
||||
|
||||
go clean
|
||||
go build
|
||||
|
||||
echo $PWD
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx
|
||||
git clone https://github.com/csukuangfj/sr-data
|
||||
ls -lh
|
||||
echo $PWD
|
||||
ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/
|
||||
ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/*
|
||||
cp -v /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/sherpa-onnx-go-windows*/lib/i686-pc-windows-gnu/*.dll .
|
||||
ls -lh
|
||||
go mod tidy
|
||||
go build
|
||||
go run ./main.go
|
||||
|
||||
rm -rf sr-data
|
||||
rm -rf *.onnx
|
||||
|
||||
- name: Test non-streaming TTS (Linux/macOS)
|
||||
if: matrix.os != 'windows-latest'
|
||||
shell: bash
|
||||
|
||||
6
.github/workflows/test-go.yaml
vendored
6
.github/workflows/test-go.yaml
vendored
@@ -74,6 +74,12 @@ jobs:
|
||||
go mod tidy
|
||||
go build
|
||||
|
||||
- name: Test streaming HLG decoding
|
||||
shell: bash
|
||||
run: |
|
||||
cd scripts/go/_internal/streaming-hlg-decoding/
|
||||
./run.sh
|
||||
|
||||
- name: Test speaker identification
|
||||
shell: bash
|
||||
run: |
|
||||
|
||||
@@ -15,6 +15,9 @@ target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api)
|
||||
add_executable(speaker-identification-c-api speaker-identification-c-api.c)
|
||||
target_link_libraries(speaker-identification-c-api sherpa-onnx-c-api)
|
||||
|
||||
add_executable(streaming-hlg-decode-file-c-api streaming-hlg-decode-file-c-api.c)
|
||||
target_link_libraries(streaming-hlg-decode-file-c-api sherpa-onnx-c-api)
|
||||
|
||||
if(SHERPA_ONNX_HAS_ALSA)
|
||||
add_subdirectory(./asr-microphone-example)
|
||||
elseif((UNIX AND NOT APPLE) OR LINUX)
|
||||
|
||||
130
c-api-examples/streaming-hlg-decode-file-c-api.c
Normal file
130
c-api-examples/streaming-hlg-decode-file-c-api.c
Normal file
@@ -0,0 +1,130 @@
|
||||
// c-api-examples/streaming-hlg-decode-file-c-api.c
|
||||
//
|
||||
// Copyright (c) 2024 Xiaomi Corporation
|
||||
/*
|
||||
We use the following model as an example
|
||||
|
||||
// clang-format off
|
||||
|
||||
Download the model from
|
||||
https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
|
||||
tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
|
||||
build/bin/streaming-hlg-decode-file-c-api
|
||||
|
||||
(The above model is from https://github.com/k2-fsa/icefall/pull/1557)
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "sherpa-onnx/c-api/c-api.h"
|
||||
|
||||
int32_t main() {
|
||||
// clang-format off
|
||||
//
|
||||
// Please download the model from
|
||||
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
const char *model = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx";
|
||||
const char *tokens = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt";
|
||||
const char *graph = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst";
|
||||
const char *wav_filename = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav";
|
||||
// clang-format on
|
||||
|
||||
SherpaOnnxOnlineRecognizerConfig config;
|
||||
|
||||
memset(&config, 0, sizeof(config));
|
||||
config.feat_config.sample_rate = 16000;
|
||||
config.feat_config.feature_dim = 80;
|
||||
config.model_config.zipformer2_ctc.model = model;
|
||||
config.model_config.tokens = tokens;
|
||||
config.model_config.num_threads = 1;
|
||||
config.model_config.provider = "cpu";
|
||||
config.model_config.debug = 0;
|
||||
config.ctc_fst_decoder_config.graph = graph;
|
||||
const SherpaOnnxOnlineRecognizer *recognizer =
|
||||
CreateOnlineRecognizer(&config);
|
||||
if (!recognizer) {
|
||||
fprintf(stderr, "Failed to create recognizer");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
const SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer);
|
||||
|
||||
const SherpaOnnxDisplay *display = CreateDisplay(50);
|
||||
int32_t segment_id = 0;
|
||||
|
||||
const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
|
||||
if (wave == NULL) {
|
||||
fprintf(stderr, "Failed to read %s\n", wav_filename);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// simulate streaming. You can choose an arbitrary N
|
||||
#define N 3200
|
||||
|
||||
int16_t buffer[N];
|
||||
float samples[N];
|
||||
fprintf(stderr, "sample rate: %d, num samples: %d, duration: %.2f s\n",
|
||||
wave->sample_rate, wave->num_samples,
|
||||
(float)wave->num_samples / wave->sample_rate);
|
||||
|
||||
int32_t k = 0;
|
||||
while (k < wave->num_samples) {
|
||||
int32_t start = k;
|
||||
int32_t end =
|
||||
(start + N > wave->num_samples) ? wave->num_samples : (start + N);
|
||||
k += N;
|
||||
|
||||
AcceptWaveform(stream, wave->sample_rate, wave->samples + start,
|
||||
end - start);
|
||||
while (IsOnlineStreamReady(recognizer, stream)) {
|
||||
DecodeOnlineStream(recognizer, stream);
|
||||
}
|
||||
|
||||
const SherpaOnnxOnlineRecognizerResult *r =
|
||||
GetOnlineStreamResult(recognizer, stream);
|
||||
|
||||
if (strlen(r->text)) {
|
||||
SherpaOnnxPrint(display, segment_id, r->text);
|
||||
}
|
||||
|
||||
if (IsEndpoint(recognizer, stream)) {
|
||||
if (strlen(r->text)) {
|
||||
++segment_id;
|
||||
}
|
||||
Reset(recognizer, stream);
|
||||
}
|
||||
|
||||
DestroyOnlineRecognizerResult(r);
|
||||
}
|
||||
|
||||
// add some tail padding
|
||||
float tail_paddings[4800] = {0}; // 0.3 seconds at 16 kHz sample rate
|
||||
AcceptWaveform(stream, wave->sample_rate, tail_paddings, 4800);
|
||||
|
||||
SherpaOnnxFreeWave(wave);
|
||||
|
||||
InputFinished(stream);
|
||||
while (IsOnlineStreamReady(recognizer, stream)) {
|
||||
DecodeOnlineStream(recognizer, stream);
|
||||
}
|
||||
|
||||
const SherpaOnnxOnlineRecognizerResult *r =
|
||||
GetOnlineStreamResult(recognizer, stream);
|
||||
|
||||
if (strlen(r->text)) {
|
||||
SherpaOnnxPrint(display, segment_id, r->text);
|
||||
}
|
||||
|
||||
DestroyOnlineRecognizerResult(r);
|
||||
|
||||
DestroyDisplay(display);
|
||||
DestroyOnlineStream(stream);
|
||||
DestroyOnlineRecognizer(recognizer);
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -5,7 +5,7 @@ function(download_onnxruntime)
|
||||
message(STATUS "CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}")
|
||||
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
|
||||
if(SHERPA_ONNX_ENABLE_WASM)
|
||||
include(onnxruntime-wasm-simd)
|
||||
include(onnxruntime-wasm-simd)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL riscv64)
|
||||
if(BUILD_SHARED_LIBS)
|
||||
include(onnxruntime-linux-riscv64)
|
||||
|
||||
@@ -15,6 +15,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-tts-play", "offline
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "spoken-language-identification", "spoken-language-identification\spoken-language-identification.csproj", "{3D7CF3D6-AC45-4D50-9619-5687B1443E94}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "streaming-hlg-decoding", "streaming-hlg-decoding\streaming-hlg-decoding.csproj", "{C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Any CPU = Debug|Any CPU
|
||||
@@ -48,5 +50,9 @@ Global
|
||||
{3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
EndGlobalSection
|
||||
EndGlobal
|
||||
|
||||
66
dotnet-examples/streaming-hlg-decoding/Program.cs
Normal file
66
dotnet-examples/streaming-hlg-decoding/Program.cs
Normal file
@@ -0,0 +1,66 @@
|
||||
// Copyright (c) 2024 Xiaomi Corporation
|
||||
//
|
||||
// This file shows how to do streaming HLG decoding.
|
||||
//
|
||||
// 1. Download the model for testing
|
||||
//
|
||||
// curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
// tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
// rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
//
|
||||
// 2. Now run it
|
||||
//
|
||||
// dotnet run
|
||||
|
||||
using SherpaOnnx;
|
||||
using System.Collections.Generic;
|
||||
using System;
|
||||
|
||||
/// <summary>
/// Decodes a test wave file with a streaming zipformer2 CTC model and an HLG
/// graph, then prints the recognized text, tokens and timestamps.
/// </summary>
class StreamingHlgDecodingDemo
{
    static void Main(string[] args)
    {
        var config = new OnlineRecognizerConfig();

        // Feature extraction settings.
        config.FeatConfig.SampleRate = 16000;
        config.FeatConfig.FeatureDim = 80;

        // Model files and runtime options.
        config.ModelConfig.Zipformer2Ctc.Model = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx";
        config.ModelConfig.Tokens = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt";
        config.ModelConfig.Provider = "cpu";
        config.ModelConfig.NumThreads = 1;
        config.ModelConfig.Debug = 0;

        // Decoding graph.
        config.CtcFstDecoderConfig.Graph = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst";

        var recognizer = new OnlineRecognizer(config);

        var filename = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav";
        var waveReader = new WaveReader(filename);

        var stream = recognizer.CreateStream();
        stream.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);

        // Append 0.3 seconds of zeros after the real audio.
        var tailPadding = new float[(int)(waveReader.SampleRate * 0.3)];
        stream.AcceptWaveform(waveReader.SampleRate, tailPadding);
        stream.InputFinished();

        while (recognizer.IsReady(stream))
        {
            recognizer.Decode(stream);
        }

        var result = recognizer.GetResult(stream);
        var recognizedText = result.Text;
        var recognizedTokens = result.Tokens;

        Console.WriteLine("--------------------");
        Console.WriteLine(filename);
        Console.WriteLine("text: {0}", recognizedText);
        Console.WriteLine("tokens: [{0}]", string.Join(", ", recognizedTokens));

        Console.Write("timestamps: [");
        foreach (var timestamp in result.Timestamps)
        {
            Console.Write(String.Format("{0:0.00}", timestamp) + ", ");
        }
        Console.WriteLine("]");

        Console.WriteLine("--------------------");
    }
}
|
||||
|
||||
|
||||
1
dotnet-examples/streaming-hlg-decoding/WaveReader.cs
Symbolic link
1
dotnet-examples/streaming-hlg-decoding/WaveReader.cs
Symbolic link
@@ -0,0 +1 @@
|
||||
../online-decode-files/WaveReader.cs
|
||||
11
dotnet-examples/streaming-hlg-decoding/run.sh
Executable file
11
dotnet-examples/streaming-hlg-decoding/run.sh
Executable file
@@ -0,0 +1,11 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -ex
|
||||
|
||||
if [ ! -f ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst ]; then
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
fi
|
||||
|
||||
dotnet run -c Release
|
||||
@@ -0,0 +1,15 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<RootNamespace>streaming_hlg_decoding</RootNamespace>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
3
go-api-examples/streaming-hlg-decoding/go.mod
Normal file
3
go-api-examples/streaming-hlg-decoding/go.mod
Normal file
@@ -0,0 +1,3 @@
|
||||
module streaming-hlg-decoding
|
||||
|
||||
go 1.12
|
||||
109
go-api-examples/streaming-hlg-decoding/main.go
Normal file
109
go-api-examples/streaming-hlg-decoding/main.go
Normal file
@@ -0,0 +1,109 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
|
||||
"github.com/youpy/go-wav"
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func main() {
|
||||
log.SetFlags(log.LstdFlags | log.Lmicroseconds)
|
||||
|
||||
config := sherpa.OnlineRecognizerConfig{}
|
||||
config.FeatConfig = sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80}
|
||||
|
||||
// please download model files from
|
||||
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
config.ModelConfig.Zipformer2Ctc.Model = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx"
|
||||
config.ModelConfig.Tokens = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt"
|
||||
|
||||
config.ModelConfig.NumThreads = 1
|
||||
config.ModelConfig.Debug = 0
|
||||
config.ModelConfig.Provider = "cpu"
|
||||
config.CtcFstDecoderConfig.Graph = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst"
|
||||
|
||||
wav_filename := "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav"
|
||||
|
||||
samples, sampleRate := readWave(wav_filename)
|
||||
|
||||
log.Println("Initializing recognizer (may take several seconds)")
|
||||
recognizer := sherpa.NewOnlineRecognizer(&config)
|
||||
log.Println("Recognizer created!")
|
||||
defer sherpa.DeleteOnlineRecognizer(recognizer)
|
||||
|
||||
log.Println("Start decoding!")
|
||||
stream := sherpa.NewOnlineStream(recognizer)
|
||||
defer sherpa.DeleteOnlineStream(stream)
|
||||
|
||||
stream.AcceptWaveform(sampleRate, samples)
|
||||
|
||||
tailPadding := make([]float32, int(float32(sampleRate)*0.3))
|
||||
stream.AcceptWaveform(sampleRate, tailPadding)
|
||||
|
||||
for recognizer.IsReady(stream) {
|
||||
recognizer.Decode(stream)
|
||||
}
|
||||
log.Println("Decoding done!")
|
||||
result := recognizer.GetResult(stream)
|
||||
log.Println(strings.ToLower(result.Text))
|
||||
log.Printf("Wave duration: %v seconds", float32(len(samples))/float32(sampleRate))
|
||||
}
|
||||
|
||||
func readWave(filename string) (samples []float32, sampleRate int) {
|
||||
file, _ := os.Open(filename)
|
||||
defer file.Close()
|
||||
|
||||
reader := wav.NewReader(file)
|
||||
format, err := reader.Format()
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to read wave format")
|
||||
}
|
||||
|
||||
if format.AudioFormat != 1 {
|
||||
log.Fatalf("Support only PCM format. Given: %v\n", format.AudioFormat)
|
||||
}
|
||||
|
||||
if format.NumChannels != 1 {
|
||||
log.Fatalf("Support only 1 channel wave file. Given: %v\n", format.NumChannels)
|
||||
}
|
||||
|
||||
if format.BitsPerSample != 16 {
|
||||
log.Fatalf("Support only 16-bit per sample. Given: %v\n", format.BitsPerSample)
|
||||
}
|
||||
|
||||
reader.Duration() // so that it initializes reader.Size
|
||||
|
||||
buf := make([]byte, reader.Size)
|
||||
n, err := reader.Read(buf)
|
||||
if n != int(reader.Size) {
|
||||
log.Fatalf("Failed to read %v bytes. Returned %v bytes\n", reader.Size, n)
|
||||
}
|
||||
|
||||
samples = samplesInt16ToFloat(buf)
|
||||
sampleRate = int(format.SampleRate)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// samplesInt16ToFloat interprets inSamples as consecutive little-endian signed
// 16-bit values and returns each one divided by 32768 as a float32.
// A trailing odd byte, if present, is ignored.
func samplesInt16ToFloat(inSamples []byte) []float32 {
	count := len(inSamples) / 2

	// Decode every 16-bit value from a single reader in one pass.
	pcm := make([]int16, count)
	src := bytes.NewReader(inSamples[:count*2])
	if err := binary.Read(src, binary.LittleEndian, pcm); err != nil {
		log.Fatal("Failed to parse 16-bit sample")
	}

	converted := make([]float32, count)
	for idx, value := range pcm {
		converted[idx] = float32(value) / 32768
	}

	return converted
}
|
||||
14
go-api-examples/streaming-hlg-decoding/run.sh
Executable file
14
go-api-examples/streaming-hlg-decoding/run.sh
Executable file
@@ -0,0 +1,14 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -ex
|
||||
|
||||
if [ ! -f ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst ]; then
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
fi
|
||||
|
||||
go mod tidy
|
||||
go build
|
||||
ls -lh
|
||||
./streaming-hlg-decoding
|
||||
@@ -174,3 +174,16 @@ wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherp
|
||||
tar xvf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
|
||||
node ./test-online-zipformer2-ctc.js
|
||||
```
|
||||
|
||||
## ./test-online-zipformer2-ctc-hlg.js
|
||||
[./test-online-zipformer2-ctc-hlg.js](./test-online-zipformer2-ctc-hlg.js) demonstrates
|
||||
how to decode a file using a streaming zipformer2 CTC model with HLG. In the code
|
||||
we use [sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18](https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2).
|
||||
|
||||
You can use the following command to run it:
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
node ./test-online-zipformer2-ctc-hlg.js
|
||||
```
|
||||
|
||||
@@ -50,6 +50,10 @@ function createOnlineRecognizer() {
|
||||
rule3MinUtteranceLength: 20,
|
||||
hotwordsFile: '',
|
||||
hotwordsScore: 1.5,
|
||||
ctcFstDecoderConfig: {
|
||||
graph: '',
|
||||
maxActive: 3000,
|
||||
}
|
||||
};
|
||||
|
||||
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
|
||||
|
||||
@@ -51,6 +51,10 @@ function createOnlineRecognizer() {
|
||||
rule3MinUtteranceLength: 20,
|
||||
hotwordsFile: '',
|
||||
hotwordsScore: 1.5,
|
||||
ctcFstDecoderConfig: {
|
||||
graph: '',
|
||||
maxActive: 3000,
|
||||
}
|
||||
};
|
||||
|
||||
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
|
||||
|
||||
@@ -52,6 +52,10 @@ function createOnlineRecognizer() {
|
||||
rule3MinUtteranceLength: 20,
|
||||
hotwordsFile: '',
|
||||
hotwordsScore: 1.5,
|
||||
ctcFstDecoderConfig: {
|
||||
graph: '',
|
||||
maxActive: 3000,
|
||||
}
|
||||
};
|
||||
|
||||
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
|
||||
|
||||
@@ -53,6 +53,10 @@ function createOnlineRecognizer() {
|
||||
rule3MinUtteranceLength: 20,
|
||||
hotwordsFile: '',
|
||||
hotwordsScore: 1.5,
|
||||
ctcFstDecoderConfig: {
|
||||
graph: '',
|
||||
maxActive: 3000,
|
||||
}
|
||||
};
|
||||
|
||||
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
|
||||
|
||||
125
nodejs-examples/test-online-zipformer2-ctc-hlg.js
Normal file
125
nodejs-examples/test-online-zipformer2-ctc-hlg.js
Normal file
@@ -0,0 +1,125 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
const fs = require('fs');
|
||||
const {Readable} = require('stream');
|
||||
const wav = require('wav');
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
/**
 * Build an online recognizer that uses a streaming zipformer2 CTC model
 * together with an HLG decoding graph.
 *
 * The transducer and paraformer sections are left empty because only the
 * zipformer2 CTC model is used here.
 *
 * @returns the recognizer created by sherpa_onnx.createOnlineRecognizer.
 */
function createOnlineRecognizer() {
  const modelDir = './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18';

  const recognizerConfig = {
    featConfig: {
      sampleRate: 16000,
      featureDim: 80,
    },
    modelConfig: {
      transducer: {
        encoder: '',
        decoder: '',
        joiner: '',
      },
      paraformer: {
        encoder: '',
        decoder: '',
      },
      zipformer2Ctc: {
        model: `${modelDir}/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx`,
      },
      tokens: `${modelDir}/tokens.txt`,
      numThreads: 1,
      provider: 'cpu',
      debug: 0,
      modelType: '',
    },
    decodingMethod: 'greedy_search',
    maxActivePaths: 4,
    enableEndpoint: 1,
    rule1MinTrailingSilence: 2.4,
    rule2MinTrailingSilence: 1.2,
    rule3MinUtteranceLength: 20,
    hotwordsFile: '',
    hotwordsScore: 1.5,
    ctcFstDecoderConfig: {
      graph: `${modelDir}/HLG.fst`,
      maxActive: 3000,
    },
  };

  return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
}
|
||||
|
||||
const recognizer = createOnlineRecognizer();
|
||||
const stream = recognizer.createStream();
|
||||
|
||||
const waveFilename =
|
||||
'./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav';
|
||||
|
||||
const reader = new wav.Reader();
|
||||
const readable = new Readable().wrap(reader);
|
||||
|
||||
/**
 * Feed one chunk of samples to the stream, decode for as long as the
 * recognizer reports the stream as ready, and print the current result.
 *
 * @param samples Float32Array of audio samples at gSampleRate.
 */
function decode(samples) {
  stream.acceptWaveform(gSampleRate, samples);

  for (; recognizer.isReady(stream);) {
    recognizer.decode(stream);
  }

  console.log(recognizer.getResult(stream));
}
|
||||
|
||||
let gSampleRate = 16000;
|
||||
|
||||
// Validate the wave header as soon as it is parsed and remember the sample
// rate so that decode() passes the file's real rate to the stream.
// Only PCM (audio format 1), single-channel, 16-bit input is accepted.
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
  gSampleRate = sampleRate;

  if (audioFormat != 1) {
    throw new Error(`Only support PCM format. Given ${audioFormat}`);
  }

  if (channels != 1) {
    // Report the destructured `channels` value; the previous `channel` was an
    // undefined identifier and raised a ReferenceError instead of this error.
    throw new Error(`Only a single channel. Given ${channels}`);
  }

  if (bitDepth != 16) {
    throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
  }
});
|
||||
|
||||
fs.createReadStream(waveFilename, {'highWaterMark': 4096})
|
||||
.pipe(reader)
|
||||
.on('finish', function(err) {
|
||||
// tail padding
|
||||
const floatSamples =
|
||||
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
|
||||
decode(floatSamples);
|
||||
stream.free();
|
||||
recognizer.free();
|
||||
});
|
||||
|
||||
readable.on('readable', function() {
|
||||
let chunk;
|
||||
while ((chunk = readable.read()) != null) {
|
||||
const int16Samples = new Int16Array(
|
||||
chunk.buffer, chunk.byteOffset,
|
||||
chunk.length / Int16Array.BYTES_PER_ELEMENT);
|
||||
|
||||
const floatSamples = new Float32Array(int16Samples.length);
|
||||
|
||||
for (let i = 0; i < floatSamples.length; i++) {
|
||||
floatSamples[i] = int16Samples[i] / 32768.0;
|
||||
}
|
||||
|
||||
decode(floatSamples);
|
||||
}
|
||||
});
|
||||
@@ -51,6 +51,10 @@ function createOnlineRecognizer() {
|
||||
rule3MinUtteranceLength: 20,
|
||||
hotwordsFile: '',
|
||||
hotwordsScore: 1.5,
|
||||
ctcFstDecoderConfig: {
|
||||
graph: '',
|
||||
maxActive: 3000,
|
||||
}
|
||||
};
|
||||
|
||||
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
|
||||
|
||||
19
scripts/dotnet/examples/streaming-hlg-decoding.csproj
Normal file
19
scripts/dotnet/examples/streaming-hlg-decoding.csproj
Normal file
@@ -0,0 +1,19 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<RootNamespace>streaming_hlg_decoding</RootNamespace>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
</PropertyGroup>
|
||||
|
||||
<PropertyGroup>
|
||||
<RestoreSources>/tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json</RestoreSources>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
@@ -116,6 +116,21 @@ namespace SherpaOnnx
|
||||
public int FeatureDim;
|
||||
}
|
||||
|
||||
[StructLayout(LayoutKind.Sequential)]
|
||||
public struct OnlineCtcFstDecoderConfig
|
||||
{
|
||||
public OnlineCtcFstDecoderConfig()
|
||||
{
|
||||
Graph = "";
|
||||
MaxActive = 3000;
|
||||
}
|
||||
|
||||
[MarshalAs(UnmanagedType.LPStr)]
|
||||
public string Graph;
|
||||
|
||||
public int MaxActive;
|
||||
}
|
||||
|
||||
[StructLayout(LayoutKind.Sequential)]
|
||||
public struct OnlineRecognizerConfig
|
||||
{
|
||||
@@ -131,6 +146,7 @@ namespace SherpaOnnx
|
||||
Rule3MinUtteranceLength = 20.0F;
|
||||
HotwordsFile = "";
|
||||
HotwordsScore = 1.5F;
|
||||
CtcFstDecoderConfig = new OnlineCtcFstDecoderConfig();
|
||||
}
|
||||
public FeatureConfig FeatConfig;
|
||||
public OnlineModelConfig ModelConfig;
|
||||
@@ -167,6 +183,8 @@ namespace SherpaOnnx
|
||||
|
||||
/// Bonus score for each token in hotwords.
|
||||
public float HotwordsScore;
|
||||
|
||||
public OnlineCtcFstDecoderConfig CtcFstDecoderConfig;
|
||||
}
|
||||
|
||||
public class OnlineRecognizerResult
|
||||
|
||||
1
scripts/go/_internal/streaming-hlg-decoding/.gitignore
vendored
Normal file
1
scripts/go/_internal/streaming-hlg-decoding/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
streaming-hlg-decoding
|
||||
5
scripts/go/_internal/streaming-hlg-decoding/go.mod
Normal file
5
scripts/go/_internal/streaming-hlg-decoding/go.mod
Normal file
@@ -0,0 +1,5 @@
|
||||
module streaming-hlg-decoding
|
||||
|
||||
go 1.12
|
||||
|
||||
replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../
|
||||
1
scripts/go/_internal/streaming-hlg-decoding/main.go
Symbolic link
1
scripts/go/_internal/streaming-hlg-decoding/main.go
Symbolic link
@@ -0,0 +1 @@
|
||||
../../../../go-api-examples/streaming-hlg-decoding/main.go
|
||||
1
scripts/go/_internal/streaming-hlg-decoding/run.sh
Symbolic link
1
scripts/go/_internal/streaming-hlg-decoding/run.sh
Symbolic link
@@ -0,0 +1 @@
|
||||
../../../../go-api-examples/streaming-hlg-decoding/run.sh
|
||||
@@ -99,6 +99,11 @@ type FeatureConfig struct {
|
||||
FeatureDim int
|
||||
}
|
||||
|
||||
type OnlineCtcFstDecoderConfig struct {
|
||||
Graph string
|
||||
MaxActive int
|
||||
}
|
||||
|
||||
// Configuration for the online/streaming recognizer.
|
||||
type OnlineRecognizerConfig struct {
|
||||
FeatConfig FeatureConfig
|
||||
@@ -120,6 +125,7 @@ type OnlineRecognizerConfig struct {
|
||||
Rule1MinTrailingSilence float32
|
||||
Rule2MinTrailingSilence float32
|
||||
Rule3MinUtteranceLength float32
|
||||
CtcFstDecoderConfig OnlineCtcFstDecoderConfig
|
||||
}
|
||||
|
||||
// It contains the recognition result for an online stream.
|
||||
@@ -190,6 +196,10 @@ func NewOnlineRecognizer(config *OnlineRecognizerConfig) *OnlineRecognizer {
|
||||
c.rule2_min_trailing_silence = C.float(config.Rule2MinTrailingSilence)
|
||||
c.rule3_min_utterance_length = C.float(config.Rule3MinUtteranceLength)
|
||||
|
||||
c.ctc_fst_decoder_config.graph = C.CString(config.CtcFstDecoderConfig.Graph)
|
||||
defer C.free(unsafe.Pointer(c.ctc_fst_decoder_config.graph))
|
||||
c.ctc_fst_decoder_config.max_active = C.int(config.CtcFstDecoderConfig.MaxActive)
|
||||
|
||||
recognizer := &OnlineRecognizer{}
|
||||
recognizer.impl = C.CreateOnlineRecognizer(&c)
|
||||
|
||||
|
||||
@@ -99,6 +99,11 @@ SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer(
|
||||
recognizer_config.hotwords_score =
|
||||
SHERPA_ONNX_OR(config->hotwords_score, 1.5);
|
||||
|
||||
recognizer_config.ctc_fst_decoder_config.graph =
|
||||
SHERPA_ONNX_OR(config->ctc_fst_decoder_config.graph, "");
|
||||
recognizer_config.ctc_fst_decoder_config.max_active =
|
||||
SHERPA_ONNX_OR(config->ctc_fst_decoder_config.max_active, 3000);
|
||||
|
||||
if (config->model_config.debug) {
|
||||
SHERPA_ONNX_LOGE("%s\n", recognizer_config.ToString().c_str());
|
||||
}
|
||||
|
||||
@@ -96,6 +96,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxFeatureConfig {
|
||||
int32_t feature_dim;
|
||||
} SherpaOnnxFeatureConfig;
|
||||
|
||||
/// Settings of the CTC FST decoder used by the online recognizer.
SHERPA_ONNX_API typedef struct SherpaOnnxOnlineCtcFstDecoderConfig {
  /// Decoding graph to use.
  /// NOTE(review): presumably a path to a graph file such as HLG.fst;
  /// confirm how a NULL or empty value is treated by the recognizer.
  const char *graph;

  /// Value forwarded to the recognizer's ctc_fst_decoder_config.max_active.
  int32_t max_active;
} SherpaOnnxOnlineCtcFstDecoderConfig;
|
||||
|
||||
SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerConfig {
|
||||
SherpaOnnxFeatureConfig feat_config;
|
||||
SherpaOnnxOnlineModelConfig model_config;
|
||||
@@ -131,6 +136,8 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerConfig {
|
||||
|
||||
/// Bonus score for each token in hotwords.
|
||||
float hotwords_score;
|
||||
|
||||
SherpaOnnxOnlineCtcFstDecoderConfig ctc_fst_decoder_config;
|
||||
} SherpaOnnxOnlineRecognizerConfig;
|
||||
|
||||
SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerResult {
|
||||
|
||||
1
swift-api-examples/.gitignore
vendored
1
swift-api-examples/.gitignore
vendored
@@ -7,3 +7,4 @@ vits-vctk
|
||||
sherpa-onnx-paraformer-zh-2023-09-14
|
||||
!*.sh
|
||||
*.bak
|
||||
streaming-hlg-decode-file
|
||||
|
||||
@@ -111,6 +111,15 @@ func sherpaOnnxFeatureConfig(
|
||||
feature_dim: Int32(featureDim))
|
||||
}
|
||||
|
||||
/// Builds the C-level `SherpaOnnxOnlineCtcFstDecoderConfig`.
///
/// - Parameters:
///   - graph: Value stored in the `graph` field after conversion with
///     `toCPointer`. Defaults to the empty string.
///   - maxActive: Value stored in the `max_active` field as `Int32`.
///     Defaults to 3000.
/// - Returns: The populated C struct.
func sherpaOnnxOnlineCtcFstDecoderConfig(
  graph: String = "",
  maxActive: Int = 3000
) -> SherpaOnnxOnlineCtcFstDecoderConfig {
  let graphPointer = toCPointer(graph)
  let activeLimit = Int32(maxActive)

  return SherpaOnnxOnlineCtcFstDecoderConfig(
    graph: graphPointer,
    max_active: activeLimit)
}
|
||||
|
||||
func sherpaOnnxOnlineRecognizerConfig(
|
||||
featConfig: SherpaOnnxFeatureConfig,
|
||||
modelConfig: SherpaOnnxOnlineModelConfig,
|
||||
@@ -121,7 +130,8 @@ func sherpaOnnxOnlineRecognizerConfig(
|
||||
decodingMethod: String = "greedy_search",
|
||||
maxActivePaths: Int = 4,
|
||||
hotwordsFile: String = "",
|
||||
hotwordsScore: Float = 1.5
|
||||
hotwordsScore: Float = 1.5,
|
||||
ctcFstDecoderConfig: SherpaOnnxOnlineCtcFstDecoderConfig = sherpaOnnxOnlineCtcFstDecoderConfig()
|
||||
) -> SherpaOnnxOnlineRecognizerConfig {
|
||||
return SherpaOnnxOnlineRecognizerConfig(
|
||||
feat_config: featConfig,
|
||||
@@ -133,7 +143,9 @@ func sherpaOnnxOnlineRecognizerConfig(
|
||||
rule2_min_trailing_silence: rule2MinTrailingSilence,
|
||||
rule3_min_utterance_length: rule3MinUtteranceLength,
|
||||
hotwords_file: toCPointer(hotwordsFile),
|
||||
hotwords_score: hotwordsScore)
|
||||
hotwords_score: hotwordsScore,
|
||||
ctc_fst_decoder_config: ctcFstDecoderConfig
|
||||
)
|
||||
}
|
||||
|
||||
/// Wrapper for recognition result.
|
||||
|
||||
36
swift-api-examples/run-streaming-hlg-decode-file.sh
Executable file
36
swift-api-examples/run-streaming-hlg-decode-file.sh
Executable file
@@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env bash
#
# Builds (when the binary is missing) and runs the Swift streaming HLG
# decoding example. Steps:
#   1. Require that ../build-swift-macos exists.
#   2. Download the test model if its HLG.fst is not present.
#   3. Compile streaming-hlg-decode-file.swift unless the binary exists.
#   4. Run the binary with the sherpa-onnx libraries on DYLD_LIBRARY_PATH.

set -ex

build_dir=../build-swift-macos
model_name=sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18
exe=./streaming-hlg-decode-file

if [ ! -d $build_dir ]; then
  echo "Please run ../build-swift-macos.sh first!"
  exit 1
fi

if [ ! -f ./$model_name/HLG.fst ]; then
  echo "Downloading the pre-trained model for testing."

  wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$model_name.tar.bz2
  tar xvf $model_name.tar.bz2
  rm $model_name.tar.bz2
fi

if [ -e $exe ]; then
  echo "./streaming-hlg-decode-file exists - skip building"
else
  # Note: We use -lc++ to link against libc++ instead of libstdc++
  swiftc \
    -lc++ \
    -I $build_dir/install/include \
    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
    ./streaming-hlg-decode-file.swift ./SherpaOnnx.swift \
    -L $build_dir/install/lib/ \
    -l sherpa-onnx \
    -l onnxruntime \
    -o streaming-hlg-decode-file

  strip $exe
fi

export DYLD_LIBRARY_PATH=$PWD/$build_dir/install/lib:$DYLD_LIBRARY_PATH
$exe
|
||||
79
swift-api-examples/streaming-hlg-decode-file.swift
Normal file
79
swift-api-examples/streaming-hlg-decode-file.swift
Normal file
@@ -0,0 +1,79 @@
|
||||
import AVFoundation
|
||||
|
||||
extension AudioBuffer {
  /// Copies the contents of this buffer into a Swift array of `Float`.
  ///
  /// NOTE(review): the buffer is viewed as `Float` elements, so this assumes
  /// it holds 32-bit float samples - confirm at the call site.
  func array() -> [Float] {
    let view: UnsafeBufferPointer<Float> = UnsafeBufferPointer(self)
    return [Float](view)
  }
}
|
||||
|
||||
extension AVAudioPCMBuffer {
  /// Returns the samples held in the `mBuffers` member of this PCM buffer's
  /// `audioBufferList`, converted with `AudioBuffer.array()`.
  func array() -> [Float] {
    let bufferList = audioBufferList.pointee
    return bufferList.mBuffers.array()
  }
}
|
||||
|
||||
/// Decodes one test wave file with a streaming zipformer2 CTC model and an
/// HLG graph, then prints the recognized text.
///
/// The whole file is fed to the recognizer in one call, followed by 3200
/// zero samples of tail padding, before decoding until the recognizer
/// reports it is no longer ready.
func run() {
  let filePath =
    "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav"
  let model =
    "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx"
  let tokens = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt"

  // Model: zipformer2 CTC weights plus the token table.
  let zipformer2Ctc = sherpaOnnxOnlineZipformer2CtcModelConfig(model: model)
  let modelConfig = sherpaOnnxOnlineModelConfig(
    tokens: tokens,
    zipformer2Ctc: zipformer2Ctc)

  let featConfig = sherpaOnnxFeatureConfig(sampleRate: 16000, featureDim: 80)

  // Decoder: use the HLG graph shipped with the model.
  let hlgDecoderConfig = sherpaOnnxOnlineCtcFstDecoderConfig(
    graph: "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst",
    maxActive: 3000)

  // `var` because the recognizer takes the configuration by reference.
  var config = sherpaOnnxOnlineRecognizerConfig(
    featConfig: featConfig,
    modelConfig: modelConfig,
    ctcFstDecoderConfig: hlgDecoderConfig)

  let recognizer = SherpaOnnxRecognizer(config: &config)

  // Read the complete wave file into memory.
  let waveURL = NSURL(fileURLWithPath: filePath) as URL
  let waveFile = try! AVAudioFile(forReading: waveURL)

  let format = waveFile.processingFormat
  assert(format.channelCount == 1)
  assert(format.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)

  let frameCount = UInt32(waveFile.length)
  let pcmBuffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount)

  try! waveFile.read(into: pcmBuffer!)
  let samples: [Float] = pcmBuffer!.array()

  // The file's own sample rate is passed along with the samples.
  let sampleRate = Int(format.sampleRate)
  recognizer.acceptWaveform(samples: samples, sampleRate: sampleRate)

  let tailPadding = [Float](repeating: 0.0, count: 3200)
  recognizer.acceptWaveform(samples: tailPadding, sampleRate: sampleRate)

  recognizer.inputFinished()
  while recognizer.isReady() {
    recognizer.decode()
  }

  let result = recognizer.getResult()
  print("\nresult is:\n\(result.text)")
}
|
||||
|
||||
/// Executable entry point of the streaming HLG decoding example.
@main
struct App {
  /// Runs the example once and returns.
  static func main() {
    run()
  }
}
|
||||
@@ -43,6 +43,10 @@ function freeConfig(config, Module) {
|
||||
freeConfig(config.lm, Module)
|
||||
}
|
||||
|
||||
if ('ctcFstDecoder' in config) {
|
||||
freeConfig(config.ctcFstDecoder, Module)
|
||||
}
|
||||
|
||||
Module._free(config.ptr);
|
||||
}
|
||||
|
||||
@@ -193,11 +197,26 @@ function initSherpaOnnxFeatureConfig(config, Module) {
|
||||
return {ptr: ptr, len: len};
|
||||
}
|
||||
|
||||
// Allocates a SherpaOnnxOnlineCtcFstDecoderConfig struct on the wasm heap.
//
// Layout (2 * 4 bytes, matching the static_assert on the C side):
//   offset 0: const char *graph
//   offset 4: int32_t max_active
//
// config may be undefined, or may omit `graph` / `maxActive`; the defaults
// '' and 3000 are used in that case so that recognizer configs created
// before this field existed keep working.
//
// Returns {ptr, len, buffer}: ptr is the struct, len its size in bytes and
// buffer the separately allocated UTF-8 copy of the graph string. The
// caller owns both allocations.
function initSherpaOnnxOnlineCtcFstDecoderConfig(config, Module) {
  const graph = (config && config.graph != null) ? config.graph : '';
  const maxActive =
      (config && config.maxActive != null) ? config.maxActive : 3000;

  const len = 2 * 4;
  const ptr = Module._malloc(len);

  // +1 for the terminating NUL byte written by stringToUTF8.
  const graphLen = Module.lengthBytesUTF8(graph) + 1;
  const buffer = Module._malloc(graphLen);
  Module.stringToUTF8(graph, buffer, graphLen);

  Module.setValue(ptr, buffer, 'i8*');
  Module.setValue(ptr + 4, maxActive, 'i32');

  return {ptr: ptr, len: len, buffer: buffer};
}
|
||||
|
||||
function initSherpaOnnxOnlineRecognizerConfig(config, Module) {
|
||||
const feat = initSherpaOnnxFeatureConfig(config.featConfig, Module);
|
||||
const model = initSherpaOnnxOnlineModelConfig(config.modelConfig, Module);
|
||||
const ctcFstDecoder = initSherpaOnnxOnlineCtcFstDecoderConfig(
|
||||
config.ctcFstDecoderConfig, Module)
|
||||
|
||||
const len = feat.len + model.len + 8 * 4;
|
||||
const len = feat.len + model.len + 8 * 4 + ctcFstDecoder.len;
|
||||
const ptr = Module._malloc(len);
|
||||
|
||||
let offset = 0;
|
||||
@@ -243,8 +262,11 @@ function initSherpaOnnxOnlineRecognizerConfig(config, Module) {
|
||||
Module.setValue(ptr + offset, config.hotwordsScore, 'float');
|
||||
offset += 4;
|
||||
|
||||
Module._CopyHeap(ctcFstDecoder.ptr, ctcFstDecoder.len, ptr + offset);
|
||||
|
||||
return {
|
||||
buffer: buffer, ptr: ptr, len: len, feat: feat, model: model
|
||||
buffer: buffer, ptr: ptr, len: len, feat: feat, model: model,
|
||||
ctcFstDecoder: ctcFstDecoder
|
||||
}
|
||||
}
|
||||
|
||||
@@ -313,6 +335,10 @@ function createOnlineRecognizer(Module, myConfig) {
|
||||
rule3MinUtteranceLength: 20,
|
||||
hotwordsFile: '',
|
||||
hotwordsScore: 1.5,
|
||||
ctcFstDecoderConfig: {
|
||||
graph: '',
|
||||
maxActive: 3000,
|
||||
}
|
||||
};
|
||||
if (myConfig) {
|
||||
recognizerConfig = myConfig;
|
||||
|
||||
@@ -22,9 +22,11 @@ static_assert(sizeof(SherpaOnnxOnlineModelConfig) ==
|
||||
sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) + 5 * 4,
|
||||
"");
|
||||
static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
|
||||
static_assert(sizeof(SherpaOnnxOnlineCtcFstDecoderConfig) == 2 * 4, "");
|
||||
static_assert(sizeof(SherpaOnnxOnlineRecognizerConfig) ==
|
||||
sizeof(SherpaOnnxFeatureConfig) +
|
||||
sizeof(SherpaOnnxOnlineModelConfig) + 8 * 4,
|
||||
sizeof(SherpaOnnxOnlineModelConfig) + 8 * 4 +
|
||||
sizeof(SherpaOnnxOnlineCtcFstDecoderConfig),
|
||||
"");
|
||||
|
||||
void MyPrint(SherpaOnnxOnlineRecognizerConfig *config) {
|
||||
@@ -67,6 +69,11 @@ void MyPrint(SherpaOnnxOnlineRecognizerConfig *config) {
|
||||
config->rule3_min_utterance_length);
|
||||
fprintf(stdout, "hotwords_file: %s\n", config->hotwords_file);
|
||||
fprintf(stdout, "hotwords_score: %.2f\n", config->hotwords_score);
|
||||
|
||||
fprintf(stdout, "----------ctc fst decoder config----------\n");
|
||||
fprintf(stdout, "graph: %s\n", config->ctc_fst_decoder_config.graph);
|
||||
fprintf(stdout, "max_active: %d\n",
|
||||
config->ctc_fst_decoder_config.max_active);
|
||||
}
|
||||
|
||||
void CopyHeap(const char *src, int32_t num_bytes, char *dst) {
|
||||
|
||||
Reference in New Issue
Block a user