Add C API for streaming HLG decoding (#734)

This commit is contained in:
Fangjun Kuang
2024-04-05 10:31:20 +08:00
committed by GitHub
parent db67e00c77
commit dbff2eaadb
39 changed files with 839 additions and 8 deletions

View File

@@ -7,3 +7,4 @@ vits-vctk
sherpa-onnx-paraformer-zh-2023-09-14
!*.sh
*.bak
streaming-hlg-decode-file

View File

@@ -111,6 +111,15 @@ func sherpaOnnxFeatureConfig(
feature_dim: Int32(featureDim))
}
/// Build the C struct holding the online CTC FST decoder options.
///
/// - Parameters:
///   - graph: Path to the decoding graph (e.g. an HLG.fst file); empty disables FST decoding.
///   - maxActive: Upper bound on the number of active states kept during decoding.
/// - Returns: A `SherpaOnnxOnlineCtcFstDecoderConfig` ready to embed in the recognizer config.
func sherpaOnnxOnlineCtcFstDecoderConfig(
  graph: String = "",
  maxActive: Int = 3000
) -> SherpaOnnxOnlineCtcFstDecoderConfig {
  let cGraph = toCPointer(graph)
  let cMaxActive = Int32(maxActive)
  return SherpaOnnxOnlineCtcFstDecoderConfig(graph: cGraph, max_active: cMaxActive)
}
func sherpaOnnxOnlineRecognizerConfig(
featConfig: SherpaOnnxFeatureConfig,
modelConfig: SherpaOnnxOnlineModelConfig,
@@ -121,7 +130,8 @@ func sherpaOnnxOnlineRecognizerConfig(
decodingMethod: String = "greedy_search",
maxActivePaths: Int = 4,
hotwordsFile: String = "",
hotwordsScore: Float = 1.5
hotwordsScore: Float = 1.5,
ctcFstDecoderConfig: SherpaOnnxOnlineCtcFstDecoderConfig = sherpaOnnxOnlineCtcFstDecoderConfig()
) -> SherpaOnnxOnlineRecognizerConfig {
return SherpaOnnxOnlineRecognizerConfig(
feat_config: featConfig,
@@ -133,7 +143,9 @@ func sherpaOnnxOnlineRecognizerConfig(
rule2_min_trailing_silence: rule2MinTrailingSilence,
rule3_min_utterance_length: rule3MinUtteranceLength,
hotwords_file: toCPointer(hotwordsFile),
hotwords_score: hotwordsScore)
hotwords_score: hotwordsScore,
ctc_fst_decoder_config: ctcFstDecoderConfig
)
}
/// Wrapper for recognition result.

View File

#!/usr/bin/env bash

set -ex

# The Swift demo links against the macOS build tree; bail out early if it
# has not been produced yet.
if [ ! -d ../build-swift-macos ]; then
  echo "Please run ../build-swift-macos.sh first!"
  exit 1
fi

model_dir=sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18

# Fetch and unpack the pre-trained streaming zipformer CTC model on first run.
if [ ! -f ./"$model_dir"/HLG.fst ]; then
  echo "Downloading the pre-trained model for testing."

  wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/"$model_dir".tar.bz2
  tar xvf "$model_dir".tar.bz2
  rm "$model_dir".tar.bz2
fi

# Compile the demo binary only when it does not exist yet.
if [ ! -e ./streaming-hlg-decode-file ]; then
  # Note: We use -lc++ to link against libc++ instead of libstdc++
  swiftc \
    -lc++ \
    -I ../build-swift-macos/install/include \
    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
    ./streaming-hlg-decode-file.swift ./SherpaOnnx.swift \
    -L ../build-swift-macos/install/lib/ \
    -l sherpa-onnx \
    -l onnxruntime \
    -o streaming-hlg-decode-file

  strip ./streaming-hlg-decode-file
else
  echo "./streaming-hlg-decode-file exists - skip building"
fi

# Let the dynamic loader find libsherpa-onnx / libonnxruntime at run time.
export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH

./streaming-hlg-decode-file

View File

@@ -0,0 +1,79 @@
import AVFoundation
extension AudioBuffer {
  /// Copies this buffer's contents into a Swift `[Float]` array.
  /// NOTE(review): assumes the underlying bytes really are Float32 samples;
  /// run() asserts `commonFormat == .pcmFormatFloat32` before this is used.
  func array() -> [Float] {
    return Array(UnsafeBufferPointer(self))
  }
}
extension AVAudioPCMBuffer {
  /// Returns the samples of the first underlying `AudioBuffer` as `[Float]`.
  /// `mBuffers` only exposes the first buffer, so this is only correct for
  /// single-channel data — run() asserts `channelCount == 1` before calling.
  func array() -> [Float] {
    return self.audioBufferList.pointee.mBuffers.array()
  }
}
/// Decodes one wave file with the streaming zipformer2 CTC model plus an HLG
/// decoding graph, then prints the recognized text to stdout.
///
/// Failures (missing model files, unreadable wave file) terminate the process,
/// which is the intended behavior for this command-line demo.
func run() {
  let filePath =
    "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav"
  let model =
    "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx"
  let tokens = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt"

  // Assemble the recognizer configuration from its component configs.
  let zipformer2CtcModelConfig = sherpaOnnxOnlineZipformer2CtcModelConfig(
    model: model
  )

  let modelConfig = sherpaOnnxOnlineModelConfig(
    tokens: tokens,
    zipformer2Ctc: zipformer2CtcModelConfig
  )

  let featConfig = sherpaOnnxFeatureConfig(
    sampleRate: 16000,
    featureDim: 80
  )

  let ctcFstDecoderConfig = sherpaOnnxOnlineCtcFstDecoderConfig(
    graph: "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst",
    maxActive: 3000
  )

  var config = sherpaOnnxOnlineRecognizerConfig(
    featConfig: featConfig,
    modelConfig: modelConfig,
    ctcFstDecoderConfig: ctcFstDecoderConfig
  )

  let recognizer = SherpaOnnxRecognizer(config: &config)

  // Read the whole wave file into memory as Float32 mono samples.
  let audioFile = try! AVAudioFile(forReading: URL(fileURLWithPath: filePath))
  let format = audioFile.processingFormat
  assert(format.channelCount == 1)
  assert(format.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)

  let frameCount = UInt32(audioFile.length)
  let fileBuffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount)!
  try! audioFile.read(into: fileBuffer)
  let samples: [Float] = fileBuffer.array()

  // Feed the audio, then some zero tail padding so trailing audio is decoded.
  recognizer.acceptWaveform(samples: samples, sampleRate: Int(format.sampleRate))

  let tailPadding = [Float](repeating: 0.0, count: 3200)
  recognizer.acceptWaveform(samples: tailPadding, sampleRate: Int(format.sampleRate))

  recognizer.inputFinished()
  while recognizer.isReady() {
    recognizer.decode()
  }

  let result = recognizer.getResult()
  print("\nresult is:\n\(result.text)")
}
/// Command-line entry point: runs the streaming HLG decoding demo.
@main
struct App {
  static func main() {
    run()
  }
}