Add two-pass speech recognition Android/iOS demo (#304)
This commit is contained in:
2
swift-api-examples/.gitignore
vendored
Normal file
2
swift-api-examples/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
decode-file
|
||||
decode-file-non-streaming
|
||||
@@ -175,7 +175,7 @@ class SherpaOnnxRecognizer {
|
||||
let recognizer: OpaquePointer!
|
||||
let stream: OpaquePointer!
|
||||
|
||||
/// Constructor taking a model config and a decoder config.
|
||||
/// Constructor taking a model config
|
||||
init(
|
||||
config: UnsafePointer<SherpaOnnxOnlineRecognizerConfig>!
|
||||
) {
|
||||
@@ -198,8 +198,7 @@ class SherpaOnnxRecognizer {
|
||||
/// - Parameters:
|
||||
/// - samples: Audio samples normalized to the range [-1, 1]
|
||||
/// - sampleRate: Sample rate of the input audio samples. Must match
|
||||
/// the one expected by the model. It must be 16000 for
|
||||
/// models from icefall.
|
||||
/// the one expected by the model.
|
||||
func acceptWaveform(samples: [Float], sampleRate: Int = 16000) {
  // The C API takes 32-bit integers for both the sample rate and the count.
  let rate = Int32(sampleRate)
  let sampleCount = Int32(samples.count)
  AcceptWaveform(stream, rate, samples, sampleCount)
}
|
||||
@@ -238,3 +237,163 @@ class SherpaOnnxRecognizer {
|
||||
return IsEndpoint(recognizer, stream) == 1 ? true : false
|
||||
}
|
||||
}
|
||||
|
||||
// For offline APIs
|
||||
|
||||
/// Builds the C struct describing an offline transducer model.
///
/// Every argument is converted with `toCPointer` and stored in the field of
/// the same name. All arguments default to the empty string.
///
/// - Parameters:
///   - encoder: Value for the `encoder` field (presumably a model file path).
///   - decoder: Value for the `decoder` field (presumably a model file path).
///   - joiner: Value for the `joiner` field (presumably a model file path).
/// - Returns: The populated `SherpaOnnxOfflineTransducerModelConfig`.
func sherpaOnnxOfflineTransducerModelConfig(
  encoder: String = "",
  decoder: String = "",
  joiner: String = ""
) -> SherpaOnnxOfflineTransducerModelConfig {
  .init(
    encoder: toCPointer(encoder),
    decoder: toCPointer(decoder),
    joiner: toCPointer(joiner)
  )
}
|
||||
|
||||
/// Builds the C struct describing an offline Paraformer model.
///
/// - Parameter model: Value for the `model` field, converted with
///   `toCPointer` (presumably a model file path). Defaults to the empty string.
/// - Returns: The populated `SherpaOnnxOfflineParaformerModelConfig`.
func sherpaOnnxOfflineParaformerModelConfig(
  model: String = ""
) -> SherpaOnnxOfflineParaformerModelConfig {
  .init(model: toCPointer(model))
}
|
||||
|
||||
/// Builds the C struct describing an offline NeMo EncDec CTC model.
///
/// - Parameter model: Value for the `model` field, converted with
///   `toCPointer` (presumably a model file path). Defaults to the empty string.
/// - Returns: The populated `SherpaOnnxOfflineNemoEncDecCtcModelConfig`.
func sherpaOnnxOfflineNemoEncDecCtcModelConfig(
  model: String = ""
) -> SherpaOnnxOfflineNemoEncDecCtcModelConfig {
  .init(model: toCPointer(model))
}
|
||||
|
||||
/// Builds the C struct describing an offline Whisper model.
///
/// Both arguments are converted with `toCPointer` and stored in the field of
/// the same name. Both default to the empty string.
///
/// - Parameters:
///   - encoder: Value for the `encoder` field (presumably a model file path).
///   - decoder: Value for the `decoder` field (presumably a model file path).
/// - Returns: The populated `SherpaOnnxOfflineWhisperModelConfig`.
func sherpaOnnxOfflineWhisperModelConfig(
  encoder: String = "",
  decoder: String = ""
) -> SherpaOnnxOfflineWhisperModelConfig {
  .init(
    encoder: toCPointer(encoder),
    decoder: toCPointer(decoder)
  )
}
|
||||
|
||||
/// Builds the C struct describing an offline TDNN model.
///
/// - Parameter model: Value for the `model` field, converted with
///   `toCPointer` (presumably a model file path). Defaults to the empty string.
/// - Returns: The populated `SherpaOnnxOfflineTdnnModelConfig`.
func sherpaOnnxOfflineTdnnModelConfig(
  model: String = ""
) -> SherpaOnnxOfflineTdnnModelConfig {
  .init(model: toCPointer(model))
}
|
||||
|
||||
/// Builds the C struct describing the language model used by the offline
/// recognizer.
///
/// - Parameters:
///   - model: Value for the `model` field, converted with `toCPointer`
///     (presumably a model file path). Defaults to the empty string.
///   - scale: Value stored unchanged in the `scale` field. Defaults to 1.0.
/// - Returns: The populated `SherpaOnnxOfflineLMConfig`.
func sherpaOnnxOfflineLMConfig(
  model: String = "",
  scale: Float = 1.0
) -> SherpaOnnxOfflineLMConfig {
  .init(
    model: toCPointer(model),
    scale: scale
  )
}
|
||||
|
||||
/// Assembles the C struct that aggregates every offline model configuration.
///
/// Each per-model sub-configuration defaults to the value produced by its
/// factory function called with no arguments. String arguments are converted
/// with `toCPointer`; integer arguments are converted to `Int32`.
///
/// - Parameters:
///   - tokens: Value for the `tokens` field (presumably the tokens file path).
///   - transducer: Transducer model configuration.
///   - paraformer: Paraformer model configuration.
///   - nemoCtc: NeMo EncDec CTC model configuration (stored as `nemo_ctc`).
///   - whisper: Whisper model configuration.
///   - tdnn: TDNN model configuration.
///   - numThreads: Stored as `num_threads`. Defaults to 1.
///   - provider: Stored as `provider`. Defaults to "cpu".
///   - debug: Stored as `debug`. Defaults to 0.
///   - modelType: Stored as `model_type`. Defaults to the empty string.
/// - Returns: The populated `SherpaOnnxOfflineModelConfig`.
func sherpaOnnxOfflineModelConfig(
  tokens: String,
  transducer: SherpaOnnxOfflineTransducerModelConfig = sherpaOnnxOfflineTransducerModelConfig(),
  paraformer: SherpaOnnxOfflineParaformerModelConfig = sherpaOnnxOfflineParaformerModelConfig(),
  nemoCtc: SherpaOnnxOfflineNemoEncDecCtcModelConfig = sherpaOnnxOfflineNemoEncDecCtcModelConfig(),
  whisper: SherpaOnnxOfflineWhisperModelConfig = sherpaOnnxOfflineWhisperModelConfig(),
  tdnn: SherpaOnnxOfflineTdnnModelConfig = sherpaOnnxOfflineTdnnModelConfig(),
  numThreads: Int = 1,
  provider: String = "cpu",
  debug: Int = 0,
  modelType: String = ""
) -> SherpaOnnxOfflineModelConfig {
  .init(
    // Per-model sub-configurations are passed through untouched.
    transducer: transducer,
    paraformer: paraformer,
    nemo_ctc: nemoCtc,
    whisper: whisper,
    tdnn: tdnn,
    // Settings shared by all model types.
    tokens: toCPointer(tokens),
    num_threads: Int32(numThreads),
    debug: Int32(debug),
    provider: toCPointer(provider),
    model_type: toCPointer(modelType)
  )
}
|
||||
|
||||
/// Assembles the top-level C configuration for an offline recognizer.
///
/// - Parameters:
///   - featConfig: Feature extraction configuration (stored as `feat_config`).
///   - modelConfig: Model configuration (stored as `model_config`).
///   - lmConfig: Language model configuration (stored as `lm_config`).
///     Defaults to `sherpaOnnxOfflineLMConfig()` called with no arguments.
///   - decodingMethod: Stored as `decoding_method` after conversion with
///     `toCPointer`. Defaults to "greedy_search".
///   - maxActivePaths: Stored as `max_active_paths` after conversion to
///     `Int32`. Defaults to 4.
/// - Returns: The populated `SherpaOnnxOfflineRecognizerConfig`.
func sherpaOnnxOfflineRecognizerConfig(
  featConfig: SherpaOnnxFeatureConfig,
  modelConfig: SherpaOnnxOfflineModelConfig,
  lmConfig: SherpaOnnxOfflineLMConfig = sherpaOnnxOfflineLMConfig(),
  decodingMethod: String = "greedy_search",
  maxActivePaths: Int = 4
) -> SherpaOnnxOfflineRecognizerConfig {
  .init(
    feat_config: featConfig,
    model_config: modelConfig,
    lm_config: lmConfig,
    decoding_method: toCPointer(decodingMethod),
    max_active_paths: Int32(maxActivePaths)
  )
}
|
||||
|
||||
/// Swift wrapper around a C `SherpaOnnxOfflineRecognizerResult`.
///
/// The wrapped pointer is passed to `DestroyOfflineRecognizerResult` when this
/// object is deallocated, so a given pointer must be handed to one wrapper only.
///
/// NOTE: "Recongition" in the class name is a misspelling of "Recognition";
/// it is kept as-is so that existing callers continue to compile.
class SherpaOnnxOfflineRecongitionResult {
  /// A pointer to the underlying counterpart in C. May be nil.
  let result: UnsafePointer<SherpaOnnxOfflineRecognizerResult>!

  /// Return the actual recognition result.
  /// For English models, it contains words separated by spaces.
  /// For Chinese models, it contains Chinese words.
  ///
  /// Returns the empty string when there is no underlying C result or when
  /// its `text` field is nil, instead of trapping on a nil pointer.
  var text: String {
    guard let cText = result?.pointee.text else {
      return ""
    }
    return String(cString: cText)
  }

  /// Wraps `result`.
  ///
  /// - Parameter result: Pointer to the C result to wrap; nil is accepted.
  init(result: UnsafePointer<SherpaOnnxOfflineRecognizerResult>!) {
    self.result = result
  }

  deinit {
    // Only release a result that actually exists.
    if let result {
      DestroyOfflineRecognizerResult(result)
    }
  }
}
|
||||
|
||||
/// Swift wrapper around the C offline (non-streaming) recognizer.
///
/// The C recognizer is created in `init` and destroyed in `deinit`.
class SherpaOnnxOfflineRecognizer {
  /// A pointer to the underlying counterpart in C
  let recognizer: OpaquePointer!

  /// Creates the underlying C recognizer with `CreateOfflineRecognizer`.
  ///
  /// - Parameter config: Pointer to the recognizer configuration.
  init(
    config: UnsafePointer<SherpaOnnxOfflineRecognizerConfig>!
  ) {
    recognizer = CreateOfflineRecognizer(config)
  }

  deinit {
    // Nothing to release if the C recognizer pointer is nil.
    guard let recognizer else { return }
    DestroyOfflineRecognizer(recognizer)
  }

  /// Decode wave samples.
  ///
  /// A fresh offline stream is created for each call and destroyed before
  /// the call returns.
  ///
  /// - Parameters:
  ///   - samples: Audio samples normalized to the range [-1, 1]
  ///   - sampleRate: Sample rate of the input audio samples. Must match
  ///                 the one expected by the model.
  /// - Returns: A wrapper around the result obtained from
  ///   `GetOfflineStreamResult`.
  func decode(samples: [Float], sampleRate: Int = 16000) -> SherpaOnnxOfflineRecongitionResult {
    let offlineStream: OpaquePointer! = CreateOfflineStream(recognizer)
    // The stream is only needed while decoding; release it on the way out.
    defer { DestroyOfflineStream(offlineStream) }

    AcceptWaveformOffline(offlineStream, Int32(sampleRate), samples, Int32(samples.count))
    DecodeOfflineStream(recognizer, offlineStream)

    let cResult: UnsafeMutablePointer<SherpaOnnxOfflineRecognizerResult>? =
      GetOfflineStreamResult(offlineStream)
    return SherpaOnnxOfflineRecongitionResult(result: cResult)
  }
}
|
||||
|
||||
65
swift-api-examples/decode-file-non-streaming.swift
Normal file
65
swift-api-examples/decode-file-non-streaming.swift
Normal file
@@ -0,0 +1,65 @@
|
||||
import AVFoundation
|
||||
|
||||
extension AudioBuffer {
  /// Copies the contents of this buffer into a Swift array, viewing the
  /// buffer's memory as `Float` values.
  func array() -> [Float] {
    let floats = UnsafeBufferPointer<Float>(self)
    return [Float](floats)
  }
}
|
||||
|
||||
extension AVAudioPCMBuffer {
  /// Returns the samples stored in the `mBuffers` entry of this PCM buffer's
  /// audio buffer list, as an array of `Float`.
  func array() -> [Float] {
    let buffer = audioBufferList.pointee.mBuffers
    return buffer.array()
  }
}
|
||||
|
||||
/// Decodes a single wave file with a non-streaming Whisper model and prints
/// the recognized text.
///
/// The model files and the test wave are read from hard-coded paths under
/// `./sherpa-onnx-whisper-tiny.en`. The function terminates the process with
/// a descriptive message if the wave file cannot be opened or read, if it is
/// not single-channel Float32 audio, or if the PCM buffer cannot be allocated.
func run() {
  let encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx"
  let decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx"
  let tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt"

  let whisperConfig = sherpaOnnxOfflineWhisperModelConfig(
    encoder: encoder,
    decoder: decoder
  )

  let modelConfig = sherpaOnnxOfflineModelConfig(
    tokens: tokens,
    whisper: whisperConfig,
    debug: 0,
    modelType: "whisper"
  )

  let featConfig = sherpaOnnxFeatureConfig(
    sampleRate: 16000,
    featureDim: 80
  )

  // `var` because the recognizer takes the configuration by pointer (`&config`).
  var config = sherpaOnnxOfflineRecognizerConfig(
    featConfig: featConfig,
    modelConfig: modelConfig
  )

  let recognizer = SherpaOnnxOfflineRecognizer(config: &config)

  let filePath = "./sherpa-onnx-whisper-tiny.en/test_wavs/0.wav"
  let fileURL = URL(fileURLWithPath: filePath)

  let audioFile: AVAudioFile
  do {
    audioFile = try AVAudioFile(forReading: fileURL)
  } catch {
    fatalError("Failed to open \(filePath): \(error)")
  }

  // Use `precondition` rather than `assert`: these checks guard the input
  // handed to the recognizer and must also run in optimized builds.
  let audioFormat = audioFile.processingFormat
  precondition(audioFormat.channelCount == 1, "Only single-channel audio is supported")
  precondition(
    audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32,
    "Only Float32 PCM audio is supported")

  let audioFrameCount = UInt32(audioFile.length)
  guard
    let audioFileBuffer = AVAudioPCMBuffer(
      pcmFormat: audioFormat, frameCapacity: audioFrameCount)
  else {
    fatalError("Failed to allocate a PCM buffer of \(audioFrameCount) frames")
  }

  do {
    try audioFile.read(into: audioFileBuffer)
  } catch {
    fatalError("Failed to read \(filePath): \(error)")
  }

  let samples = audioFileBuffer.array()
  let result = recognizer.decode(samples: samples, sampleRate: Int(audioFormat.sampleRate))
  print("\nresult is:\n\(result.text)")
}
|
||||
|
||||
/// Program entry point; it simply delegates to `run()`.
@main
struct App {
  static func main() { run() }
}
|
||||
Reference in New Issue
Block a user