Add two-pass speech recognition Android/iOS demo (#304)

2023-09-12 15:40:16 +08:00
parent 8982984ea2
commit debab7c091
97 changed files with 3546 additions and 57 deletions
--- a/swift-api-examples/.gitignore
+++ b/swift-api-examples/.gitignore
@@ -0,0 +1,2 @@
+decode-file
+decode-file-non-streaming
--- a/swift-api-examples/SherpaOnnx.swift
+++ b/swift-api-examples/SherpaOnnx.swift
@@ -175,7 +175,7 @@ class SherpaOnnxRecognizer {
  let recognizer: OpaquePointer!
  let stream: OpaquePointer!

-  /// Constructor taking a model config and a decoder config.
+  /// Constructor taking a pointer to an online recognizer config.
  init(
    config: UnsafePointer<SherpaOnnxOnlineRecognizerConfig>!
  ) {
@@ -198,8 +198,7 @@ class SherpaOnnxRecognizer {
  /// - Parameters:
  ///   - samples: Audio samples normalized to the range [-1, 1]
  ///   - sampleRate: Sample rate of the input audio samples. Must match
-  ///                 the one expected by the model. It must be 16000 for
-  ///                 models from icefall.
+  ///                 the one expected by the model.
  func acceptWaveform(samples: [Float], sampleRate: Int = 16000) {
    AcceptWaveform(stream, Int32(sampleRate), samples, Int32(samples.count))
  }
@@ -238,3 +237,163 @@ class SherpaOnnxRecognizer {
    return IsEndpoint(recognizer, stream) == 1 ? true : false
  }
 }
+
+// Offline (non-streaming) APIs
+
+func sherpaOnnxOfflineTransducerModelConfig(
+  encoder: String = "",
+  decoder: String = "",
+  joiner: String = ""
+) -> SherpaOnnxOfflineTransducerModelConfig {
+  return SherpaOnnxOfflineTransducerModelConfig(
+    encoder: toCPointer(encoder),
+    decoder: toCPointer(decoder),
+    joiner: toCPointer(joiner)
+  )
+}
+
+func sherpaOnnxOfflineParaformerModelConfig(
+  model: String = ""
+) -> SherpaOnnxOfflineParaformerModelConfig {
+  return SherpaOnnxOfflineParaformerModelConfig(
+    model: toCPointer(model)
+  )
+}
+
+func sherpaOnnxOfflineNemoEncDecCtcModelConfig(
+  model: String = ""
+) -> SherpaOnnxOfflineNemoEncDecCtcModelConfig {
+  return SherpaOnnxOfflineNemoEncDecCtcModelConfig(
+    model: toCPointer(model)
+  )
+}
+
+func sherpaOnnxOfflineWhisperModelConfig(
+  encoder: String = "",
+  decoder: String = ""
+) -> SherpaOnnxOfflineWhisperModelConfig {
+  return SherpaOnnxOfflineWhisperModelConfig(
+    encoder: toCPointer(encoder),
+    decoder: toCPointer(decoder)
+  )
+}
+
+func sherpaOnnxOfflineTdnnModelConfig(
+  model: String = ""
+) -> SherpaOnnxOfflineTdnnModelConfig {
+  return SherpaOnnxOfflineTdnnModelConfig(
+    model: toCPointer(model)
+  )
+}
+
+func sherpaOnnxOfflineLMConfig(
+  model: String = "",
+  scale: Float = 1.0
+) -> SherpaOnnxOfflineLMConfig {
+  return SherpaOnnxOfflineLMConfig(
+    model: toCPointer(model),
+    scale: scale
+  )
+}
+
+func sherpaOnnxOfflineModelConfig(
+  tokens: String,
+  transducer: SherpaOnnxOfflineTransducerModelConfig = sherpaOnnxOfflineTransducerModelConfig(),
+  paraformer: SherpaOnnxOfflineParaformerModelConfig = sherpaOnnxOfflineParaformerModelConfig(),
+  nemoCtc: SherpaOnnxOfflineNemoEncDecCtcModelConfig = sherpaOnnxOfflineNemoEncDecCtcModelConfig(),
+  whisper: SherpaOnnxOfflineWhisperModelConfig = sherpaOnnxOfflineWhisperModelConfig(),
+  tdnn: SherpaOnnxOfflineTdnnModelConfig = sherpaOnnxOfflineTdnnModelConfig(),
+  numThreads: Int = 1,
+  provider: String = "cpu",
+  debug: Int = 0,
+  modelType: String = ""
+) -> SherpaOnnxOfflineModelConfig {
+  return SherpaOnnxOfflineModelConfig(
+    transducer: transducer,
+    paraformer: paraformer,
+    nemo_ctc: nemoCtc,
+    whisper: whisper,
+    tdnn: tdnn,
+    tokens: toCPointer(tokens),
+    num_threads: Int32(numThreads),
+    debug: Int32(debug),
+    provider: toCPointer(provider),
+    model_type: toCPointer(modelType)
+  )
+}
+
+func sherpaOnnxOfflineRecognizerConfig(
+  featConfig: SherpaOnnxFeatureConfig,
+  modelConfig: SherpaOnnxOfflineModelConfig,
+  lmConfig: SherpaOnnxOfflineLMConfig = sherpaOnnxOfflineLMConfig(),
+  decodingMethod: String = "greedy_search",
+  maxActivePaths: Int = 4
+) -> SherpaOnnxOfflineRecognizerConfig {
+  return SherpaOnnxOfflineRecognizerConfig(
+    feat_config: featConfig,
+    model_config: modelConfig,
+    lm_config: lmConfig,
+    decoding_method: toCPointer(decodingMethod),
+    max_active_paths: Int32(maxActivePaths)
+  )
+}
+
+class SherpaOnnxOfflineRecongitionResult {
+  /// A pointer to the underlying counterpart in C
+  let result: UnsafePointer<SherpaOnnxOfflineRecognizerResult>!
+
+  /// Return the actual recognition result.
+  /// For English models, it contains words separated by spaces.
+  /// For Chinese models, it contains Chinese words.
+  var text: String {
+    return String(cString: result.pointee.text)
+  }
+
+  init(result: UnsafePointer<SherpaOnnxOfflineRecognizerResult>!) {
+    self.result = result
+  }
+
+  deinit {
+    if let result {
+      DestroyOfflineRecognizerResult(result)
+    }
+  }
+}
+
+class SherpaOnnxOfflineRecognizer {
+  /// A pointer to the underlying counterpart in C
+  let recognizer: OpaquePointer!
+
+  init(
+    config: UnsafePointer<SherpaOnnxOfflineRecognizerConfig>!
+  ) {
+    recognizer = CreateOfflineRecognizer(config)
+  }
+
+  deinit {
+    if let recognizer {
+      DestroyOfflineRecognizer(recognizer)
+    }
+  }
+
+  /// Decode a complete buffer of wave samples and return the recognition result.
+  ///
+  /// - Parameters:
+  ///   - samples: Audio samples normalized to the range [-1, 1]
+  ///   - sampleRate: Sample rate of the input audio samples. Must match
+  ///                 the one expected by the model.
+  func decode(samples: [Float], sampleRate: Int = 16000) -> SherpaOnnxOfflineRecongitionResult {
+    let stream: OpaquePointer! = CreateOfflineStream(recognizer)
+
+    AcceptWaveformOffline(stream, Int32(sampleRate), samples, Int32(samples.count))
+
+    DecodeOfflineStream(recognizer, stream)
+
+    let result: UnsafeMutablePointer<SherpaOnnxOfflineRecognizerResult>? = GetOfflineStreamResult(
+      stream)
+
+    DestroyOfflineStream(stream)
+
+    return SherpaOnnxOfflineRecongitionResult(result: result)
+  }
+}
--- a/swift-api-examples/decode-file-non-streaming.swift
+++ b/swift-api-examples/decode-file-non-streaming.swift
@@ -0,0 +1,65 @@
+import AVFoundation
+
+extension AudioBuffer {
+  func array() -> [Float] {
+    return Array(UnsafeBufferPointer(self))
+  }
+}
+
+extension AVAudioPCMBuffer {
+  func array() -> [Float] {
+    return self.audioBufferList.pointee.mBuffers.array()
+  }
+}
+
+func run() {
+  let encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx"
+  let decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx"
+  let tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt"
+
+  let whisperConfig = sherpaOnnxOfflineWhisperModelConfig(
+    encoder: encoder,
+    decoder: decoder
+  )
+
+  let modelConfig = sherpaOnnxOfflineModelConfig(
+    tokens: tokens,
+    whisper: whisperConfig,
+    debug: 0,
+    modelType: "whisper"
+  )
+
+  let featConfig = sherpaOnnxFeatureConfig(
+    sampleRate: 16000,
+    featureDim: 80
+  )
+  var config = sherpaOnnxOfflineRecognizerConfig(
+    featConfig: featConfig,
+    modelConfig: modelConfig
+  )
+
+  let recognizer = SherpaOnnxOfflineRecognizer(config: &config)
+
+  let filePath = "./sherpa-onnx-whisper-tiny.en/test_wavs/0.wav"
+  let fileURL: NSURL = NSURL(fileURLWithPath: filePath)
+  let audioFile = try! AVAudioFile(forReading: fileURL as URL)
+
+  let audioFormat = audioFile.processingFormat
+  assert(audioFormat.channelCount == 1)
+  assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)
+
+  let audioFrameCount = UInt32(audioFile.length)
+  let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount)
+
+  try! audioFile.read(into: audioFileBuffer!)
+  let array: [Float]! = audioFileBuffer?.array()
+  let result = recognizer.decode(samples: array, sampleRate: Int(audioFormat.sampleRate))
+  print("\nresult is:\n\(result.text)")
+}
+
+@main
+struct App {
+  static func main() {
+    run()
+  }
+}