Add Swift example for generating subtitles (#318)

2023-09-18 15:16:54 +08:00
parent 2d51ca49b7
commit 692a47dd80
9 changed files with 654 additions and 11 deletions
--- a/swift-api-examples/.gitignore
+++ b/swift-api-examples/.gitignore
@@ -1,2 +1,3 @@
 decode-file
 decode-file-non-streaming
+generate-subtitles
--- a/swift-api-examples/SherpaOnnx.swift
+++ b/swift-api-examples/SherpaOnnx.swift
@@ -215,7 +215,7 @@ class SherpaOnnxRecognizer {

  /// Get the decoding results so far
  func getResult() -> SherpaOnnxOnlineRecongitionResult {
-    let result: UnsafeMutablePointer<SherpaOnnxOnlineRecognizerResult>? = GetOnlineStreamResult(
+    let result: UnsafePointer<SherpaOnnxOnlineRecognizerResult>? = GetOnlineStreamResult(
      recognizer, stream)
    return SherpaOnnxOnlineRecongitionResult(result: result)
  }
@@ -406,7 +406,7 @@ class SherpaOnnxOfflineRecognizer {

    DecodeOfflineStream(recognizer, stream)

-    let result: UnsafeMutablePointer<SherpaOnnxOfflineRecognizerResult>? = GetOfflineStreamResult(
+    let result: UnsafePointer<SherpaOnnxOfflineRecognizerResult>? = GetOfflineStreamResult(
      stream)

    DestroyOfflineStream(stream)
@@ -414,3 +414,145 @@ class SherpaOnnxOfflineRecognizer {
    return SherpaOnnxOfflineRecongitionResult(result: result)
  }
 }
+
+/// Creates a `SherpaOnnxSileroVadModelConfig` from Swift-native arguments.
+///
+/// - Parameters:
+///   - model: Passed through `toCPointer` and stored in the `model` field.
+///   - threshold: Stored in `threshold`. Defaults to 0.5.
+///   - minSilenceDuration: Stored in `min_silence_duration`. Defaults to 0.25.
+///   - minSpeechDuration: Stored in `min_speech_duration`. Defaults to 0.5.
+///   - windowSize: Converted to `Int32` and stored in `window_size`. Defaults to 512.
+/// - Returns: The populated config struct.
+func sherpaOnnxSileroVadModelConfig(
+  model: String,
+  threshold: Float = 0.5,
+  minSilenceDuration: Float = 0.25,
+  minSpeechDuration: Float = 0.5,
+  windowSize: Int = 512
+) -> SherpaOnnxSileroVadModelConfig {
+  let modelPointer = toCPointer(model)
+  let windowSize32 = Int32(windowSize)
+
+  let config = SherpaOnnxSileroVadModelConfig(
+    model: modelPointer,
+    threshold: threshold,
+    min_silence_duration: minSilenceDuration,
+    min_speech_duration: minSpeechDuration,
+    window_size: windowSize32)
+  return config
+}
+
+/// Creates a `SherpaOnnxVadModelConfig` from Swift-native arguments.
+///
+/// - Parameters:
+///   - sileroVad: Stored unchanged in `silero_vad`.
+///   - sampleRate: Stored in `sample_rate`. Defaults to 16000.
+///   - numThreads: Converted to `Int32` and stored in `num_threads`. Defaults to 1.
+///   - provider: Passed through `toCPointer` and stored in `provider`. Defaults to "cpu".
+///   - debug: Converted to `Int32` and stored in `debug`. Defaults to 0.
+/// - Returns: The populated config struct.
+func sherpaOnnxVadModelConfig(
+  sileroVad: SherpaOnnxSileroVadModelConfig,
+  sampleRate: Int32 = 16000,
+  numThreads: Int = 1,
+  provider: String = "cpu",
+  debug: Int = 0
+) -> SherpaOnnxVadModelConfig {
+  let threadCount = Int32(numThreads)
+  let providerPointer = toCPointer(provider)
+  let debugFlag = Int32(debug)
+
+  let config = SherpaOnnxVadModelConfig(
+    silero_vad: sileroVad,
+    sample_rate: sampleRate,
+    num_threads: threadCount,
+    provider: providerPointer,
+    debug: debugFlag)
+  return config
+}
+
+/// Swift wrapper around the C circular buffer created by
+/// `SherpaOnnxCreateCircularBuffer`. The handle is destroyed in `deinit`.
+class SherpaOnnxCircularBufferWrapper {
+  /// Handle returned by `SherpaOnnxCreateCircularBuffer`.
+  let buffer: OpaquePointer!
+
+  /// - Parameter capacity: Forwarded to the C constructor as an `Int32`.
+  init(capacity: Int) {
+    buffer = SherpaOnnxCreateCircularBuffer(Int32(capacity))
+  }
+
+  deinit {
+    if let buffer {
+      SherpaOnnxDestroyCircularBuffer(buffer)
+    }
+  }
+
+  /// Passes `samples` and its count to `SherpaOnnxCircularBufferPush`.
+  func push(samples: [Float]) {
+    SherpaOnnxCircularBufferPush(buffer, samples, Int32(samples.count))
+  }
+
+  /// Returns a Swift copy of the `n` samples that
+  /// `SherpaOnnxCircularBufferGet` yields for `startIndex`.
+  ///
+  /// - Returns: The copied samples, or an empty array when the C call
+  ///   returns a null pointer.
+  func get(startIndex: Int, n: Int) -> [Float] {
+    guard let p = SherpaOnnxCircularBufferGet(buffer, Int32(startIndex), Int32(n)) else {
+      return []
+    }
+    // The array returned by the C call is released on every exit path.
+    defer { SherpaOnnxCircularBufferFree(p) }
+
+    // Copy all samples in one go instead of appending element by element.
+    return Array(UnsafeBufferPointer(start: p, count: n))
+  }
+
+  /// Forwards `n` to `SherpaOnnxCircularBufferPop`.
+  func pop(n: Int) {
+    SherpaOnnxCircularBufferPop(buffer, Int32(n))
+  }
+
+  /// The value reported by `SherpaOnnxCircularBufferSize`, as an `Int`.
+  func size() -> Int {
+    return Int(SherpaOnnxCircularBufferSize(buffer))
+  }
+
+  /// Calls `SherpaOnnxCircularBufferReset` on the underlying buffer.
+  func reset() {
+    SherpaOnnxCircularBufferReset(buffer)
+  }
+}
+
+/// Owns a `SherpaOnnxSpeechSegment` pointer and destroys it in `deinit`.
+///
+/// A nil pointer is tolerated everywhere: `deinit` skips the destroy call and
+/// the accessors report an empty segment instead of crashing.
+class SherpaOnnxSpeechSegmentWrapper {
+  /// The wrapped C segment; may be nil.
+  let p: UnsafePointer<SherpaOnnxSpeechSegment>!
+
+  /// - Parameter p: Segment pointer to take ownership of.
+  init(p: UnsafePointer<SherpaOnnxSpeechSegment>!) {
+    self.p = p
+  }
+
+  deinit {
+    if let p {
+      SherpaOnnxDestroySpeechSegment(p)
+    }
+  }
+
+  /// The segment's `start` field, or 0 when there is no segment.
+  var start: Int {
+    guard let p else { return 0 }
+    return Int(p.pointee.start)
+  }
+
+  /// The segment's `n` field (its sample count), or 0 when there is no segment.
+  var n: Int {
+    guard let p else { return 0 }
+    return Int(p.pointee.n)
+  }
+
+  /// A fresh copy of the segment's `n` samples; empty when there is no
+  /// segment or its `samples` pointer is nil.
+  var samples: [Float] {
+    guard let p else { return [] }
+    let segment = p.pointee
+    // Bulk copy; a nil `start` yields an empty buffer.
+    return Array(UnsafeBufferPointer(start: segment.samples, count: Int(segment.n)))
+  }
+}
+
+/// Swift wrapper around the C voice activity detector. The detector is
+/// created in `init` and destroyed in `deinit`.
+class SherpaOnnxVoiceActivityDetectorWrapper {
+  /// A pointer to the underlying counterpart in C
+  let vad: OpaquePointer!
+
+  /// - Parameters:
+  ///   - config: Passed to `SherpaOnnxCreateVoiceActivityDetector`.
+  ///   - buffer_size_in_seconds: Passed to `SherpaOnnxCreateVoiceActivityDetector`.
+  init(config: UnsafePointer<SherpaOnnxVadModelConfig>!, buffer_size_in_seconds: Float) {
+    vad = SherpaOnnxCreateVoiceActivityDetector(config, buffer_size_in_seconds)
+  }
+
+  deinit {
+    if let vad {
+      SherpaOnnxDestroyVoiceActivityDetector(vad)
+    }
+  }
+
+  /// Hands `samples` and its count to the detector.
+  func acceptWaveform(samples: [Float]) {
+    let sampleCount = Int32(samples.count)
+    SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, samples, sampleCount)
+  }
+
+  /// `true` exactly when `SherpaOnnxVoiceActivityDetectorEmpty` returns 1.
+  func isEmpty() -> Bool {
+    let flag = SherpaOnnxVoiceActivityDetectorEmpty(vad)
+    return flag == 1
+  }
+
+  /// Calls `SherpaOnnxVoiceActivityDetectorPop`.
+  func pop() {
+    SherpaOnnxVoiceActivityDetectorPop(vad)
+  }
+
+  /// Wraps the pointer returned by `SherpaOnnxVoiceActivityDetectorFront`.
+  func front() -> SherpaOnnxSpeechSegmentWrapper {
+    let segment: UnsafePointer<SherpaOnnxSpeechSegment>? =
+      SherpaOnnxVoiceActivityDetectorFront(vad)
+    return SherpaOnnxSpeechSegmentWrapper(p: segment)
+  }
+
+  /// Calls `SherpaOnnxVoiceActivityDetectorReset`.
+  func reset() {
+    SherpaOnnxVoiceActivityDetectorReset(vad)
+  }
+}
--- a/swift-api-examples/decode-file-non-streaming.swift
+++ b/swift-api-examples/decode-file-non-streaming.swift
@@ -13,7 +13,6 @@ extension AVAudioPCMBuffer {
 }

 func run() {
-
  var recognizer: SherpaOnnxOfflineRecognizer
  var modelConfig: SherpaOnnxOfflineModelConfig
  var modelType = "whisper"
--- a/swift-api-examples/generate-subtitles.swift
+++ b/swift-api-examples/generate-subtitles.swift
@@ -0,0 +1,217 @@
+/*
+This file shows how to use Swift API to generate subtitles.
+
+You can use the files from
+https://huggingface.co/csukuangfj/vad/tree/main
+for testing.
+
+For instance, to generate subtitles for Obama.mov, please first
+use
+
+ffmpeg -i ./Obama.mov -acodec pcm_s16le -ac 1 -ar 16000 Obama.wav
+
+to extract the audio part from the video.
+
+This file only supports WAV sound files, so you have to first
+extract the audio from videos.
+
+Please see
+./run-generate-subtitles.sh
+for usage.
+*/
+
+import AVFoundation
+
+extension AudioBuffer {
+  /// Copies this buffer's contents, viewed as `Float` values, into a new array.
+  func array() -> [Float] {
+    let floats = UnsafeBufferPointer<Float>(self)
+    return [Float](floats)
+  }
+}
+
+extension AVAudioPCMBuffer {
+  /// Returns the samples of `audioBufferList.pointee.mBuffers` as a `[Float]`.
+  func array() -> [Float] {
+    let audioBuffer = audioBufferList.pointee.mBuffers
+    return audioBuffer.array()
+  }
+}
+
+extension TimeInterval {
+  /// This interval (in seconds) rendered as an SRT timecode, `HH:MM:SS,mmm`.
+  ///
+  /// Hours are zero-padded to two digits, as the SRT format expects.
+  var hourMinuteSecondMS: String {
+    String(format: "%02d:%02d:%02d,%03d", hour, minute, second, millisecond)
+  }
+
+  /// The interval truncated to whole milliseconds.
+  ///
+  /// Every field below is derived from this single integer so that the
+  /// fields can never disagree with each other near a second/minute boundary.
+  private var wholeMilliseconds: Int {
+    Int(self * 1000)
+  }
+
+  /// Whole hours contained in the interval (not wrapped at 24).
+  var hour: Int {
+    wholeMilliseconds / 3_600_000
+  }
+  /// Minutes past the hour.
+  var minute: Int {
+    (wholeMilliseconds / 60_000) % 60
+  }
+  /// Seconds past the minute.
+  var second: Int {
+    (wholeMilliseconds / 1000) % 60
+  }
+  /// Milliseconds past the second.
+  var millisecond: Int {
+    wholeMilliseconds % 1000
+  }
+}
+
+extension String {
+  /// A file URL created from this string with `URL(fileURLWithPath:)`.
+  var fileURL: URL { URL(fileURLWithPath: self) }
+
+  /// The path extension of `fileURL`.
+  var pathExtension: String { fileURL.pathExtension }
+
+  /// The last path component of `fileURL`.
+  var lastPathComponent: String { fileURL.lastPathComponent }
+
+  /// The path of `fileURL` after its path extension has been removed.
+  var stringByDeletingPathExtension: String {
+    let withoutExtension = fileURL.deletingPathExtension()
+    return withoutExtension.path
+  }
+}
+
+/// One recognized stretch of speech: its start time, end time and text.
+class SpeechSegment: CustomStringConvertible {
+  /// Start time of the segment.
+  let start: Float
+  /// End time of the segment, computed as `start + duration`.
+  let end: Float
+  /// Text associated with the segment.
+  let text: String
+
+  /// - Parameters:
+  ///   - start: Start time of the segment.
+  ///   - duration: Length of the segment; added to `start` to obtain `end`.
+  ///   - text: Text associated with the segment.
+  init(start: Float, duration: Float, text: String) {
+    self.text = text
+    self.start = start
+    self.end = start + duration
+  }
+
+  /// `"<start> --> <end>"` on the first line (both rendered with
+  /// `hourMinuteSecondMS`), followed by `text` on the second line.
+  public var description: String {
+    let startStamp = TimeInterval(start).hourMinuteSecondMS
+    let endStamp = TimeInterval(end).hourMinuteSecondMS
+    return "\(startStamp) --> \(endStamp)\n\(text)"
+  }
+}
+
+/// Generates an SRT subtitle file for the WAV file named by `filePath`.
+///
+/// The function builds an offline recognizer for `modelType` ("whisper" or
+/// "paraformer"), reads the whole audio file into memory, feeds it to the
+/// voice activity detector one window at a time, decodes and prints each
+/// detected speech segment, and finally writes all segments to a file whose
+/// name is `filePath` with its extension replaced by `.srt`.
+///
+/// Failures to open or read the audio file, or to write the SRT file, are
+/// reported with `print` instead of crashing.
+func run() {
+  let modelConfig: SherpaOnnxOfflineModelConfig
+  var modelType = "whisper"
+  // modelType = "paraformer"
+  var filePath = "/Users/fangjun/Desktop/Obama.wav"  // English
+  // filePath = "/Users/fangjun/Desktop/lei-jun.wav"  // Chinese
+  // please go to https://huggingface.co/csukuangfj/vad
+  // to download the above two files
+
+  if modelType == "whisper" {
+    // for English
+    let encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx"
+    let decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx"
+    let tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt"
+
+    let whisperConfig = sherpaOnnxOfflineWhisperModelConfig(
+      encoder: encoder,
+      decoder: decoder
+    )
+
+    modelConfig = sherpaOnnxOfflineModelConfig(
+      tokens: tokens,
+      whisper: whisperConfig,
+      debug: 0,
+      modelType: "whisper"
+    )
+  } else if modelType == "paraformer" {
+    // for Chinese
+    let model = "./sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx"
+    let tokens = "./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt"
+    let paraformerConfig = sherpaOnnxOfflineParaformerModelConfig(
+      model: model
+    )
+
+    modelConfig = sherpaOnnxOfflineModelConfig(
+      tokens: tokens,
+      paraformer: paraformerConfig,
+      debug: 0,
+      modelType: "paraformer"
+    )
+  } else {
+    print("Please specify a supported modelType \(modelType)")
+    return
+  }
+
+  let sampleRate = 16000
+  let featConfig = sherpaOnnxFeatureConfig(
+    sampleRate: sampleRate,
+    featureDim: 80
+  )
+  var config = sherpaOnnxOfflineRecognizerConfig(
+    featConfig: featConfig,
+    modelConfig: modelConfig
+  )
+
+  let recognizer = SherpaOnnxOfflineRecognizer(config: &config)
+
+  let audioFile: AVAudioFile
+  do {
+    audioFile = try AVAudioFile(forReading: filePath.fileURL)
+  } catch {
+    print("Error opening \(filePath): \(error.localizedDescription)")
+    return
+  }
+
+  // The pipeline below expects 16 kHz, mono, Float32 samples.
+  let audioFormat = audioFile.processingFormat
+  assert(audioFormat.sampleRate == Double(sampleRate))
+  assert(audioFormat.channelCount == 1)
+  assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)
+
+  let sileroVadConfig = sherpaOnnxSileroVadModelConfig(
+    model: "./silero_vad.onnx"
+  )
+
+  var vadModelConfig = sherpaOnnxVadModelConfig(sileroVad: sileroVadConfig)
+  let vad = SherpaOnnxVoiceActivityDetectorWrapper(
+    config: &vadModelConfig, buffer_size_in_seconds: 120)
+
+  let audioFrameCount = UInt32(audioFile.length)
+  guard
+    let audioFileBuffer = AVAudioPCMBuffer(
+      pcmFormat: audioFormat, frameCapacity: audioFrameCount)
+  else {
+    print("Error allocating an audio buffer for \(audioFrameCount) frames")
+    return
+  }
+
+  do {
+    try audioFile.read(into: audioFileBuffer)
+  } catch {
+    print("Error reading \(filePath): \(error.localizedDescription)")
+    return
+  }
+  let samples = audioFileBuffer.array()
+
+  let windowSize = Int(vadModelConfig.silero_vad.window_size)
+
+  var segments: [SpeechSegment] = []
+
+  // Walk the audio window by window using an offset, so only the current
+  // window is copied (re-slicing the remaining audio on every iteration is
+  // quadratic in the file length). Every complete window is fed to the VAD,
+  // including a final one of exactly `windowSize` samples; a shorter tail is
+  // not fed.
+  var offset = 0
+  while offset + windowSize <= samples.count {
+    vad.acceptWaveform(samples: Array(samples[offset..<offset + windowSize]))
+    offset += windowSize
+
+    while !vad.isEmpty() {
+      let segment = vad.front()
+      vad.pop()
+
+      // Materialize the samples once and reuse them for decoding and for the
+      // duration computation.
+      let segmentSamples = segment.samples
+      let result = recognizer.decode(samples: segmentSamples)
+
+      let speech = SpeechSegment(
+        start: Float(segment.start) / Float(sampleRate),
+        duration: Float(segmentSamples.count) / Float(sampleRate),
+        text: result.text)
+      segments.append(speech)
+
+      print(speech)
+    }
+  }
+
+  // SRT entries are numbered from 1 and separated by a blank line.
+  let srt = segments.enumerated().map { index, segment in
+    "\(index + 1)\n\(segment)"
+  }.joined(separator: "\n\n")
+
+  let srtFilename = filePath.stringByDeletingPathExtension + ".srt"
+  do {
+    try srt.write(to: srtFilename.fileURL, atomically: true, encoding: .utf8)
+    // Report success only when the write actually succeeded.
+    print("Saved to \(srtFilename)")
+  } catch {
+    print("Error writing: \(error.localizedDescription)")
+  }
+}
+
+/// Program entry point; `main()` just calls `run()`.
+@main
+struct App {
+  static func main() { run() }
+}
--- a/swift-api-examples/run-generate-subtitles.sh
+++ b/swift-api-examples/run-generate-subtitles.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+set -ex
+
+if [ ! -d ../build-swift-macos ]; then
+  echo "Please run ../build-swift-macos.sh first!"
+  exit 1
+fi
+
+if [ ! -d ./sherpa-onnx-whisper-tiny.en ]; then
+  echo "Please download the pre-trained model for testing."
+  echo "You can refer to"
+  echo ""
+  echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html"
+  echo ""
+  echo "for help"
+  exit 1
+fi
+
+if [ ! -e ./generate-subtitles ]; then
+  # Note: We use -lc++ to link against libc++ instead of libstdc++
+  swiftc \
+    -lc++ \
+    -I ../build-swift-macos/install/include \
+    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
+    ./generate-subtitles.swift  ./SherpaOnnx.swift \
+    -L ../build-swift-macos/install/lib/ \
+    -l sherpa-onnx \
+    -l onnxruntime \
+    -o generate-subtitles
+else
+  echo "./generate-subtitles exists - skip building"
+fi
+
+export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
+./generate-subtitles