Add Swift example for generating subtitles (#318)

2023-09-18 15:16:54 +08:00
parent 2d51ca49b7
commit 692a47dd80
9 changed files with 654 additions and 11 deletions
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -9,9 +9,11 @@
 #include <utility>
 #include <vector>
 #include "sherpa-onnx/csrc/circular-buffer.h"
 #include "sherpa-onnx/csrc/display.h"
 #include "sherpa-onnx/csrc/offline-recognizer.h"
 #include "sherpa-onnx/csrc/online-recognizer.h"
 #include "sherpa-onnx/csrc/voice-activity-detector.h"
 struct SherpaOnnxOnlineRecognizer {
  std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl;
@@ -127,7 +129,7 @@ void DecodeMultipleOnlineStreams(SherpaOnnxOnlineRecognizer *recognizer,
  recognizer->impl->DecodeStreams(ss.data(), n);
 }
-SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
+const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
    SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream) {
  sherpa_onnx::OnlineRecognizerResult result =
      recognizer->impl->GetResult(stream->impl.get());
@@ -340,7 +342,7 @@ void DecodeMultipleOfflineStreams(SherpaOnnxOfflineRecognizer *recognizer,
  recognizer->impl->DecodeStreams(ss.data(), n);
 }
-SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
+const SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
    SherpaOnnxOfflineStream *stream) {
  const sherpa_onnx::OfflineRecognitionResult &result =
      stream->impl->GetResult();
@@ -372,3 +374,128 @@ void DestroyOfflineRecognizerResult(
  delete[] r->timestamps;
  delete r;
 }
 // ============================================================
 // For VAD
 // ============================================================
 //
 // Handle type behind the SherpaOnnxCircularBuffer pointers of the C API.
 // It owns the wrapped sherpa_onnx::CircularBuffer through a unique_ptr, so
 // deleting the handle also destroys the buffer.
 struct SherpaOnnxCircularBuffer {
  std::unique_ptr<sherpa_onnx::CircularBuffer> impl;
 };
 // Allocate a handle wrapping a sherpa_onnx::CircularBuffer constructed with
 // `capacity`. The caller releases it with SherpaOnnxDestroyCircularBuffer().
 SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(int32_t capacity) {
  auto *handle = new SherpaOnnxCircularBuffer;
  handle->impl.reset(new sherpa_onnx::CircularBuffer(capacity));
  return handle;
 }
 // Delete a handle returned by SherpaOnnxCreateCircularBuffer(); the wrapped
 // buffer is destroyed together with the handle.
 void SherpaOnnxDestroyCircularBuffer(SherpaOnnxCircularBuffer *buffer) {
  delete buffer;
 }
 // Forward the n floats starting at p to CircularBuffer::Push().
 void SherpaOnnxCircularBufferPush(SherpaOnnxCircularBuffer *buffer,
                                  const float *p, int32_t n) {
  buffer->impl->Push(p, n);
 }
 // Return a newly allocated array of n floats filled from
 // buffer->impl->Get(start_index, n). The caller must release it with
 // SherpaOnnxCircularBufferFree().
 //
 // The array is value-initialised (all zeros) and at most n values are
 // copied, so the caller never reads indeterminate memory and the copy can
 // never run past the allocation, even if Get() yields a different number of
 // samples than requested. Returns nullptr when n is negative.
 const float *SherpaOnnxCircularBufferGet(SherpaOnnxCircularBuffer *buffer,
                                         int32_t start_index, int32_t n) {
  if (n < 0) {
    return nullptr;
  }
  std::vector<float> v = buffer->impl->Get(start_index, n);
  float *p = new float[n]();  // () => zero-filled
  std::copy_n(v.begin(), std::min(v.size(), static_cast<size_t>(n)), p);
  return p;
 }
 // Release an array returned by SherpaOnnxCircularBufferGet() (allocated
 // there with new[]).
 void SherpaOnnxCircularBufferFree(const float *p) { delete[] p; }
 // Forward to CircularBuffer::Pop(n).
 void SherpaOnnxCircularBufferPop(SherpaOnnxCircularBuffer *buffer, int32_t n) {
  buffer->impl->Pop(n);
 }
 // Return the value of CircularBuffer::Size().
 int32_t SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer) {
  return buffer->impl->Size();
 }
 // Forward to CircularBuffer::Reset().
 void SherpaOnnxCircularBufferReset(SherpaOnnxCircularBuffer *buffer) {
  buffer->impl->Reset();
 }
 // Handle type behind the SherpaOnnxVoiceActivityDetector pointers of the C
 // API. It owns the wrapped sherpa_onnx::VoiceActivityDetector.
 struct SherpaOnnxVoiceActivityDetector {
  std::unique_ptr<sherpa_onnx::VoiceActivityDetector> impl;
 };
 // Translate the C configuration into a sherpa_onnx::VadModelConfig and
 // create a detector from it and buffer_size_in_seconds.
 //
 // Every field goes through SHERPA_ONNX_OR with the fallback shown below.
 // When the resulting debug flag is set, the effective configuration is
 // printed to stderr. The caller releases the returned handle with
 // SherpaOnnxDestroyVoiceActivityDetector().
 SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector(
    const SherpaOnnxVadModelConfig *config, float buffer_size_in_seconds) {
  sherpa_onnx::VadModelConfig vad_config;
  // Options shared by all VAD models.
  vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000);
  vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1);
  vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu");
  vad_config.debug = SHERPA_ONNX_OR(config->debug, false);
  // Options specific to silero VAD.
  auto &silero = vad_config.silero_vad;
  silero.model = SHERPA_ONNX_OR(config->silero_vad.model, "");
  silero.threshold = SHERPA_ONNX_OR(config->silero_vad.threshold, 0.5);
  silero.min_silence_duration =
      SHERPA_ONNX_OR(config->silero_vad.min_silence_duration, 0.5);
  silero.min_speech_duration =
      SHERPA_ONNX_OR(config->silero_vad.min_speech_duration, 0.25);
  silero.window_size = SHERPA_ONNX_OR(config->silero_vad.window_size, 512);
  if (vad_config.debug) {
    fprintf(stderr, "%s\n", vad_config.ToString().c_str());
  }
  auto *detector = new SherpaOnnxVoiceActivityDetector;
  detector->impl.reset(new sherpa_onnx::VoiceActivityDetector(
      vad_config, buffer_size_in_seconds));
  return detector;
 }
 // Delete a handle returned by SherpaOnnxCreateVoiceActivityDetector(); the
 // wrapped detector is destroyed together with the handle.
 void SherpaOnnxDestroyVoiceActivityDetector(
    SherpaOnnxVoiceActivityDetector *p) {
  delete p;
 }
 // Forward the n floats starting at samples to
 // VoiceActivityDetector::AcceptWaveform().
 void SherpaOnnxVoiceActivityDetectorAcceptWaveform(
    SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n) {
  p->impl->AcceptWaveform(samples, n);
 }
 // Return the result of VoiceActivityDetector::Empty() converted to int32_t.
 int32_t SherpaOnnxVoiceActivityDetectorEmpty(
    SherpaOnnxVoiceActivityDetector *p) {
  return p->impl->Empty();
 }
 // Forward to VoiceActivityDetector::Pop().
 SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop(
    SherpaOnnxVoiceActivityDetector *p) {
  p->impl->Pop();
 }
 // Return a heap-allocated deep copy of the segment that
 // VoiceActivityDetector::Front() refers to. The copy stays valid after the
 // detector is popped or reset; the caller releases it with
 // SherpaOnnxDestroySpeechSegment().
 SHERPA_ONNX_API const SherpaOnnxSpeechSegment *
 SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p) {
  const sherpa_onnx::SpeechSegment &front = p->impl->Front();
  const auto &src = front.samples;
  auto *copy = new SherpaOnnxSpeechSegment;
  copy->start = front.start;
  copy->n = src.size();
  copy->samples = new float[src.size()];
  std::copy(src.begin(), src.end(), copy->samples);
  return copy;
 }
 // Release a segment returned by SherpaOnnxVoiceActivityDetectorFront():
 // first the sample array (allocated with new[]), then the struct itself.
 // Passing a null pointer is a no-op, mirroring free()/delete.
 void SherpaOnnxDestroySpeechSegment(const SherpaOnnxSpeechSegment *p) {
  if (p == nullptr) {
    return;
  }
  delete[] p->samples;
  delete p;
 }
 // Forward to VoiceActivityDetector::Reset().
 void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) {
  p->impl->Reset();
 }
--- a/sherpa-onnx/c-api/c-api.h
+++ b/sherpa-onnx/c-api/c-api.h
@@ -234,7 +234,7 @@ SHERPA_ONNX_API void DecodeMultipleOnlineStreams(
 /// @return A pointer containing the result. The user has to invoke
 ///         DestroyOnlineRecognizerResult() to free the returned pointer to
 ///         avoid memory leak.
-SHERPA_ONNX_API SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
+SHERPA_ONNX_API const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
    SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream);
 /// Destroy the pointer returned by GetOnlineStreamResult().
@@ -429,7 +429,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerResult {
 /// @return Return a pointer to the result. The user has to invoke
 ///         DestroyOfflineRecognizerResult() to free the returned pointer to
 ///         avoid memory leak.
-SHERPA_ONNX_API SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
+SHERPA_ONNX_API const SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
    SherpaOnnxOfflineStream *stream);
 /// Destroy the pointer returned by GetOfflineStreamResult().
@@ -438,6 +438,127 @@ SHERPA_ONNX_API SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
 SHERPA_ONNX_API void DestroyOfflineRecognizerResult(
    const SherpaOnnxOfflineRecognizerResult *r);
 // ============================================================
 // For VAD
 // ============================================================
 // Options for the silero VAD model, consumed by
 // SherpaOnnxCreateVoiceActivityDetector().
 SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
  // Path to the silero VAD model
  const char *model;
  // threshold to classify a segment as speech
  //
  // If the predicted probability of a segment is larger than this
  // value, then it is classified as speech.
  float threshold;
  // in seconds
  float min_silence_duration;
  // in seconds
  float min_speech_duration;
  // Window size in samples. Declared with a fixed-width type to match the
  // other integer fields of this API and the C++ SileroVadModelConfig.
  int32_t window_size;
 } SherpaOnnxSileroVadModelConfig;
 // Configuration passed to SherpaOnnxCreateVoiceActivityDetector().
 SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {
  // Options specific to the silero VAD model
  SherpaOnnxSileroVadModelConfig silero_vad;
  // Sample rate of the input audio (presumably in Hz -- TODO confirm)
  int32_t sample_rate;
  // Number of threads (presumably for model inference -- TODO confirm)
  int32_t num_threads;
  // Name of the execution provider, e.g. "cpu"
  const char *provider;
  // If set, SherpaOnnxCreateVoiceActivityDetector() prints the effective
  // configuration to stderr.
  int32_t debug;
 } SherpaOnnxVadModelConfig;
 SHERPA_ONNX_API typedef struct SherpaOnnxCircularBuffer
    SherpaOnnxCircularBuffer;
 // Return an instance of circular buffer. The user has to use
 // SherpaOnnxDestroyCircularBuffer() to free the returned pointer to avoid
 // memory leak.
 SHERPA_ONNX_API SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(
    int32_t capacity);
 // Free the pointer returned by SherpaOnnxCreateCircularBuffer()
 SHERPA_ONNX_API void SherpaOnnxDestroyCircularBuffer(
    SherpaOnnxCircularBuffer *buffer);
 SHERPA_ONNX_API void SherpaOnnxCircularBufferPush(
    SherpaOnnxCircularBuffer *buffer, const float *p, int32_t n);
 // Return n samples starting at the given index.
 //
 // Return a pointer to an array containing n samples starting at start_index.
 // The user has to use SherpaOnnxCircularBufferFree() to free the returned
 // pointer to avoid memory leak.
 SHERPA_ONNX_API const float *SherpaOnnxCircularBufferGet(
    SherpaOnnxCircularBuffer *buffer, int32_t start_index, int32_t n);
 // Free the pointer returned by SherpaOnnxCircularBufferGet().
 SHERPA_ONNX_API void SherpaOnnxCircularBufferFree(const float *p);
 // Remove n elements from the buffer
 SHERPA_ONNX_API void SherpaOnnxCircularBufferPop(
    SherpaOnnxCircularBuffer *buffer, int32_t n);
 // Return number of elements in the buffer.
 SHERPA_ONNX_API int32_t
 SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer);
 // Clear all elements in the buffer
 SHERPA_ONNX_API void SherpaOnnxCircularBufferReset(
    SherpaOnnxCircularBuffer *buffer);
 // A speech segment as returned by SherpaOnnxVoiceActivityDetectorFront().
 // Both the struct and its sample array are released by
 // SherpaOnnxDestroySpeechSegment().
 SHERPA_ONNX_API typedef struct SherpaOnnxSpeechSegment {
  // The start index in samples of this segment
  int32_t start;
  // pointer to the array containing the samples
  float *samples;
  // number of samples in this segment
  int32_t n;
 } SherpaOnnxSpeechSegment;
 typedef struct SherpaOnnxVoiceActivityDetector SherpaOnnxVoiceActivityDetector;
 // Return an instance of VoiceActivityDetector.
 // The user has to use SherpaOnnxDestroyVoiceActivityDetector() to free
 // the returned pointer to avoid memory leak.
 SHERPA_ONNX_API SherpaOnnxVoiceActivityDetector *
 SherpaOnnxCreateVoiceActivityDetector(const SherpaOnnxVadModelConfig *config,
                                      float buffer_size_in_seconds);
 SHERPA_ONNX_API void SherpaOnnxDestroyVoiceActivityDetector(
    SherpaOnnxVoiceActivityDetector *p);
 SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorAcceptWaveform(
    SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n);
 // Return 1 if there are no speech segments available.
 // Return 0 if there are speech segments.
 SHERPA_ONNX_API int32_t
 SherpaOnnxVoiceActivityDetectorEmpty(SherpaOnnxVoiceActivityDetector *p);
 // Remove the first speech segment.
 // It throws if SherpaOnnxVoiceActivityDetectorEmpty() returns 1.
 SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop(
    SherpaOnnxVoiceActivityDetector *p);
 // Return the first speech segment.
 // The user has to use SherpaOnnxDestroySpeechSegment() to free the returned
 // pointer to avoid memory leak.
 SHERPA_ONNX_API const SherpaOnnxSpeechSegment *
 SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p);
 // Free the pointer returned by SherpaOnnxVoiceActivityDetectorFront().
 SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment(
    const SherpaOnnxSpeechSegment *p);
 // Re-initialize the voice activity detector.
 SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset(
    SherpaOnnxVoiceActivityDetector *p);
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
--- a/sherpa-onnx/csrc/hypothesis.cc
+++ b/sherpa-onnx/csrc/hypothesis.cc
@@ -18,7 +18,7 @@ void Hypotheses::Add(Hypothesis hyp) {
  } else {
    it->second.log_prob = LogAdd<double>()(it->second.log_prob, hyp.log_prob);
-    if (it->second.lm_log_prob != 0 && hyp.lm_log_prob != 0){
+    if (it->second.lm_log_prob != 0 && hyp.lm_log_prob != 0) {
      it->second.lm_log_prob =
          LogAdd<double>()(it->second.lm_log_prob, hyp.lm_log_prob);
    }
--- a/sherpa-onnx/csrc/silero-vad-model-config.h
+++ b/sherpa-onnx/csrc/silero-vad-model-config.h
@@ -15,7 +15,7 @@ struct SileroVadModelConfig {
  // threshold to classify a segment as speech
  //
-  // The predicted probability of a segment is larger than this
+  // If the predicted probability of a segment is larger than this
  // value, then it is classified as speech.
  float threshold = 0.5;
@@ -25,7 +25,7 @@ struct SileroVadModelConfig {
  // 512, 1024, 1536 samples for 16000 Hz
  // 256, 512, 768 samples for 800 Hz
-  int window_size = 512;  // in samples
+  int32_t window_size = 512;  // in samples
  SileroVadModelConfig() = default;
--- a/swift-api-examples/.gitignore
+++ b/swift-api-examples/.gitignore
@@ -1,2 +1,3 @@
 decode-file
 decode-file-non-streaming
 generate-subtitles
--- a/swift-api-examples/SherpaOnnx.swift
+++ b/swift-api-examples/SherpaOnnx.swift
@@ -215,7 +215,7 @@ class SherpaOnnxRecognizer {
  /// Get the decoding results so far
  func getResult() -> SherpaOnnxOnlineRecongitionResult {
-    let result: UnsafeMutablePointer<SherpaOnnxOnlineRecognizerResult>? = GetOnlineStreamResult(
+    let result: UnsafePointer<SherpaOnnxOnlineRecognizerResult>? = GetOnlineStreamResult(
      recognizer, stream)
    return SherpaOnnxOnlineRecongitionResult(result: result)
  }
@@ -406,7 +406,7 @@ class SherpaOnnxOfflineRecognizer {
    DecodeOfflineStream(recognizer, stream)
-    let result: UnsafeMutablePointer<SherpaOnnxOfflineRecognizerResult>? = GetOfflineStreamResult(
+    let result: UnsafePointer<SherpaOnnxOfflineRecognizerResult>? = GetOfflineStreamResult(
      stream)
    DestroyOfflineStream(stream)
@@ -414,3 +414,145 @@ class SherpaOnnxOfflineRecognizer {
    return SherpaOnnxOfflineRecongitionResult(result: result)
  }
 }
 /// Build the C `SherpaOnnxSileroVadModelConfig` struct from Swift values.
 ///
 /// `model` is converted with `toCPointer`; `windowSize` is narrowed to Int32.
 ///
 /// NOTE(review): the defaults for `minSilenceDuration` (0.25) and
 /// `minSpeechDuration` (0.5) are the reverse of the fallbacks used by
 /// SherpaOnnxCreateVoiceActivityDetector() in c-api.cc (0.5 and 0.25).
 /// Confirm which pair is intended.
 func sherpaOnnxSileroVadModelConfig(
  model: String,
  threshold: Float = 0.5,
  minSilenceDuration: Float = 0.25,
  minSpeechDuration: Float = 0.5,
  windowSize: Int = 512
 ) -> SherpaOnnxSileroVadModelConfig {
  let modelPath = toCPointer(model)
  let window = Int32(windowSize)
  return SherpaOnnxSileroVadModelConfig(
    model: modelPath,
    threshold: threshold,
    min_silence_duration: minSilenceDuration,
    min_speech_duration: minSpeechDuration,
    window_size: window
  )
 }
 /// Build the C `SherpaOnnxVadModelConfig` struct from Swift values.
 ///
 /// `provider` is converted with `toCPointer`; `numThreads` and `debug` are
 /// narrowed to Int32.
 func sherpaOnnxVadModelConfig(
  sileroVad: SherpaOnnxSileroVadModelConfig,
  sampleRate: Int32 = 16000,
  numThreads: Int = 1,
  provider: String = "cpu",
  debug: Int = 0
 ) -> SherpaOnnxVadModelConfig {
  let threads = Int32(numThreads)
  let providerName = toCPointer(provider)
  let debugFlag = Int32(debug)
  return SherpaOnnxVadModelConfig(
    silero_vad: sileroVad,
    sample_rate: sampleRate,
    num_threads: threads,
    provider: providerName,
    debug: debugFlag
  )
 }
 /// Swift owner of a C `SherpaOnnxCircularBuffer`.
 ///
 /// The underlying buffer is created in `init` and destroyed in `deinit`.
 class SherpaOnnxCircularBufferWrapper {
  /// Pointer to the underlying C object.
  let buffer: OpaquePointer!
  /// Create a buffer with the given capacity.
  init(capacity: Int) {
    buffer = SherpaOnnxCreateCircularBuffer(Int32(capacity))
  }
  deinit {
    if let buffer {
      SherpaOnnxDestroyCircularBuffer(buffer)
    }
  }
  /// Append `samples` to the buffer.
  func push(samples: [Float]) {
    SherpaOnnxCircularBufferPush(buffer, samples, Int32(samples.count))
  }
  /// Return a copy of `n` samples starting at `startIndex`.
  ///
  /// Returns an empty array when `n` is not positive or when the C API
  /// returns a null pointer, instead of trapping.
  func get(startIndex: Int, n: Int) -> [Float] {
    guard n > 0 else { return [] }
    guard let p = SherpaOnnxCircularBufferGet(buffer, Int32(startIndex), Int32(n)) else {
      return []
    }
    // The C side hands over ownership of the array; free it on every path.
    defer { SherpaOnnxCircularBufferFree(p) }
    return Array(UnsafeBufferPointer(start: p, count: n))
  }
  /// Remove `n` samples from the buffer.
  func pop(n: Int) {
    SherpaOnnxCircularBufferPop(buffer, Int32(n))
  }
  /// Number of samples currently in the buffer.
  func size() -> Int {
    return Int(SherpaOnnxCircularBufferSize(buffer))
  }
  /// Clear the buffer.
  func reset() {
    SherpaOnnxCircularBufferReset(buffer)
  }
 }
 /// Swift owner of a C `SherpaOnnxSpeechSegment`.
 ///
 /// Takes ownership of the pointer passed to `init` and frees it with
 /// `SherpaOnnxDestroySpeechSegment` in `deinit`.
 class SherpaOnnxSpeechSegmentWrapper {
  /// Pointer to the underlying C struct.
  let p: UnsafePointer<SherpaOnnxSpeechSegment>!
  init(p: UnsafePointer<SherpaOnnxSpeechSegment>!) {
    self.p = p
  }
  deinit {
    if let p {
      SherpaOnnxDestroySpeechSegment(p)
    }
  }
  /// Start index of the segment, in samples.
  var start: Int {
    return Int(p.pointee.start)
  }
  /// Number of samples in the segment.
  var n: Int {
    return Int(p.pointee.n)
  }
  /// A copy of the segment's samples.
  ///
  /// A new array is built on every access, so callers that need it more than
  /// once should keep the result in a local. Returns an empty array when the
  /// count is not positive or the sample pointer is null.
  var samples: [Float] {
    let count = n
    guard count > 0, let base = p.pointee.samples else { return [] }
    return Array(UnsafeMutableBufferPointer(start: base, count: count))
  }
 }
 /// Swift owner of a C `SherpaOnnxVoiceActivityDetector`.
 ///
 /// The detector is created in `init` and destroyed in `deinit`.
 class SherpaOnnxVoiceActivityDetectorWrapper {
  /// A pointer to the underlying counterpart in C
  let vad: OpaquePointer!
  init(config: UnsafePointer<SherpaOnnxVadModelConfig>!, buffer_size_in_seconds: Float) {
    vad = SherpaOnnxCreateVoiceActivityDetector(config, buffer_size_in_seconds)
  }
  deinit {
    if let vad {
      SherpaOnnxDestroyVoiceActivityDetector(vad)
    }
  }
  /// Feed audio samples to the detector.
  func acceptWaveform(samples: [Float]) {
    let count = Int32(samples.count)
    SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, samples, count)
  }
  /// True when the C API reports 1, i.e. no speech segment is available.
  func isEmpty() -> Bool {
    let flag = SherpaOnnxVoiceActivityDetectorEmpty(vad)
    return flag == 1
  }
  /// Remove the first speech segment.
  func pop() {
    SherpaOnnxVoiceActivityDetectorPop(vad)
  }
  /// Return a wrapper owning a copy of the first speech segment.
  func front() -> SherpaOnnxSpeechSegmentWrapper {
    return SherpaOnnxSpeechSegmentWrapper(p: SherpaOnnxVoiceActivityDetectorFront(vad))
  }
  /// Re-initialize the detector.
  func reset() {
    SherpaOnnxVoiceActivityDetectorReset(vad)
  }
 }
--- a/swift-api-examples/decode-file-non-streaming.swift
+++ b/swift-api-examples/decode-file-non-streaming.swift
@@ -13,7 +13,6 @@ extension AVAudioPCMBuffer {
 }
 func run() {
  var recognizer: SherpaOnnxOfflineRecognizer
  var modelConfig: SherpaOnnxOfflineModelConfig
  var modelType = "whisper"
--- a/swift-api-examples/generate-subtitles.swift
+++ b/swift-api-examples/generate-subtitles.swift
@@ -0,0 +1,217 @@
 /*
 This file shows how to use Swift API to generate subtitles.
 You can use the files from
 https://huggingface.co/csukuangfj/vad/tree/main
 for testing.
 For instance, to generate subtitles for Obama.mov, please first
 use
 ffmpeg -i ./Obama.mov -acodec pcm_s16le -ac 1 -ar 16000 Obama.wav
 to extract the audio part from the video.
 This file supports only processing WAV sound files, so you have to first
 extract audios from videos.
 Please see
 ./run-generate-subtitles.sh
 for usages.
 */
 import AVFoundation
 extension AudioBuffer {
  /// Copy the contents of this buffer into a Swift array, viewing the data
  /// as `Float` values.
  func array() -> [Float] {
    return Array(UnsafeBufferPointer(self))
  }
 }
 extension AVAudioPCMBuffer {
  /// Copy the samples held in `audioBufferList.pointee.mBuffers` into a
  /// Swift array. run() asserts a single channel before using this.
  func array() -> [Float] {
    return self.audioBufferList.pointee.mBuffers.array()
  }
 }
 /// Helpers for rendering a duration in seconds as an SRT timestamp.
 extension TimeInterval {
  /// The interval formatted as `HH:MM:SS,mmm`, the timestamp layout used by
  /// SubRip (.srt) files. Hours are zero-padded to at least two digits.
  var hourMinuteSecondMS: String {
    String(format: "%02d:%02d:%02d,%03d", hour, minute, second, millisecond)
  }
  /// Whole hours contained in the interval (not wrapped).
  var hour: Int {
    Int(self / 3600)
  }
  /// Minutes component, 0...59.
  var minute: Int {
    Int((self / 60).truncatingRemainder(dividingBy: 60))
  }
  /// Seconds component, 0...59.
  var second: Int {
    Int(truncatingRemainder(dividingBy: 60))
  }
  /// Milliseconds component, 0...999 (truncated, not rounded).
  var millisecond: Int {
    Int((self * 1000).truncatingRemainder(dividingBy: 1000))
  }
 }
 /// Path helpers that treat the string as a file-system path.
 extension String {
  /// The string as a file URL.
  var fileURL: URL {
    URL(fileURLWithPath: self)
  }
  /// Extension of the path, without the dot.
  var pathExtension: String {
    let url = fileURL
    return url.pathExtension
  }
  /// Last component of the path.
  var lastPathComponent: String {
    let url = fileURL
    return url.lastPathComponent
  }
  /// The path with its extension removed.
  var stringByDeletingPathExtension: String {
    let stripped = fileURL.deletingPathExtension()
    return stripped.path
  }
 }
 /// One subtitle entry: a time range in seconds and the recognized text.
 class SpeechSegment: CustomStringConvertible {
  let start: Float
  let end: Float
  let text: String
  /// - Parameters:
  ///   - start: start time in seconds
  ///   - duration: length in seconds; `end` is stored as `start + duration`
  ///   - text: text of this segment
  init(start: Float, duration: Float, text: String) {
    self.start = start
    self.end = start + duration
    self.text = text
  }
  /// "<start> --> <end>" on the first line, the text on the second.
  public var description: String {
    let from = TimeInterval(self.start).hourMinuteSecondMS
    let to = TimeInterval(self.end).hourMinuteSecondMS
    return "\(from) --> \(to)\n\(self.text)"
  }
 }
 /// Generate an SRT subtitle file for a WAV file.
 ///
 /// Steps:
 ///  1. create an offline recognizer for the selected `modelType`,
 ///  2. read the whole WAV file into memory,
 ///  3. feed it to the VAD one window at a time,
 ///  4. decode every speech segment the VAD produces,
 ///  5. write the segments next to the input file as `<name>.srt`.
 ///
 /// Samples left over after the last full window are not fed to the VAD.
 func run() {
  let modelConfig: SherpaOnnxOfflineModelConfig
  var modelType = "whisper"
  // modelType = "paraformer"
  var filePath = "/Users/fangjun/Desktop/Obama.wav"  // English
  // filePath = "/Users/fangjun/Desktop/lei-jun.wav"  // Chinese
  // please go to https://huggingface.co/csukuangfj/vad
  // to download the above two files
  if modelType == "whisper" {
    // for English
    let encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx"
    let decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx"
    let tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt"
    let whisperConfig = sherpaOnnxOfflineWhisperModelConfig(
      encoder: encoder,
      decoder: decoder
    )
    modelConfig = sherpaOnnxOfflineModelConfig(
      tokens: tokens,
      whisper: whisperConfig,
      debug: 0,
      modelType: "whisper"
    )
  } else if modelType == "paraformer" {
    // for Chinese
    let model = "./sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx"
    let tokens = "./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt"
    let paraformerConfig = sherpaOnnxOfflineParaformerModelConfig(
      model: model
    )
    modelConfig = sherpaOnnxOfflineModelConfig(
      tokens: tokens,
      paraformer: paraformerConfig,
      debug: 0,
      modelType: "paraformer"
    )
  } else {
    print("Please specify a supported modelType \(modelType)")
    return
  }
  let sampleRate = 16000
  let featConfig = sherpaOnnxFeatureConfig(
    sampleRate: sampleRate,
    featureDim: 80
  )
  var config = sherpaOnnxOfflineRecognizerConfig(
    featConfig: featConfig,
    modelConfig: modelConfig
  )
  let recognizer = SherpaOnnxOfflineRecognizer(config: &config)
  let audioFile = try! AVAudioFile(forReading: filePath.fileURL)
  let audioFormat = audioFile.processingFormat
  assert(audioFormat.sampleRate == Double(sampleRate))
  assert(audioFormat.channelCount == 1)
  assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)
  let sileroVadConfig = sherpaOnnxSileroVadModelConfig(
    model: "./silero_vad.onnx"
  )
  var vadModelConfig = sherpaOnnxVadModelConfig(sileroVad: sileroVadConfig)
  let vad = SherpaOnnxVoiceActivityDetectorWrapper(
    config: &vadModelConfig, buffer_size_in_seconds: 120)
  let audioFrameCount = UInt32(audioFile.length)
  guard
    let audioFileBuffer = AVAudioPCMBuffer(
      pcmFormat: audioFormat, frameCapacity: audioFrameCount)
  else {
    print("Failed to allocate an audio buffer for \(filePath)")
    return
  }
  try! audioFile.read(into: audioFileBuffer)
  let samples = audioFileBuffer.array()
  let windowSize = Int(vadModelConfig.silero_vad.window_size)
  guard windowSize > 0 else {
    // A non-positive window would never advance the loop below.
    print("Invalid VAD window size: \(windowSize)")
    return
  }
  var segments: [SpeechSegment] = []
  // Walk over the samples with an offset instead of re-copying the whole
  // remaining tail after every window.
  var offset = 0
  while offset + windowSize <= samples.count {
    vad.acceptWaveform(samples: [Float](samples[offset..<offset + windowSize]))
    offset += windowSize
    while !vad.isEmpty() {
      let s = vad.front()
      vad.pop()
      // `samples` copies the segment on every access, so fetch it once.
      let segmentSamples = s.samples
      let result = recognizer.decode(samples: segmentSamples)
      segments.append(
        SpeechSegment(
          start: Float(s.start) / Float(sampleRate),
          duration: Float(segmentSamples.count) / Float(sampleRate),
          text: result.text))
      print(segments.last!)
    }
  }
  // SRT entries are numbered from 1 and separated by a blank line.
  let srt = segments.enumerated().map { (index, element) in
    return "\(index+1)\n\(element)"
  }.joined(separator: "\n\n")
  let srtFilename = filePath.stringByDeletingPathExtension + ".srt"
  do {
    try srt.write(to: srtFilename.fileURL, atomically: true, encoding: .utf8)
    // Report success only when the file was actually written.
    print("Saved to \(srtFilename)")
  } catch {
    print("Error writing: \(error.localizedDescription)")
  }
 }
// Program entry point: runs the subtitle generation once and exits.
@main
 struct App {
  static func main() {
    run()
  }
 }
--- a/swift-api-examples/run-generate-subtitles.sh
+++ b/swift-api-examples/run-generate-subtitles.sh
@@ -0,0 +1,36 @@
 #!/usr/bin/env bash
 # Build (if necessary) and run the Swift subtitle-generation example.
 #
 # Requirements, all checked below:
 #  - ../build-swift-macos       produced by ../build-swift-macos.sh
 #  - ./sherpa-onnx-whisper-tiny.en   the pre-trained whisper model
 #  - ./silero_vad.onnx          the VAD model loaded by generate-subtitles
 set -ex
 if [ ! -d ../build-swift-macos ]; then
  echo "Please run ../build-swift-macos.sh first!"
  exit 1
 fi
 if [ ! -d ./sherpa-onnx-whisper-tiny.en ]; then
  echo "Please download the pre-trained model for testing."
  echo "You can refer to"
  echo ""
  echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html"
  echo ""
  echo "for help"
  exit 1
 fi
 # generate-subtitles.swift loads the VAD model from ./silero_vad.onnx, so
 # fail early with a clear message instead of at model-loading time.
 if [ ! -f ./silero_vad.onnx ]; then
  echo "Please download silero_vad.onnx and place it in this directory."
  echo "You can refer to"
  echo ""
  echo "https://github.com/snakers4/silero-vad"
  echo ""
  echo "for help"
  exit 1
 fi
 if [ ! -e ./generate-subtitles ]; then
  # Note: We use -lc++ to link against libc++ instead of libstdc++
  swiftc \
    -lc++ \
    -I ../build-swift-macos/install/include \
    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
    ./generate-subtitles.swift  ./SherpaOnnx.swift \
    -L ../build-swift-macos/install/lib/ \
    -l sherpa-onnx \
    -l onnxruntime \
    -o generate-subtitles
 else
  echo "./generate-subtitles exists - skip building"
 fi
 # Quoted so that a working directory containing spaces does not break the
 # assignment.
 export DYLD_LIBRARY_PATH="$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH"
 ./generate-subtitles