/*
This file shows how to use Swift API to generate subtitles.

You can use the files from
https://huggingface.co/csukuangfj/vad/tree/main
for testing.

For instance, to generate subtitles for Obama.mov, please first
use

  ffmpeg -i ./Obama.mov -acodec pcm_s16le -ac 1 -ar 16000 Obama.wav

to extract the audio part from the video.

This file supports only processing WAV sound files, so you have to first
extract audios from videos.

Please see
./run-generate-subtitles.sh
for usage.
*/

import AVFoundation

extension AudioBuffer {
  /// Copies the memory described by this buffer into a new `[Float]`.
  ///
  /// The element type is fixed to `Float` by the return type; the buffer is
  /// wrapped with `UnsafeBufferPointer(self)` and then copied by `Array`.
  func array() -> [Float] {
    let samples = UnsafeBufferPointer<Float>(self)
    return Array(samples)
  }
}

extension AVAudioPCMBuffer {
  /// Returns the samples held in `audioBufferList.pointee.mBuffers` as floats.
  ///
  /// NOTE(review): only `mBuffers` is read, so this presumably covers a single
  /// buffer of float samples — confirm before using it with other layouts.
  func array() -> [Float] {
    return audioBufferList.pointee.mBuffers.array()
  }
}

extension TimeInterval {
  /// The interval expressed in whole milliseconds, rounded to the nearest one.
  ///
  /// Every timestamp component is derived from this single integer. Taking
  /// each component from the floating-point value by truncation loses a
  /// millisecond whenever the value sits just below an exact one (for example
  /// `1.001` or `Double(Float(0.7))`), and can leave the fields inconsistent
  /// with each other.
  private var totalMilliseconds: Int {
    Int((self * 1000).rounded())
  }

  /// Timestamp text in the form `H:MM:SS,mmm`, as used for subtitle cues.
  var hourMinuteSecondMS: String {
    String(format: "%d:%02d:%02d,%03d", hour, minute, second, millisecond)
  }

  /// Whole hours contained in the interval (not wrapped).
  var hour: Int {
    totalMilliseconds / 3_600_000
  }

  /// Minutes past the hour, `0...59` for non-negative intervals.
  var minute: Int {
    (totalMilliseconds / 60_000) % 60
  }

  /// Seconds past the minute, `0...59` for non-negative intervals.
  var second: Int {
    (totalMilliseconds / 1000) % 60
  }

  /// Milliseconds past the second, `0...999` for non-negative intervals.
  var millisecond: Int {
    totalMilliseconds % 1000
  }
}

extension String {
  /// This string treated as a file-system path and wrapped in a file URL.
  var fileURL: URL {
    return URL(fileURLWithPath: self)
  }

  /// Path extension of `fileURL`.
  var pathExtension: String {
    return fileURL.pathExtension
  }

  /// Last path component of `fileURL`.
  var lastPathComponent: String {
    return fileURL.lastPathComponent
  }

  /// `path` of `fileURL` after its path extension has been removed.
  var stringByDeletingPathExtension: String {
    return fileURL.deletingPathExtension().path
  }
}

/// One recognized stretch of speech: its start/end times and its text.
class SpeechSegment: CustomStringConvertible {
  let start: Float
  let end: Float
  let text: String

  /// - Parameters:
  ///   - start: Start time of the segment.
  ///   - duration: Length of the segment; `end` is stored as `start + duration`.
  ///   - text: Text associated with the segment.
  init(start: Float, duration: Float, text: String) {
    self.start = start
    self.end = start + duration
    self.text = text
  }

  /// A `start --> end` timestamp line followed by the text on the next line.
  public var description: String {
    let from = TimeInterval(start).hourMinuteSecondMS
    let to = TimeInterval(end).hourMinuteSecondMS
    return from + " --> " + to + "\n" + text
  }
}

func run() {
  var recognizer: SherpaOnnxOfflineRecognizer
  var modelConfig: SherpaOnnxOfflineModelConfig
  var modelType = "whisper"
  // modelType = "paraformer"

  var
filePath = "/Users/fangjun/Desktop/Obama.wav" // English // filePath = "/Users/fangjun/Desktop/lei-jun.wav" // Chinese // please go to https://huggingface.co/csukuangfj/vad // to download the above two files if modelType == "whisper" { // for English let encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx" let decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx" let tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt" let whisperConfig = sherpaOnnxOfflineWhisperModelConfig( encoder: encoder, decoder: decoder ) modelConfig = sherpaOnnxOfflineModelConfig( tokens: tokens, whisper: whisperConfig, debug: 0, modelType: "whisper" ) } else if modelType == "paraformer" { // for Chinese let model = "./sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx" let tokens = "./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt" let paraformerConfig = sherpaOnnxOfflineParaformerModelConfig( model: model ) modelConfig = sherpaOnnxOfflineModelConfig( tokens: tokens, paraformer: paraformerConfig, debug: 0, modelType: "paraformer" ) } else { print("Please specify a supported modelType \(modelType)") return } let sampleRate = 16000 let featConfig = sherpaOnnxFeatureConfig( sampleRate: sampleRate, featureDim: 80 ) var config = sherpaOnnxOfflineRecognizerConfig( featConfig: featConfig, modelConfig: modelConfig ) recognizer = SherpaOnnxOfflineRecognizer(config: &config) let audioFile = try! 
AVAudioFile(forReading: filePath.fileURL) let audioFormat = audioFile.processingFormat assert(audioFormat.sampleRate == Double(sampleRate)) assert(audioFormat.channelCount == 1) assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32) let sileroVadConfig = sherpaOnnxSileroVadModelConfig( model: "./silero_vad.onnx" ) var vadModelConfig = sherpaOnnxVadModelConfig(sileroVad: sileroVadConfig) let vad = SherpaOnnxVoiceActivityDetectorWrapper( config: &vadModelConfig, buffer_size_in_seconds: 120) let audioFrameCount = UInt32(audioFile.length) let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount) try! audioFile.read(into: audioFileBuffer!) var array: [Float]! = audioFileBuffer?.array() let windowSize = Int(vadModelConfig.silero_vad.window_size) var segments: [SpeechSegment] = [] for offset in stride(from: 0, to: array.count, by: windowSize) { let end = min(offset + windowSize, array.count) vad.acceptWaveform(samples: [Float](array[offset..