Add Swift example for generating subtitles (#318)
This commit is contained in:
1
swift-api-examples/.gitignore
vendored
1
swift-api-examples/.gitignore
vendored
@@ -1,2 +1,3 @@
|
||||
decode-file
|
||||
decode-file-non-streaming
|
||||
generate-subtitles
|
||||
|
||||
@@ -215,7 +215,7 @@ class SherpaOnnxRecognizer {
|
||||
|
||||
/// Get the decoding results so far
///
/// The value returned by `GetOnlineStreamResult` is bound as an immutable
/// `UnsafePointer` (the form introduced by this change); the superseded
/// `UnsafeMutablePointer` declaration is dropped so that `result` is
/// declared exactly once.
///
/// - Returns: A `SherpaOnnxOnlineRecongitionResult` wrapping the pointer
///   obtained for the current `recognizer` / `stream` pair.
func getResult() -> SherpaOnnxOnlineRecongitionResult {
  let result: UnsafePointer<SherpaOnnxOnlineRecognizerResult>? = GetOnlineStreamResult(
    recognizer, stream)
  return SherpaOnnxOnlineRecongitionResult(result: result)
}
|
||||
@@ -406,7 +406,7 @@ class SherpaOnnxOfflineRecognizer {
|
||||
|
||||
DecodeOfflineStream(recognizer, stream)
|
||||
|
||||
let result: UnsafeMutablePointer<SherpaOnnxOfflineRecognizerResult>? = GetOfflineStreamResult(
|
||||
let result: UnsafePointer<SherpaOnnxOfflineRecognizerResult>? = GetOfflineStreamResult(
|
||||
stream)
|
||||
|
||||
DestroyOfflineStream(stream)
|
||||
@@ -414,3 +414,145 @@ class SherpaOnnxOfflineRecognizer {
|
||||
return SherpaOnnxOfflineRecongitionResult(result: result)
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates the C struct `SherpaOnnxSileroVadModelConfig` from Swift values.
///
/// - Parameters:
///   - model: Path of the model file; converted with `toCPointer`.
///   - threshold: Stored in the struct's `threshold` field.
///   - minSilenceDuration: Stored in `min_silence_duration`
///     (presumably seconds - confirm against the C API).
///   - minSpeechDuration: Stored in `min_speech_duration`
///     (presumably seconds - confirm against the C API).
///   - windowSize: Stored in `window_size` after narrowing to `Int32`.
/// - Returns: The populated configuration struct.
func sherpaOnnxSileroVadModelConfig(
  model: String,
  threshold: Float = 0.5,
  minSilenceDuration: Float = 0.25,
  minSpeechDuration: Float = 0.5,
  windowSize: Int = 512
) -> SherpaOnnxSileroVadModelConfig {
  let modelPath = toCPointer(model)
  let cWindowSize = Int32(windowSize)

  return SherpaOnnxSileroVadModelConfig(
    model: modelPath,
    threshold: threshold,
    min_silence_duration: minSilenceDuration,
    min_speech_duration: minSpeechDuration,
    window_size: cWindowSize)
}
|
||||
|
||||
/// Creates the C struct `SherpaOnnxVadModelConfig` from Swift values.
///
/// - Parameters:
///   - sileroVad: Nested Silero VAD configuration, stored as `silero_vad`.
///   - sampleRate: Stored unchanged in `sample_rate`.
///   - numThreads: Stored in `num_threads` after narrowing to `Int32`.
///   - provider: Provider name; converted with `toCPointer`.
///   - debug: Stored in `debug` after narrowing to `Int32`.
/// - Returns: The populated configuration struct.
func sherpaOnnxVadModelConfig(
  sileroVad: SherpaOnnxSileroVadModelConfig,
  sampleRate: Int32 = 16000,
  numThreads: Int = 1,
  provider: String = "cpu",
  debug: Int = 0
) -> SherpaOnnxVadModelConfig {
  let cNumThreads = Int32(numThreads)
  let cProvider = toCPointer(provider)
  let cDebug = Int32(debug)

  return SherpaOnnxVadModelConfig(
    silero_vad: sileroVad,
    sample_rate: sampleRate,
    num_threads: cNumThreads,
    provider: cProvider,
    debug: cDebug)
}
|
||||
|
||||
/// Swift owner of a circular buffer created through the sherpa-onnx C API.
///
/// The buffer is created in `init` with `SherpaOnnxCreateCircularBuffer` and
/// released in `deinit` with `SherpaOnnxDestroyCircularBuffer`.
class SherpaOnnxCircularBufferWrapper {
  /// Handle returned by `SherpaOnnxCreateCircularBuffer`.
  let buffer: OpaquePointer!

  /// - Parameter capacity: Forwarded to `SherpaOnnxCreateCircularBuffer`
  ///   after narrowing to `Int32`.
  init(capacity: Int) {
    buffer = SherpaOnnxCreateCircularBuffer(Int32(capacity))
  }

  deinit {
    if let buffer {
      SherpaOnnxDestroyCircularBuffer(buffer)
    }
  }

  /// Passes `samples` to `SherpaOnnxCircularBufferPush`.
  func push(samples: [Float]) {
    SherpaOnnxCircularBufferPush(buffer, samples, Int32(samples.count))
  }

  /// Requests `n` samples starting at `startIndex` from the C buffer and
  /// copies them into a Swift array.
  ///
  /// - Returns: The copied samples, or an empty array when `n` is not
  ///   positive or the C API returns a null pointer (both of which would
  ///   otherwise trap).
  func get(startIndex: Int, n: Int) -> [Float] {
    guard n > 0 else { return [] }

    let p: UnsafePointer<Float>? = SherpaOnnxCircularBufferGet(buffer, Int32(startIndex), Int32(n))
    guard let p else { return [] }

    // The pointer must be handed back to the C side on every exit path.
    defer { SherpaOnnxCircularBufferFree(p) }

    // One bulk copy instead of appending element by element.
    return Array(UnsafeBufferPointer(start: p, count: n))
  }

  /// Passes `n` to `SherpaOnnxCircularBufferPop`.
  func pop(n: Int) {
    SherpaOnnxCircularBufferPop(buffer, Int32(n))
  }

  /// Value reported by `SherpaOnnxCircularBufferSize`, widened to `Int`.
  func size() -> Int {
    return Int(SherpaOnnxCircularBufferSize(buffer))
  }

  /// Calls `SherpaOnnxCircularBufferReset` on the underlying buffer.
  func reset() {
    SherpaOnnxCircularBufferReset(buffer)
  }
}
|
||||
|
||||
/// Swift owner of a `SherpaOnnxSpeechSegment` obtained from the C API.
///
/// The segment is released with `SherpaOnnxDestroySpeechSegment` when the
/// wrapper is deallocated. All accessors tolerate a null segment pointer
/// instead of trapping on it.
class SherpaOnnxSpeechSegmentWrapper {
  /// Pointer to the C segment; may be nil.
  let p: UnsafePointer<SherpaOnnxSpeechSegment>!

  init(p: UnsafePointer<SherpaOnnxSpeechSegment>!) {
    self.p = p
  }

  deinit {
    if let p {
      SherpaOnnxDestroySpeechSegment(p)
    }
  }

  /// The segment's `start` field, or 0 when there is no segment.
  var start: Int {
    guard let p else { return 0 }
    return Int(p.pointee.start)
  }

  /// The segment's `n` field, or 0 when there is no segment.
  var n: Int {
    guard let p else { return 0 }
    return Int(p.pointee.n)
  }

  /// A copy of the first `n` values behind the segment's `samples` pointer.
  ///
  /// Returns an empty array when there is no segment or `n` is not positive.
  /// Every access produces a fresh copy, so callers that need the samples
  /// more than once should store the result.
  var samples: [Float] {
    guard let p else { return [] }

    let count = Int(p.pointee.n)
    guard count > 0 else { return [] }

    // One bulk copy instead of appending element by element.
    return Array(UnsafeBufferPointer(start: p.pointee.samples, count: count))
  }
}
|
||||
|
||||
/// Swift owner of a voice activity detector created through the C API.
///
/// Each method is a thin forwarder to the C function named in its comment.
class SherpaOnnxVoiceActivityDetectorWrapper {
  /// A pointer to the underlying counterpart in C
  let vad: OpaquePointer!

  /// Creates the detector with `SherpaOnnxCreateVoiceActivityDetector`.
  ///
  /// - Parameters:
  ///   - config: Pointer to the VAD model configuration.
  ///   - buffer_size_in_seconds: Passed through unchanged to the C API.
  init(config: UnsafePointer<SherpaOnnxVadModelConfig>!, buffer_size_in_seconds: Float) {
    vad = SherpaOnnxCreateVoiceActivityDetector(config, buffer_size_in_seconds)
  }

  deinit {
    if let vad {
      SherpaOnnxDestroyVoiceActivityDetector(vad)
    }
  }

  /// Forwards `samples` to `SherpaOnnxVoiceActivityDetectorAcceptWaveform`.
  func acceptWaveform(samples: [Float]) {
    let sampleCount = Int32(samples.count)
    SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, samples, sampleCount)
  }

  /// True exactly when `SherpaOnnxVoiceActivityDetectorEmpty` returns 1.
  func isEmpty() -> Bool {
    return SherpaOnnxVoiceActivityDetectorEmpty(vad) == 1
  }

  /// Calls `SherpaOnnxVoiceActivityDetectorPop`.
  func pop() {
    SherpaOnnxVoiceActivityDetectorPop(vad)
  }

  /// Wraps the pointer returned by `SherpaOnnxVoiceActivityDetectorFront`.
  ///
  /// The returned wrapper destroys the segment when it is deallocated.
  func front() -> SherpaOnnxSpeechSegmentWrapper {
    return SherpaOnnxSpeechSegmentWrapper(p: SherpaOnnxVoiceActivityDetectorFront(vad))
  }

  /// Calls `SherpaOnnxVoiceActivityDetectorReset`.
  func reset() {
    SherpaOnnxVoiceActivityDetectorReset(vad)
  }
}
|
||||
|
||||
@@ -13,7 +13,6 @@ extension AVAudioPCMBuffer {
|
||||
}
|
||||
|
||||
func run() {
|
||||
|
||||
var recognizer: SherpaOnnxOfflineRecognizer
|
||||
var modelConfig: SherpaOnnxOfflineModelConfig
|
||||
var modelType = "whisper"
|
||||
|
||||
217
swift-api-examples/generate-subtitles.swift
Normal file
217
swift-api-examples/generate-subtitles.swift
Normal file
@@ -0,0 +1,217 @@
|
||||
/*
|
||||
This file shows how to use Swift API to generate subtitles.
|
||||
|
||||
You can use the files from
|
||||
https://huggingface.co/csukuangfj/vad/tree/main
|
||||
for testing.
|
||||
|
||||
For instance, to generate subtitles for Obama.mov, please first
|
||||
use
|
||||
|
||||
ffmpeg -i ./Obama.mov -acodec pcm_s16le -ac 1 -ar 16000 Obama.wav
|
||||
|
||||
to extract the audio part from the video.
|
||||
|
||||
This file supports only processing WAV sound files, so you have to first
|
||||
extract the audio from your videos.
|
||||
|
||||
Please see
|
||||
./run-generate-subtitles.sh
|
||||
for usage.
|
||||
*/
|
||||
|
||||
import AVFoundation
|
||||
|
||||
extension AudioBuffer {
  /// Copies the contents of this audio buffer into a `[Float]`.
  ///
  /// The buffer is viewed as `Float` values through
  /// `UnsafeBufferPointer(_:)` and then copied, so the result does not
  /// alias the buffer's memory.
  func array() -> [Float] {
    let floatView = UnsafeBufferPointer<Float>(self)
    return [Float](floatView)
  }
}
|
||||
|
||||
extension AVAudioPCMBuffer {
  /// Returns the samples of `audioBufferList.pointee.mBuffers` as a
  /// `[Float]`, using `AudioBuffer.array()` for the copy.
  func array() -> [Float] {
    let firstBuffer = audioBufferList.pointee.mBuffers
    return firstBuffer.array()
  }
}
|
||||
|
||||
extension TimeInterval {
  /// The interval formatted as a SubRip (SRT) timecode: `HH:MM:SS,mmm`.
  ///
  /// Hours are zero-padded to two digits, as SRT timecodes are written
  /// (`00:00:01,000`, not `0:00:01,000`). All components are truncated,
  /// not rounded.
  var hourMinuteSecondMS: String {
    String(format: "%02d:%02d:%02d,%03d", hour, minute, second, millisecond)
  }

  /// Whole hours contained in the interval.
  var hour: Int {
    Int((self / 3600).truncatingRemainder(dividingBy: 3600))
  }

  /// Whole minutes past the hour (0...59).
  var minute: Int {
    Int((self / 60).truncatingRemainder(dividingBy: 60))
  }

  /// Whole seconds past the minute (0...59).
  var second: Int {
    Int(truncatingRemainder(dividingBy: 60))
  }

  /// Whole milliseconds past the second (0...999).
  var millisecond: Int {
    Int((self * 1000).truncatingRemainder(dividingBy: 1000))
  }
}
|
||||
|
||||
extension String {
  /// This string interpreted as a file-system path.
  var fileURL: URL { URL(fileURLWithPath: self) }

  /// The extension of the path's last component, without the dot.
  var pathExtension: String { fileURL.pathExtension }

  /// The last component of the path.
  var lastPathComponent: String { fileURL.lastPathComponent }

  /// The path of `fileURL` with the extension of its last component removed.
  var stringByDeletingPathExtension: String {
    let withoutExtension = fileURL.deletingPathExtension()
    return withoutExtension.path
  }
}
|
||||
|
||||
/// One subtitle entry: a time span plus the text recognized for it.
///
/// `start` and `end` are converted to `TimeInterval` when the entry is
/// rendered, i.e. they are treated as seconds.
class SpeechSegment: CustomStringConvertible {
  let start: Float
  let end: Float
  let text: String

  /// - Parameters:
  ///   - start: Start of the segment.
  ///   - duration: Length of the segment; `end` is stored as `start + duration`.
  ///   - text: Text shown for the segment.
  init(start: Float, duration: Float, text: String) {
    self.start = start
    self.end = start + duration
    self.text = text
  }

  /// The timing line (`<start> --> <end>`) followed by the text on the
  /// next line.
  public var description: String {
    let from = TimeInterval(start).hourMinuteSecondMS
    let to = TimeInterval(end).hourMinuteSecondMS
    return "\(from) --> \(to)\n\(text)"
  }
}
|
||||
|
||||
/// Generates an `.srt` subtitle file for the WAV file at `filePath`.
///
/// Steps:
/// 1. Build an offline recognizer for the selected `modelType`.
/// 2. Read the whole WAV file into memory (it must be 16 kHz, mono, float32
///    in its processing format).
/// 3. Feed the samples to the VAD one window at a time; every speech segment
///    the VAD emits is decoded and recorded with its start time and duration.
/// 4. Write the numbered segments next to the input file, replacing its
///    extension with `.srt`.
///
/// Failures (unsupported model type, unreadable or wrongly formatted audio,
/// unwritable output) are reported on stdout and end the function early
/// instead of trapping.
func run() {
  let modelConfig: SherpaOnnxOfflineModelConfig
  var modelType = "whisper"
  // modelType = "paraformer"
  var filePath = "/Users/fangjun/Desktop/Obama.wav"  // English
  // filePath = "/Users/fangjun/Desktop/lei-jun.wav" // Chinese
  // please go to https://huggingface.co/csukuangfj/vad
  // to download the above two files

  if modelType == "whisper" {
    // for English
    let encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx"
    let decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx"
    let tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt"

    let whisperConfig = sherpaOnnxOfflineWhisperModelConfig(
      encoder: encoder,
      decoder: decoder
    )

    modelConfig = sherpaOnnxOfflineModelConfig(
      tokens: tokens,
      whisper: whisperConfig,
      debug: 0,
      modelType: "whisper"
    )
  } else if modelType == "paraformer" {
    // for Chinese
    let model = "./sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx"
    let tokens = "./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt"
    let paraformerConfig = sherpaOnnxOfflineParaformerModelConfig(
      model: model
    )

    modelConfig = sherpaOnnxOfflineModelConfig(
      tokens: tokens,
      paraformer: paraformerConfig,
      debug: 0,
      modelType: "paraformer"
    )
  } else {
    print("Please specify a supported modelType \(modelType)")
    return
  }

  let sampleRate = 16000
  let featConfig = sherpaOnnxFeatureConfig(
    sampleRate: sampleRate,
    featureDim: 80
  )
  var config = sherpaOnnxOfflineRecognizerConfig(
    featConfig: featConfig,
    modelConfig: modelConfig
  )

  let recognizer = SherpaOnnxOfflineRecognizer(config: &config)

  let audioFile: AVAudioFile
  do {
    audioFile = try AVAudioFile(forReading: filePath.fileURL)
  } catch {
    print("Failed to open \(filePath): \(error.localizedDescription)")
    return
  }

  // Checked unconditionally: `assert` is compiled out of optimized builds,
  // which would let audio in the wrong format through silently.
  let audioFormat = audioFile.processingFormat
  guard
    audioFormat.sampleRate == Double(sampleRate),
    audioFormat.channelCount == 1,
    audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32
  else {
    print("Expected \(sampleRate) Hz, mono, float32 audio but got \(audioFormat)")
    return
  }

  let sileroVadConfig = sherpaOnnxSileroVadModelConfig(
    model: "./silero_vad.onnx"
  )

  var vadModelConfig = sherpaOnnxVadModelConfig(sileroVad: sileroVadConfig)
  let vad = SherpaOnnxVoiceActivityDetectorWrapper(
    config: &vadModelConfig, buffer_size_in_seconds: 120)

  guard
    let audioFrameCount = AVAudioFrameCount(exactly: audioFile.length),
    let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount)
  else {
    print("Failed to allocate an audio buffer for \(filePath)")
    return
  }

  do {
    try audioFile.read(into: audioFileBuffer)
  } catch {
    print("Failed to read \(filePath): \(error.localizedDescription)")
    return
  }

  let allSamples = audioFileBuffer.array()

  let windowSize = Int(vadModelConfig.silero_vad.window_size)
  guard windowSize > 0 else {
    print("Invalid VAD window size \(windowSize)")
    return
  }

  var segments: [SpeechSegment] = []

  // Walk through the samples with an offset rather than re-creating the
  // remaining array after every window, which copied the whole tail each
  // time. `>=` also feeds a final window that is exactly `windowSize` long;
  // a shorter tail is still not fed to the VAD.
  var offset = 0
  while allSamples.count - offset >= windowSize {
    let window = [Float](allSamples[offset..<offset + windowSize])
    vad.acceptWaveform(samples: window)
    offset += windowSize

    while !vad.isEmpty() {
      let segment = vad.front()
      vad.pop()

      // `samples` copies on every access, so take the copy once.
      let segmentSamples = segment.samples
      let result = recognizer.decode(samples: segmentSamples)

      let subtitle = SpeechSegment(
        start: Float(segment.start) / Float(sampleRate),
        duration: Float(segmentSamples.count) / Float(sampleRate),
        text: result.text)
      segments.append(subtitle)

      print(subtitle)
    }
  }

  // SRT entries are numbered from 1 and separated by a blank line.
  let srt = segments.enumerated().map { index, segment in
    "\(index + 1)\n\(segment)"
  }.joined(separator: "\n\n")

  let srtFilename = filePath.stringByDeletingPathExtension + ".srt"
  do {
    try srt.write(to: srtFilename.fileURL, atomically: true, encoding: .utf8)
    // Only report success when the file was actually written.
    print("Saved to \(srtFilename)")
  } catch {
    print("Error writing: \(error.localizedDescription)")
  }
}
|
||||
|
||||
/// Program entry point; all work is done by `run()`.
@main
struct App {
  static func main() { run() }
}
|
||||
36
swift-api-examples/run-generate-subtitles.sh
Executable file
36
swift-api-examples/run-generate-subtitles.sh
Executable file
@@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env bash
#
# Builds ./generate-subtitles (unless it already exists) and runs it.
#
# Expected in the current directory:
#   - ../build-swift-macos            (produced by ../build-swift-macos.sh)
#   - ./sherpa-onnx-whisper-tiny.en   (pre-trained whisper model)
#   - ./silero_vad.onnx               (VAD model loaded by generate-subtitles.swift)

set -ex

if [ ! -d ../build-swift-macos ]; then
  echo "Please run ../build-swift-macos.sh first!"
  exit 1
fi

if [ ! -d ./sherpa-onnx-whisper-tiny.en ]; then
  echo "Please download the pre-trained model for testing."
  echo "You can refer to"
  echo ""
  echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html"
  echo ""
  echo "for help"
  exit 1
fi

# generate-subtitles.swift loads the VAD model from this relative path, so
# fail early with a clear message instead of failing inside the program.
if [ ! -f ./silero_vad.onnx ]; then
  echo "./silero_vad.onnx not found."
  echo "Please download the silero VAD model and place it in the current directory."
  exit 1
fi

if [ ! -e ./generate-subtitles ]; then
  # Note: We use -lc++ to link against libc++ instead of libstdc++
  swiftc \
    -lc++ \
    -I ../build-swift-macos/install/include \
    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
    ./generate-subtitles.swift ./SherpaOnnx.swift \
    -L ../build-swift-macos/install/lib/ \
    -l sherpa-onnx \
    -l onnxruntime \
    -o generate-subtitles
else
  echo "./generate-subtitles exists - skip building"
fi

# Append the previous value only when it is non-empty, so the search path
# does not end with an empty entry after a trailing ':'.
export DYLD_LIBRARY_PATH="$PWD/../build-swift-macos/install/lib${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./generate-subtitles
|
||||
Reference in New Issue
Block a user