Generate SRT from audio (#341)

This commit is contained in:
yujinqiu
2023-09-25 16:36:16 +08:00
committed by GitHub
parent 552a267c23
commit 9091917eab
19 changed files with 984 additions and 0 deletions

View File

@@ -0,0 +1,11 @@
{
"colors" : [
{
"idiom" : "universal"
}
],
"info" : {
"author" : "xcode",
"version" : 1
}
}

View File

@@ -0,0 +1,14 @@
{
"images" : [
{
"filename" : "k2-1024x1024.png",
"idiom" : "universal",
"platform" : "ios",
"size" : "1024x1024"
}
],
"info" : {
"author" : "xcode",
"version" : 1
}
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 411 KiB

View File

@@ -0,0 +1,6 @@
{
"info" : {
"author" : "xcode",
"version" : 1
}
}

View File

@@ -0,0 +1,121 @@
//
// ContentView.swift
// SherpaOnnxSubtitle
//
// Created by knight on 2023/9/23.
//
import AVKit
import MediaPlayer
import PhotosUI
import SwiftUI
/// Root screen of the app.
///
/// Lets the user pick a media file from the photo library or from Files,
/// hands it to `SubtitleViewModel` for SRT generation, and presents the
/// system exporter once `exportNow` is set by the view model.
struct ContentView: View {
    @StateObject var subtitleViewModel = SubtitleViewModel()

    var body: some View {
        VStack {
            header
                .padding(.vertical)

            // Source 1: photo library (videos only).
            PhotosPicker(
                selection: $subtitleViewModel.selectedItem,
                matching: .videos
            ) {
                Label("Open Audio from Photo Library", systemImage: "photo")
                    .frame(minWidth: 0, maxWidth: .infinity)
                    .padding()
                    .background(.blue, in: .rect(cornerRadius: 8.0))
                    .foregroundColor(.white)
            }

            // Source 2: the Files app; flipping `importNow` presents the importer below.
            Button {
                subtitleViewModel.importNow = true
            } label: {
                Text("Open Audio from Files")
                    .frame(minWidth: 0, maxWidth: .infinity)
                    .padding()
                    .background(.blue, in: .rect(cornerRadius: 8.0))
            }
            .foregroundColor(.white)

            // Status indicator driven by the view model's load state.
            switch subtitleViewModel.loadState {
            case .loading:
                ProgressView()
            case .failed:
                Text("Gen SRT failed")
            case .initial, .loaded, .done:
                EmptyView()
            }
        }
        .fileImporter(
            isPresented: $subtitleViewModel.importNow,
            allowedContentTypes: [.movie, .audio],
            onCompletion: handleImportCompletion
        )
        .onChange(of: subtitleViewModel.importNow) { isImporting in
            // Once the importer is dismissed, put the load state back to `.initial`.
            guard !isImporting else { return }
            subtitleViewModel.restoreState()
        }
        .fileExporter(
            isPresented: $subtitleViewModel.exportNow,
            document: subtitleViewModel.srtDocument,
            contentType: .srt,
            defaultFilename: subtitleViewModel.srtName,
            onCompletion: handleExportCompletion
        )
        .task(id: subtitleViewModel.selectedItem) {
            // Re-runs whenever the photo-library selection changes.
            guard subtitleViewModel.hasAudio else { return }
            subtitleViewModel.loadState = .loading
            do {
                guard let movie = try await subtitleViewModel.selectedItem?.loadTransferable(type: Audio.self) else {
                    subtitleViewModel.loadState = .failed
                    return
                }
                subtitleViewModel.loadState = .loaded(movie)
                subtitleViewModel.generateSRT(from: movie.url)
            } catch {
                subtitleViewModel.loadState = .failed
            }
        }
        .padding()
    }

    /// Title plus a short hint about the expected audio format.
    private var header: some View {
        VStack {
            Text("SherpaOnnxSubtitle")
                .font(.title)
            VStack(alignment: .leading) {
                Text("Audio format should be **mono** channel and **16khz** sample rate")
                Text("You can convert file with the help of ffmpeg")
                Text("```ffmpeg -i ./foo.mov -acodec pcm_s16le -ac 1 -ar 16000 foo.wav```")
            }
        }
    }

    /// Handles the result of the Files importer.
    ///
    /// On success the file is processed while security-scoped access is held;
    /// on failure the error is logged and the load state becomes `.failed`.
    private func handleImportCompletion(result: Result<URL, Error>) {
        print("file import...")
        switch result {
        case .failure(let error):
            print(error.localizedDescription)
            subtitleViewModel.loadState = .failed
        case .success(let file):
            let didStartAccess = file.startAccessingSecurityScopedResource()
            subtitleViewModel.generateSRT(from: file)
            // Balance the start call only if it actually granted access.
            if didStartAccess {
                file.stopAccessingSecurityScopedResource()
            }
        }
    }

    /// Handles the result of the exporter and updates the load state accordingly.
    private func handleExportCompletion(result: Result<URL, any Error>) {
        switch result {
        case .failure(let error):
            print("export audio error: \(error.localizedDescription)")
            subtitleViewModel.loadState = .failed
        case .success(let url):
            print("audio export to: \(url)")
            subtitleViewModel.loadState = .done
        }
    }
}
/// Xcode canvas preview for `ContentView`.
struct ContentView_Previews: PreviewProvider {
    static var previews: some View {
        return ContentView()
    }
}

View File

@@ -0,0 +1,14 @@
//
// UTType.swift
// YPlayer
//
// Created by knight on 2023/7/7.
//
import UniformTypeIdentifiers
extension UTType {
    /// Uniform type for SubRip subtitle files, exported by this app under
    /// the identifier `com.k2.srt`.
    static var srt: UTType {
        let identifier = "com.k2.srt"
        return UTType(exportedAs: identifier)
    }
}

View File

@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>UTExportedTypeDeclarations</key>
<array>
<dict>
<key>UTTypeConformsTo</key>
<array>
<string>public.plain-text</string>
</array>
<key>UTTypeDescription</key>
<string>SubRip Subtitle File</string>
<key>UTTypeIconFiles</key>
<array/>
<key>UTTypeIdentifier</key>
<string>com.k2.srt</string>
<key>UTTypeTagSpecification</key>
<dict>
<key>public.filename-extension</key>
<array>
<string>srt</string>
</array>
</dict>
</dict>
</array>
</dict>
</plist>

View File

@@ -0,0 +1,27 @@
//
// Audio.swift
// SherpaOnnxSubtitle
//
// Created by knight on 2023/9/23.
//
import SwiftUI
/// A media file received through the transfer system (e.g. the photos picker).
///
/// On import the received file is copied to `audio.wav` inside the app's
/// Documents directory, replacing any copy left over from a previous import.
struct Audio: Transferable {
    /// Location of the app-local copy of the media file.
    let url: URL

    static var transferRepresentation: some TransferRepresentation {
        FileRepresentation(contentType: .movie) { movie in
            SentTransferredFile(movie.url)
        } importing: { received in
            let copy = URL.documentsDirectory.appending(path: "audio.wav")
            // `URL.path()` percent-encodes by default, which `FileManager`
            // treats as literal characters; ask for the plain file-system path
            // so an existing copy is reliably detected and removed.
            if FileManager.default.fileExists(atPath: copy.path(percentEncoded: false)) {
                try FileManager.default.removeItem(at: copy)
            }
            try FileManager.default.copyItem(at: received.file, to: copy)
            return Self(url: copy)
        }
    }
}

View File

@@ -0,0 +1,32 @@
//
// Document.swift
// YPlayer
//
// Created by knight on 2023/6/5.
//
import SwiftUI
import UniformTypeIdentifiers
/// File document wrapping the raw bytes of an SRT file for import/export.
struct Document: FileDocument {
    static var readableContentTypes: [UTType] = [.srt]
    static var writableContentTypes: [UTType] = [.srt]

    /// Raw file contents; `nil` when there is nothing to write.
    var data: Data?

    init(data: Data?) {
        self.data = data
    }

    /// Reads the document from disk; `data` stays `nil` if the wrapper has no
    /// regular-file contents.
    init(configuration: ReadConfiguration) throws {
        data = configuration.file.regularFileContents
    }

    /// Produces the file wrapper to write.
    /// - Throws: `ExportError.fileNotFound` when `data` is `nil`.
    func fileWrapper(configuration _: WriteConfiguration) throws -> FileWrapper {
        if let payload = data {
            return FileWrapper(regularFileWithContents: payload)
        }
        throw ExportError.fileNotFound
    }
}

View File

@@ -0,0 +1,12 @@
//
// Errors.swift
// YPlayer
//
// Created by knight on 2023/8/26.
//
import Foundation
/// Errors raised while exporting a document.
///
/// Conforms to `LocalizedError` so that `error.localizedDescription` yields
/// the case's raw-value message instead of a generic system string.
enum ExportError: String, Error, LocalizedError {
    /// There is no data available to write to the export destination.
    case fileNotFound = "export file not found"

    /// Human-readable message; identical to the case's raw value.
    var errorDescription: String? {
        rawValue
    }
}

View File

@@ -0,0 +1,31 @@
//
// SpeechSegment.swift
// SherpaOnnxSubtitle
//
// Created by knight on 2023/9/23.
//
import Foundation
/// One recognized stretch of speech with its time range, in seconds.
class SpeechSegment: CustomStringConvertible {
    /// Start time of the segment.
    let start: Float
    /// End time of the segment (`start + duration`).
    let end: Float
    /// Recognized text for the segment.
    let text: String

    init(start: Float, duration: Float, text: String) {
        self.start = start
        self.end = start + duration
        self.text = text
    }

    /// The segment rendered as a "start --> end" timing line followed by the
    /// text on the next line. Timestamps are formatted by
    /// `hourMinuteSecondMS`, which is defined outside this file.
    public var description: String {
        let from = TimeInterval(start).hourMinuteSecondMS
        let to = TimeInterval(end).hourMinuteSecondMS
        return "\(from) --> \(to)\n\(text)"
    }
}

View File

@@ -0,0 +1,6 @@
{
"info" : {
"author" : "xcode",
"version" : 1
}
}

View File

@@ -0,0 +1,17 @@
//
// SherpaOnnxSubtitleApp.swift
// SherpaOnnxSubtitle
//
// Created by knight on 2023/9/23.
//
import SwiftUI
/// Application entry point; shows `ContentView` in the main window group.
@main
struct SherpaOnnxSubtitleApp: App {
    var body: some Scene {
        WindowGroup(content: {
            ContentView()
        })
    }
}

View File

@@ -0,0 +1,168 @@
//
// SubtitleViewModel.swift
// SherpaOnnxSubtitle
//
// Created by knight on 2023/9/23.
//
import AVFoundation
import PhotosUI
import SwiftUI
/// Progress of loading a media file and generating subtitles from it.
enum LoadState {
    /// Nothing has been selected or processed yet.
    case initial
    /// A file is being loaded or processed.
    case loading
    /// The picked item was transferred into an `Audio` value.
    case loaded(Audio)
    /// Processing or export finished.
    case done
    /// Loading, processing, or export failed.
    case failed
}
/// View model that owns the offline recognizer and the voice-activity
/// detector, and turns an audio file into SRT text for export.
class SubtitleViewModel: ObservableObject {
    /// Which recognition model to load: "whisper" (English) or
    /// "paraformer" (Chinese).
    var modelType = "whisper"
    // modelType = "paraformer"

    /// Sample rate, in Hz, that input audio must have.
    let sampleRate = 16000

    var modelConfig: SherpaOnnxOfflineModelConfig?
    var recognizer: SherpaOnnxOfflineRecognizer?
    var vadModelConfig: SherpaOnnxVadModelConfig?
    var vad: SherpaOnnxVoiceActivityDetectorWrapper?

    @Published var loadState: LoadState = .initial
    @Published var selectedItem: PhotosPickerItem? = nil
    /// Drives the file importer. Any write (true or false) moves the state to
    /// `.loading`; the view resets it when the importer is dismissed.
    @Published var importNow: Bool = false {
        didSet {
            loadState = .loading
        }
    }

    /// Drives the file exporter; set to `true` after subtitles were generated.
    @Published var exportNow: Bool = false

    /// Suggested file name for the exported subtitles.
    var srtName: String = "unknown.srt"
    /// The generated SRT text.
    var content: String = ""

    /// The generated SRT text wrapped as an exportable document.
    var srtDocument: Document {
        Document(data: content.data(using: .utf8))
    }

    /// Whether a photo-library item is currently selected.
    var hasAudio: Bool {
        selectedItem != nil
    }

    /// Reasons an input file cannot be processed.
    private enum AudioInputError: LocalizedError {
        case unsupportedFormat(sampleRate: Double, channels: UInt32)
        case unreadable

        var errorDescription: String? {
            switch self {
            case let .unsupportedFormat(sampleRate, channels):
                return "unsupported audio format: \(sampleRate) Hz, \(channels) channel(s)"
            case .unreadable:
                return "audio file is empty or could not be buffered"
            }
        }
    }

    /// Loads the recognizer for `modelType` and the Silero VAD model.
    /// If the model type is unsupported, the optional model properties stay
    /// `nil` and `generateSRT(from:)` reports failure.
    init() {
        if modelType == "whisper" {
            // for English
            self.modelConfig = getNonStreamingWhisperTinyEn()
        } else if modelType == "paraformer" {
            // for Chinese
            self.modelConfig = getNonStreamingZhParaformer20230328()
        } else {
            print("Please specify a supported modelType \(modelType)")
            return
        }

        let featConfig = sherpaOnnxFeatureConfig(
            sampleRate: sampleRate,
            featureDim: 80
        )

        guard let modelConfig else {
            return
        }

        var config = sherpaOnnxOfflineRecognizerConfig(
            featConfig: featConfig,
            modelConfig: modelConfig
        )
        recognizer = SherpaOnnxOfflineRecognizer(config: &config)

        let sileroVadConfig = sherpaOnnxSileroVadModelConfig(
            model: getResource("silero_vad", "onnx")
        )
        self.vadModelConfig = sherpaOnnxVadModelConfig(sileroVad: sileroVadConfig)

        // Local mutable copy: the wrapper takes the config by pointer.
        guard var vadModelConfig else {
            return
        }
        vad = SherpaOnnxVoiceActivityDetectorWrapper(
            config: &vadModelConfig, buffer_size_in_seconds: 120
        )
    }

    /// Puts the load state back to `.initial`.
    func restoreState() {
        loadState = .initial
    }

    /// Generates SRT text from the audio at `file` and, on success, requests
    /// the export sheet.
    ///
    /// The work runs synchronously on the calling thread. On success
    /// `content` and `srtName` are updated, `loadState` becomes `.done` and
    /// `exportNow` is set. On any failure (models not loaded, unreadable file,
    /// wrong audio format) `loadState` becomes `.failed` and no export is
    /// requested, so an empty document is never offered to the user.
    ///
    /// - Parameter file: URL of a mono audio file sampled at `sampleRate` Hz.
    func generateSRT(from file: URL) {
        print("gen srt from: \(file)")
        content = ""

        guard let recognizer, let vadModelConfig, let vad else {
            print("error: recognizer or VAD model is not loaded")
            loadState = .failed
            return
        }

        let windowSize = Int(vadModelConfig.silero_vad.window_size)
        guard windowSize > 0 else {
            // A non-positive window would never advance through the samples.
            print("error: invalid VAD window size \(windowSize)")
            loadState = .failed
            return
        }

        let samples: [Float]
        do {
            samples = try readSamples(from: file)
        } catch {
            print("error: \(error.localizedDescription)")
            loadState = .failed
            return
        }

        // NOTE(review): the same VAD instance is reused across calls; confirm
        // whether it carries state from a previous file that should be cleared.
        let segments = transcribe(
            samples: samples,
            windowSize: windowSize,
            recognizer: recognizer,
            vad: vad
        )

        content = segments.enumerated()
            .map { "\($0.offset + 1)\n\($0.element)" }
            .joined(separator: "\n\n")

        // Set the file name before requesting export so the exporter sees it.
        srtName = "\(file.lastPathComponent).srt"
        loadState = .done
        exportNow = true
    }

    /// Reads the whole file into memory as float samples.
    /// - Throws: the `AVAudioFile` error, or `AudioInputError` when the format
    ///   is not mono float PCM at `sampleRate` Hz or the file cannot be buffered.
    private func readSamples(from file: URL) throws -> [Float] {
        let audioFile = try AVAudioFile(forReading: file)
        let format = audioFile.processingFormat

        // Checked at run time (not `assert`) so release builds reject bad
        // input instead of producing subtitles with wrong timing.
        guard format.sampleRate == Double(sampleRate),
              format.channelCount == 1,
              format.commonFormat == .pcmFormatFloat32
        else {
            throw AudioInputError.unsupportedFormat(
                sampleRate: format.sampleRate,
                channels: format.channelCount
            )
        }

        guard let frameCount = AVAudioFrameCount(exactly: audioFile.length),
              let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount)
        else {
            throw AudioInputError.unreadable
        }

        try audioFile.read(into: buffer)
        return buffer.array()
    }

    /// Feeds `samples` to the VAD one window at a time and decodes every
    /// speech segment the VAD emits.
    ///
    /// Walks the sample array by offset rather than re-copying the remaining
    /// samples after each window. Trailing samples that do not fill a
    /// complete window are not fed to the VAD.
    private func transcribe(
        samples: [Float],
        windowSize: Int,
        recognizer: SherpaOnnxOfflineRecognizer,
        vad: SherpaOnnxVoiceActivityDetectorWrapper
    ) -> [SpeechSegment] {
        var segments: [SpeechSegment] = []
        var offset = 0

        while offset + windowSize <= samples.count {
            vad.acceptWaveform(samples: [Float](samples[offset ..< offset + windowSize]))
            offset += windowSize

            while !vad.isEmpty() {
                let s = vad.front()
                vad.pop()
                let result = recognizer.decode(samples: s.samples)
                let segment = SpeechSegment(
                    start: Float(s.start) / Float(sampleRate),
                    duration: Float(s.samples.count) / Float(sampleRate),
                    text: result.text
                )
                segments.append(segment)
                print(segment)
            }
        }
        return segments
    }
}