Generate SRT from audio (#341)
This commit is contained in:
@@ -0,0 +1,11 @@
|
||||
{
|
||||
"colors" : [
|
||||
{
|
||||
"idiom" : "universal"
|
||||
}
|
||||
],
|
||||
"info" : {
|
||||
"author" : "xcode",
|
||||
"version" : 1
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"images" : [
|
||||
{
|
||||
"filename" : "k2-1024x1024.png",
|
||||
"idiom" : "universal",
|
||||
"platform" : "ios",
|
||||
"size" : "1024x1024"
|
||||
}
|
||||
],
|
||||
"info" : {
|
||||
"author" : "xcode",
|
||||
"version" : 1
|
||||
}
|
||||
}
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 411 KiB |
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"info" : {
|
||||
"author" : "xcode",
|
||||
"version" : 1
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,121 @@
|
||||
//
|
||||
// ContentView.swift
|
||||
// SherpaOnnxSubtitle
|
||||
//
|
||||
// Created by knight on 2023/9/23.
|
||||
//
|
||||
|
||||
import AVKit
|
||||
import MediaPlayer
|
||||
import PhotosUI
|
||||
import SwiftUI
|
||||
|
||||
/// Main screen of the app.
///
/// Lets the user pick a recording from the photo library or the Files app,
/// hands it to `SubtitleViewModel` for transcription, and presents a file
/// exporter for the resulting SRT document.
struct ContentView: View {
    @StateObject var subtitleViewModel = SubtitleViewModel()

    var body: some View {
        VStack {
            instructions
            photoLibraryPicker
            filesButton
            statusIndicator
        }
        .fileImporter(
            isPresented: $subtitleViewModel.importNow,
            allowedContentTypes: [.movie, .audio],
            onCompletion: handleImportCompletion
        )
        .onChange(of: subtitleViewModel.importNow) { isImporting in
            // The importer was dismissed: drop back to the idle state.
            guard !isImporting else { return }
            subtitleViewModel.restoreState()
        }
        .fileExporter(
            isPresented: $subtitleViewModel.exportNow,
            document: subtitleViewModel.srtDocument,
            contentType: .srt,
            defaultFilename: subtitleViewModel.srtName,
            onCompletion: handleExportCompletion
        )
        .task(id: subtitleViewModel.selectedItem) {
            // Re-runs whenever the photo-library selection changes.
            guard subtitleViewModel.hasAudio else { return }
            subtitleViewModel.loadState = .loading

            do {
                let picked = try await subtitleViewModel.selectedItem?
                    .loadTransferable(type: Audio.self)
                guard let movie = picked else {
                    subtitleViewModel.loadState = .failed
                    return
                }
                subtitleViewModel.loadState = .loaded(movie)
                subtitleViewModel.generateSRT(from: movie.url)
            } catch {
                subtitleViewModel.loadState = .failed
            }
        }
        .padding()
    }

    /// Title plus the input-format requirements and the ffmpeg conversion hint.
    private var instructions: some View {
        VStack {
            Text("SherpaOnnxSubtitle")
                .font(.title)
            VStack(alignment: .leading) {
                Text("Audio format should be **mono** channel and **16khz** sample rate")

                Text("You can convert file with the help of ffmpeg")
                Text("```ffmpeg -i ./foo.mov -acodec pcm_s16le -ac 1 -ar 16000 foo.wav```")
            }
        }
        .padding(.vertical)
    }

    /// Photo-library picker bound to the view model's `selectedItem`.
    private var photoLibraryPicker: some View {
        PhotosPicker(
            selection: $subtitleViewModel.selectedItem,
            matching: .videos
        ) {
            Label("Open Audio from Photo Library", systemImage: "photo")
                .frame(minWidth: 0, maxWidth: .infinity)
                .padding()
                .background(.blue, in: .rect(cornerRadius: 8.0))
                .foregroundColor(.white)
        }
    }

    /// Button that raises the `importNow` flag driving the file importer.
    private var filesButton: some View {
        Button {
            subtitleViewModel.importNow = true
        } label: {
            Text("Open Audio from Files")
                .frame(minWidth: 0, maxWidth: .infinity)
                .padding()
                .background(.blue, in: .rect(cornerRadius: 8.0))
        }
        .foregroundColor(.white)
    }

    /// Spinner while loading, an error line on failure, nothing otherwise.
    @ViewBuilder
    private var statusIndicator: some View {
        switch subtitleViewModel.loadState {
        case .loading:
            ProgressView()
        case .failed:
            Text("Gen SRT failed")
        case .initial, .loaded, .done:
            EmptyView()
        }
    }

    /// Handles the result of the file importer.
    ///
    /// On success the file is transcribed while its security-scoped access is
    /// held; on failure the error is logged and the state becomes `.failed`.
    private func handleImportCompletion(result: Result<URL, Error>) {
        print("file import...")
        switch result {
        case .failure(let error):
            print(error.localizedDescription)
            subtitleViewModel.loadState = .failed
        case .success(let file):
            let accessing = file.startAccessingSecurityScopedResource()
            defer {
                // Only balance the call when access was actually granted.
                if accessing {
                    file.stopAccessingSecurityScopedResource()
                }
            }
            subtitleViewModel.generateSRT(from: file)
        }
    }

    /// Handles the result of the file exporter and records the final state.
    private func handleExportCompletion(result: Result<URL, any Error>) {
        switch result {
        case .failure(let error):
            print("export audio error: \(error.localizedDescription)")
            subtitleViewModel.loadState = .failed
        case .success(let url):
            print("audio export to: \(url)")
            subtitleViewModel.loadState = .done
        }
    }
}
|
||||
|
||||
/// Xcode canvas preview for `ContentView`.
struct ContentView_Previews: PreviewProvider {
    static var previews: some View { ContentView() }
}
|
||||
@@ -0,0 +1,14 @@
|
||||
//
|
||||
// UTType.swift
|
||||
// YPlayer
|
||||
//
|
||||
// Created by knight on 2023/7/7.
|
||||
//
|
||||
|
||||
import UniformTypeIdentifiers
|
||||
|
||||
extension UTType {
    /// Uniform type for SubRip (`.srt`) subtitle files.
    ///
    /// The identifier is the one the app exports in its `Info.plist`
    /// (`UTExportedTypeDeclarations`).
    static var srt: UTType {
        let identifier = "com.k2.srt"
        return UTType(exportedAs: identifier)
    }
}
|
||||
28
ios-swiftui/SherpaOnnxSubtitle/SherpaOnnxSubtitle/Info.plist
Normal file
28
ios-swiftui/SherpaOnnxSubtitle/SherpaOnnxSubtitle/Info.plist
Normal file
@@ -0,0 +1,28 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>UTExportedTypeDeclarations</key>
|
||||
<array>
|
||||
<dict>
|
||||
<key>UTTypeConformsTo</key>
|
||||
<array>
|
||||
<string>public.plain-text</string>
|
||||
</array>
|
||||
<key>UTTypeDescription</key>
|
||||
<string>SubRip Subtitle File</string>
|
||||
<key>UTTypeIconFiles</key>
|
||||
<array/>
|
||||
<key>UTTypeIdentifier</key>
|
||||
<string>com.k2.srt</string>
|
||||
<key>UTTypeTagSpecification</key>
|
||||
<dict>
|
||||
<key>public.filename-extension</key>
|
||||
<array>
|
||||
<string>srt</string>
|
||||
</array>
|
||||
</dict>
|
||||
</dict>
|
||||
</array>
|
||||
</dict>
|
||||
</plist>
|
||||
@@ -0,0 +1,27 @@
|
||||
//
|
||||
// Audio.swift
|
||||
// SherpaOnnxSubtitle
|
||||
//
|
||||
// Created by knight on 2023/9/23.
|
||||
//
|
||||
|
||||
import SwiftUI
|
||||
|
||||
/// A media file received from the photo picker.
///
/// On import the received file is copied to `audio.wav` in the app's
/// documents directory and `url` points at that copy.
struct Audio: Transferable {
    /// Location of the file backing this value.
    let url: URL

    static var transferRepresentation: some TransferRepresentation {
        FileRepresentation(contentType: .movie) { movie in
            SentTransferredFile(movie.url)
        } importing: { received in
            // NOTE(review): the copy is always named "audio.wav" although the
            // declared content type is `.movie` — confirm this is intended.
            let copy = URL.documentsDirectory.appending(path: "audio.wav")

            // `URL.path()` percent-encodes by default, while FileManager
            // expects the plain file-system path. With the encoded form an
            // existing copy could go undetected and `copyItem` would throw.
            if FileManager.default.fileExists(atPath: copy.path(percentEncoded: false)) {
                try FileManager.default.removeItem(at: copy)
            }

            try FileManager.default.copyItem(at: received.file, to: copy)
            return Self(url: copy)
        }
    }
}
|
||||
@@ -0,0 +1,32 @@
|
||||
//
|
||||
// Document.swift
|
||||
// YPlayer
|
||||
//
|
||||
// Created by knight on 2023/6/5.
|
||||
//
|
||||
|
||||
import SwiftUI
|
||||
import UniformTypeIdentifiers
|
||||
|
||||
/// File document wrapping the raw bytes of an SRT subtitle file.
struct Document: FileDocument {
    static var readableContentTypes: [UTType] = [.srt]
    static var writableContentTypes: [UTType] = [.srt]

    /// File contents; `nil` when there is nothing to write.
    var data: Data?

    /// Creates a document from in-memory bytes.
    init(data: Data?) {
        self.data = data
    }

    /// Creates a document from a file on disk.
    ///
    /// `data` is left `nil` when the file has no regular contents.
    init(configuration: ReadConfiguration) throws {
        data = configuration.file.regularFileContents
    }

    /// Produces the file wrapper to write out.
    ///
    /// - Throws: `ExportError.fileNotFound` when `data` is `nil`.
    func fileWrapper(configuration _: WriteConfiguration) throws -> FileWrapper {
        if let payload = data {
            return FileWrapper(regularFileWithContents: payload)
        }
        throw ExportError.fileNotFound
    }
}
|
||||
@@ -0,0 +1,12 @@
|
||||
//
|
||||
// Errors.swift
|
||||
// YPlayer
|
||||
//
|
||||
// Created by knight on 2023/8/26.
|
||||
//
|
||||
|
||||
import Foundation
|
||||
|
||||
/// Errors raised while exporting a document.
///
/// The raw value of each case is a short human-readable message.
enum ExportError: String, Error {
    /// There is no data available to write.
    case fileNotFound = "export file not found"
}
|
||||
@@ -0,0 +1,31 @@
|
||||
//
|
||||
// SpeechSegment.swift
|
||||
// SherpaOnnxSubtitle
|
||||
//
|
||||
// Created by knight on 2023/9/23.
|
||||
//
|
||||
|
||||
import Foundation
|
||||
|
||||
/// One recognized stretch of speech with its time span, in seconds.
///
/// `description` renders the segment as an SRT cue without the leading
/// sequence number: a `start --> end` line followed by the text.
class SpeechSegment: CustomStringConvertible {
    /// Start time in seconds.
    let start: Float
    /// End time in seconds (`start + duration`).
    let end: Float
    /// Recognized text for this span.
    let text: String

    /// - Parameters:
    ///   - start: Start time in seconds.
    ///   - duration: Length of the segment in seconds.
    ///   - text: Recognized text.
    init(start: Float, duration: Float, text: String) {
        self.start = start
        self.end = start + duration
        self.text = text
    }

    public var description: String {
        let from = TimeInterval(start).hourMinuteSecondMS
        let to = TimeInterval(end).hourMinuteSecondMS
        return "\(from) --> \(to)\n\(text)"
    }
}
|
||||
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"info" : {
|
||||
"author" : "xcode",
|
||||
"version" : 1
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
//
|
||||
// SherpaOnnxSubtitleApp.swift
|
||||
// SherpaOnnxSubtitle
|
||||
//
|
||||
// Created by knight on 2023/9/23.
|
||||
//
|
||||
|
||||
import SwiftUI
|
||||
|
||||
/// Application entry point: a single window group hosting `ContentView`.
@main
struct SherpaOnnxSubtitleApp: App {
    var body: some Scene {
        WindowGroup { ContentView() }
    }
}
|
||||
@@ -0,0 +1,168 @@
|
||||
//
|
||||
// SubtitleViewModel.swift
|
||||
// SherpaOnnxSubtitle
|
||||
//
|
||||
// Created by knight on 2023/9/23.
|
||||
//
|
||||
|
||||
import AVFoundation
|
||||
import PhotosUI
|
||||
import SwiftUI
|
||||
|
||||
/// Progress of the pick → transcribe → export flow shown by the UI.
enum LoadState {
    /// Idle; nothing is in progress.
    case initial
    /// A file is being picked or processed.
    case loading
    /// The picked item has been transferred and is available locally.
    case loaded(Audio)
    /// Processing or exporting has finished.
    case done
    /// Loading, transcription, or export failed.
    case failed
}
|
||||
|
||||
/// View model that transcribes an audio file into SRT subtitle text.
///
/// It owns the offline recognizer and the voice-activity detector (VAD),
/// publishes the UI state (`loadState`, picker/importer/exporter flags), and
/// exposes the generated subtitles as an exportable `Document`.
class SubtitleViewModel: ObservableObject {
    /// Reasons subtitle generation can stop before anything is exported.
    private enum GenerationError: LocalizedError {
        case engineUnavailable
        case unsupportedFormat(expectedRate: Int, sampleRate: Double, channels: UInt32)
        case bufferUnavailable
        case invalidWindowSize(Int)

        var errorDescription: String? {
            switch self {
            case .engineUnavailable:
                return "recognizer or VAD is not initialized"
            case let .unsupportedFormat(expectedRate, sampleRate, channels):
                return "audio must be mono Float32 at \(expectedRate) Hz, "
                    + "got \(channels) channel(s) at \(sampleRate) Hz"
            case .bufferUnavailable:
                return "audio file is empty or too large to load"
            case let .invalidWindowSize(size):
                return "invalid VAD window size \(size)"
            }
        }
    }

    /// Which recognizer model to load: `"whisper"` or `"paraformer"`.
    var modelType = "whisper"
    /// Sample rate, in Hz, that input audio must have.
    let sampleRate = 16000

    var modelConfig: SherpaOnnxOfflineModelConfig?
    // modelType = "paraformer"

    var recognizer: SherpaOnnxOfflineRecognizer?

    var vadModelConfig: SherpaOnnxVadModelConfig?
    var vad: SherpaOnnxVoiceActivityDetectorWrapper?

    @Published var loadState: LoadState = .initial

    /// Item currently chosen in the photo picker, if any.
    @Published var selectedItem: PhotosPickerItem? = nil

    /// Drives presentation of the file importer.
    @Published var importNow: Bool = false {
        didSet {
            // Only opening the importer starts a load; dismissing it
            // (true -> false) must not flip the state back to `.loading`.
            if importNow {
                loadState = .loading
            }
        }
    }

    /// Drives presentation of the file exporter.
    @Published var exportNow: Bool = false

    /// Default file name offered by the exporter.
    var srtName: String = "unknown.srt"
    /// Generated SRT text.
    var content: String = ""

    /// The generated SRT text wrapped as a UTF-8 encoded document.
    var srtDocument: Document {
        Document(data: Data(content.utf8))
    }

    /// Whether a photo-library item is currently selected.
    var hasAudio: Bool {
        selectedItem != nil
    }

    /// Builds the recognizer for `modelType` and the Silero VAD.
    ///
    /// On an unsupported `modelType` the recognizer and VAD stay `nil`, and
    /// `generateSRT(from:)` will then report failure.
    init() {
        if modelType == "whisper" {
            // for English
            self.modelConfig = getNonStreamingWhisperTinyEn()
        } else if modelType == "paraformer" {
            // for Chinese
            self.modelConfig = getNonStreamingZhParaformer20230328()
        } else {
            print("Please specify a supported modelType \(modelType)")
            return
        }

        let featConfig = sherpaOnnxFeatureConfig(
            sampleRate: sampleRate,
            featureDim: 80
        )

        guard let modelConfig else {
            return
        }

        var config = sherpaOnnxOfflineRecognizerConfig(
            featConfig: featConfig,
            modelConfig: modelConfig
        )

        recognizer = SherpaOnnxOfflineRecognizer(config: &config)

        let sileroVadConfig = sherpaOnnxSileroVadModelConfig(
            model: getResource("silero_vad", "onnx")
        )

        self.vadModelConfig = sherpaOnnxVadModelConfig(sileroVad: sileroVadConfig)
        guard var vadModelConfig else {
            return
        }
        vad = SherpaOnnxVoiceActivityDetectorWrapper(
            config: &vadModelConfig, buffer_size_in_seconds: 120
        )
    }

    /// Returns the UI to its idle state.
    func restoreState() {
        loadState = .initial
    }

    /// Transcribes `file` and, on success, presents the exporter.
    ///
    /// On success `content` holds the SRT text, `srtName` is derived from the
    /// file name, `exportNow` is raised and `loadState` becomes `.done`.
    /// On any failure the error is logged, nothing is exported, and
    /// `loadState` becomes `.failed`.
    ///
    /// - Parameter file: Audio file to transcribe; it must be mono at
    ///   `sampleRate` Hz.
    func generateSRT(from file: URL) {
        print("gen srt from: \(file)")
        content = ""

        do {
            let segments = try transcribe(file)
            content = segments.enumerated()
                .map { index, segment in "\(index + 1)\n\(segment)" }
                .joined(separator: "\n\n")
        } catch {
            print("error: \(error.localizedDescription)")
            loadState = .failed
            return
        }

        // Name the file before raising the flag so the exporter never sees a
        // stale default file name.
        srtName = "\(file.lastPathComponent).srt"
        exportNow = true
        loadState = .done
    }

    /// Reads `file`, splits it into speech segments with the VAD, and decodes
    /// each segment with the recognizer.
    ///
    /// - Throws: `GenerationError` when the engine is missing or the audio is
    ///   unusable, or whatever `AVAudioFile` throws while reading.
    private func transcribe(_ file: URL) throws -> [SpeechSegment] {
        guard let recognizer, let vad, let vadModelConfig else {
            throw GenerationError.engineUnavailable
        }

        let audioFile = try AVAudioFile(forReading: file)
        let audioFormat = audioFile.processingFormat

        // Checked at runtime rather than with `assert`, which is compiled out
        // of release builds and crashes debug builds on a wrong file.
        guard audioFormat.sampleRate == Double(sampleRate),
              audioFormat.channelCount == 1,
              audioFormat.commonFormat == .pcmFormatFloat32
        else {
            throw GenerationError.unsupportedFormat(
                expectedRate: sampleRate,
                sampleRate: audioFormat.sampleRate,
                channels: audioFormat.channelCount
            )
        }

        guard let frameCount = AVAudioFrameCount(exactly: audioFile.length),
              frameCount > 0,
              let buffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: frameCount)
        else {
            throw GenerationError.bufferUnavailable
        }

        try audioFile.read(into: buffer)
        let samples: [Float] = buffer.array()

        let windowSize = Int(vadModelConfig.silero_vad.window_size)
        guard windowSize > 0 else {
            throw GenerationError.invalidWindowSize(windowSize)
        }

        var segments: [SpeechSegment] = []

        // Walk the samples one full window at a time by offset, so the
        // remainder is not re-copied on every step.
        // NOTE(review): samples after the last full window are never fed to
        // the VAD, and a segment still open at end of file is not emitted —
        // confirm whether the wrapper offers a flush/reset call.
        for offset in stride(from: 0, through: samples.count - windowSize, by: windowSize) {
            vad.acceptWaveform(samples: Array(samples[offset ..< offset + windowSize]))

            while !vad.isEmpty() {
                let speech = vad.front()
                vad.pop()
                let result = recognizer.decode(samples: speech.samples)

                let segment = SpeechSegment(
                    start: Float(speech.start) / Float(sampleRate),
                    duration: Float(speech.samples.count) / Float(sampleRate),
                    text: result.text
                )
                segments.append(segment)
                print(segment)
            }
        }

        return segments
    }
}
|
||||
Reference in New Issue
Block a user