This repository has been archived on 2025-08-26. You can view files and clone it, but cannot push or open issues or pull requests.
Files
enginex-mr_series-sherpa-onnx/ios-swiftui/SherpaOnnx2Pass/SherpaOnnx2Pass/SherpaOnnxViewModel.swift
2024-07-10 17:05:26 +08:00

253 lines
7.9 KiB
Swift

//
// SherpaOnnxViewModel.swift
// SherpaOnnx
//
// Created by knight on 2023/4/5.
//
import Foundation
import AVFoundation
/// Recording state published by `SherpaOnnxViewModel`.
///
/// `toggleRecorder()` starts the recorder when the state is `.stop` and
/// stops it otherwise.
enum Status {
    case stop, recording
}
/// View model for the two-pass speech recognition demo.
///
/// Microphone audio is converted to 16 kHz mono Float32 and fed to the
/// streaming recognizer, whose running text is shown as the "last sentence".
/// When the streaming recognizer reports an endpoint, the audio buffered for
/// that utterance is decoded again by the offline recognizer and the result is
/// appended to `sentences`.
class SherpaOnnxViewModel: ObservableObject {
    /// Whether the recorder is currently running.
    @Published var status: Status = .stop
    /// Text displayed by the UI; refreshed through `updateLabel()`.
    @Published var subtitles: String = ""

    /// Sentences finalized at endpoints, oldest first.
    var sentences: [String] = []
    /// Converted audio chunks accumulated for the utterance in progress.
    var samplesBuffer = [[Float]]()
    var audioEngine: AVAudioEngine? = nil
    /// First-pass (streaming) recognizer.
    var recognizer: SherpaOnnxRecognizer! = nil
    /// Second-pass (offline) recognizer.
    var offlineRecognizer: SherpaOnnxOfflineRecognizer! = nil
    /// Latest streaming text for the utterance that has not been finalized yet.
    var lastSentence: String = ""

    // let maxSentence: Int = 10 // for Chinese
    /// Maximum number of finalized sentences included in `results`.
    let maxSentence: Int = 6 // for English

    // private let tailSampleCount = 12000 // For Chinese
    /// Number of trailing samples that are excluded from the second pass at an
    /// endpoint and carried over as the start of the next utterance.
    private let tailSampleCount = 10000 // For English

    /// The text to display: up to `maxSentence` of the most recent finalized
    /// sentences, followed by the in-progress sentence (if any), one per line,
    /// each lower-cased and prefixed with its index.
    var results: String {
        if sentences.isEmpty && lastSentence.isEmpty {
            return ""
        }
        if sentences.isEmpty {
            return "0: \(lastSentence.lowercased())"
        }

        let start = max(sentences.count - maxSentence, 0)
        var lines = sentences.enumerated().dropFirst(start).map {
            "\($0.offset): \($0.element.lowercased())"
        }
        if !lastSentence.isEmpty {
            lines.append("\(sentences.count): \(lastSentence.lowercased())")
        }
        return lines.joined(separator: "\n")
    }

    /// Publishes the current `results` to `subtitles` on the main queue.
    ///
    /// The text is captured on the calling thread so that the main queue never
    /// reads `sentences` / `lastSentence` while the audio tap is mutating them.
    func updateLabel() {
        let text = results
        DispatchQueue.main.async { [weak self] in
            self?.subtitles = text
        }
    }

    init() {
        initRecognizer()
        initOfflineRecognizer()
        initRecorder()
    }

    /// Creates the streaming (first-pass) recognizer.
    private func initRecognizer() {
        // Please select one model that is best suitable for you.
        //
        // You can also modify Model.swift to add new pre-trained models from
        // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html

        // let modelConfig = getBilingualStreamingZhEnZipformer20230220()
        /* let modelConfig = getStreamingZh14MZipformer20230223() */
        let modelConfig = getStreamingEn20MZipformer20230217()

        let featConfig = sherpaOnnxFeatureConfig(
            sampleRate: 16000,
            featureDim: 80)

        var config = sherpaOnnxOnlineRecognizerConfig(
            featConfig: featConfig,
            modelConfig: modelConfig,
            enableEndpoint: true,
            rule1MinTrailingSilence: 2.4,
            // rule2MinTrailingSilence: 1.2, // for Chinese
            rule2MinTrailingSilence: 0.5, // for English
            rule3MinUtteranceLength: 30,
            decodingMethod: "greedy_search",
            maxActivePaths: 4
        )
        recognizer = SherpaOnnxRecognizer(config: &config)
    }

    /// Creates the offline (second-pass) recognizer.
    private func initOfflineRecognizer() {
        // let modelConfig = getNonStreamingZhParaformer20230914()
        let modelConfig = getNonStreamingWhisperTinyEn()
        // let modelConfig = getNonStreamingEnZipformer20230504()

        let featConfig = sherpaOnnxFeatureConfig(
            sampleRate: 16000,
            featureDim: 80)

        var config = sherpaOnnxOfflineRecognizerConfig(
            featConfig: featConfig,
            modelConfig: modelConfig,
            decodingMethod: "greedy_search",
            maxActivePaths: 4
        )
        offlineRecognizer = SherpaOnnxOfflineRecognizer(config: &config)
    }

    /// Creates the audio engine and installs a tap on its input node that
    /// converts every captured buffer to 16 kHz mono Float32 and hands the
    /// samples to the recognizers.
    private func initRecorder() {
        print("init recorder")
        let engine = AVAudioEngine()
        audioEngine = engine

        let inputNode = engine.inputNode
        let bus = 0
        let inputFormat = inputNode.outputFormat(forBus: bus)

        // Both initializers are failable; bail out with a message instead of
        // crashing when the input format cannot be converted.
        guard
            let outputFormat = AVAudioFormat(
                commonFormat: .pcmFormatFloat32,
                sampleRate: 16000, channels: 1,
                interleaved: false),
            let converter = AVAudioConverter(from: inputFormat, to: outputFormat)
        else {
            print("Failed to create an audio converter for input format: \(inputFormat)")
            return
        }

        // Capture self weakly: this object owns the engine on which the tap is
        // installed, so a strong capture would keep the view model alive.
        inputNode.installTap(
            onBus: bus,
            bufferSize: 1024,
            format: inputFormat
        ) { [weak self] buffer, _ in
            guard let self = self else { return }

            // Hand the captured buffer to the converter exactly once.
            var newBufferAvailable = true
            let inputCallback: AVAudioConverterInputBlock = { _, outStatus in
                if newBufferAvailable {
                    outStatus.pointee = .haveData
                    newBufferAvailable = false
                    return buffer
                } else {
                    outStatus.pointee = .noDataNow
                    return nil
                }
            }

            // Output capacity scales the input length by the sample-rate ratio.
            let frameCapacity =
                AVAudioFrameCount(outputFormat.sampleRate)
                * buffer.frameLength
                / AVAudioFrameCount(buffer.format.sampleRate)
            guard
                let convertedBuffer = AVAudioPCMBuffer(
                    pcmFormat: outputFormat,
                    frameCapacity: frameCapacity)
            else {
                return
            }

            var error: NSError?
            let conversionStatus = converter.convert(
                to: convertedBuffer,
                error: &error, withInputFrom: inputCallback)
            if conversionStatus == .error {
                print("Audio conversion failed: \(String(describing: error))")
                return
            }

            let array = convertedBuffer.array()
            if !array.isEmpty {
                self.processSamples(array)
            }
        }
    }

    /// Feeds one chunk of converted audio to the streaming recognizer, updates
    /// the in-progress sentence and finalizes the utterance on an endpoint.
    private func processSamples(_ array: [Float]) {
        samplesBuffer.append(array)
        recognizer.acceptWaveform(samples: array)
        while recognizer.isReady() {
            recognizer.decode()
        }

        let isEndpoint = recognizer.isEndpoint()
        let text = recognizer.getResult().text

        if !text.isEmpty && lastSentence != text {
            lastSentence = text
            updateLabel()
            print(text)
        }

        guard isEndpoint else { return }

        if text.isEmpty {
            // Nothing was recognized: drop the buffered audio.
            samplesBuffer = [[Float]]()
        } else {
            finalizeUtterance(firstPassText: text)
        }
        recognizer.reset()
    }

    /// Runs the offline recognizer over the buffered audio, excluding the last
    /// `tailSampleCount` samples, appends the result to `sentences` and keeps
    /// the excluded tail as the beginning of the next utterance.
    ///
    /// - Parameter firstPassText: The streaming result for this utterance; used
    ///   as the final sentence when the buffer is too short to leave any audio
    ///   for the second pass.
    private func finalizeUtterance(firstPassText: String) {
        let samples = samplesBuffer.flatMap { $0 }

        // Clamp at zero so a buffer shorter than the tail cannot form an
        // invalid range.
        let headCount = max(samples.count - tailSampleCount, 0)

        let finalText: String
        if headCount > 0 {
            finalText = offlineRecognizer.decode(samples: Array(samples[..<headCount])).text
        } else {
            finalText = firstPassText
        }

        lastSentence = ""
        sentences.append(finalText)
        updateLabel()

        samplesBuffer = [Array(samples[headCount...])]
    }

    /// Starts recording when stopped and stops it when recording.
    ///
    /// `status` only becomes `.recording` if the audio engine actually started.
    public func toggleRecorder() {
        switch status {
        case .stop:
            if startRecorder() {
                status = .recording
            }
        case .recording:
            stopRecorder()
            status = .stop
        }
    }

    /// Clears the previous transcript and starts the audio engine.
    ///
    /// - Returns: `true` when the engine started, `false` when there is no
    ///   engine or starting it threw.
    private func startRecorder() -> Bool {
        lastSentence = ""
        sentences = []
        samplesBuffer = [[Float]]()
        updateLabel()

        guard let engine = audioEngine else {
            return false
        }

        do {
            try engine.start()
        } catch let error as NSError {
            print("Got an error starting audioEngine: \(error.domain), \(error)")
            return false
        }
        print("started")
        return true
    }

    /// Stops the audio engine.
    private func stopRecorder() {
        audioEngine?.stop()
        print("stopped")
    }
}