Fix modified beam search for iOS and android (#76)

* Use Int type for sampling rate * Fix swift * Fix iOS
2023-03-03 15:18:31 +08:00
parent 7f72c13d9a
commit 5f31b22c12
15 changed files with 125 additions and 93 deletions
--- a/ios-swift/SherpaOnnx/SherpaOnnx/ViewController.swift
+++ b/ios-swift/SherpaOnnx/SherpaOnnx/ViewController.swift
@@ -23,10 +23,10 @@ extension AVAudioPCMBuffer {
 class ViewController: UIViewController {
    @IBOutlet weak var resultLabel: UILabel!
    @IBOutlet weak var recordBtn: UIButton!
-    
+
    var audioEngine: AVAudioEngine? = nil
    var recognizer: SherpaOnnxRecognizer! = nil
-    
+
    /// It saves the decoded results so far
    var sentences: [String] = [] {
        didSet {
@@ -42,7 +42,7 @@ class ViewController: UIViewController {
        if sentences.isEmpty {
            return "0: \(lastSentence.lowercased())"
        }
-        
+
        let start = max(sentences.count - maxSentence, 0)
        if lastSentence.isEmpty {
            return sentences.enumerated().map { (index, s) in "\(index): \(s.lowercased())" }[start...]
@@ -52,23 +52,23 @@ class ViewController: UIViewController {
                .joined(separator: "\n") + "\n\(sentences.count): \(lastSentence.lowercased())"
        }
    }
-    
+
    func updateLabel() {
        DispatchQueue.main.async {
            self.resultLabel.text = self.results
        }
    }
-    
+
    override func viewDidLoad() {
        super.viewDidLoad()
        // Do any additional setup after loading the view.
-        
+
        resultLabel.text = "ASR with Next-gen Kaldi\n\nSee https://github.com/k2-fsa/sherpa-onnx\n\nPress the Start button to run!"
        recordBtn.setTitle("Start", for: .normal)
        initRecognizer()
        initRecorder()
    }
-    
+
    @IBAction func onRecordBtnClick(_ sender: UIButton) {
        if recordBtn.currentTitle == "Start" {
            startRecorder()
@@ -78,30 +78,32 @@ class ViewController: UIViewController {
            recordBtn.setTitle("Start", for: .normal)
        }
    }
-    
+
    func initRecognizer() {
        // Please select one model that is best suitable for you.
        //
        // You can also modify Model.swift to add new pre-trained models from
        // https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html
-        
+
        let modelConfig = getBilingualStreamZhEnZipformer20230220()
-        
+
        let featConfig = sherpaOnnxFeatureConfig(
            sampleRate: 16000,
            featureDim: 80)
-        
+
        var config = sherpaOnnxOnlineRecognizerConfig(
            featConfig: featConfig,
            modelConfig: modelConfig,
            enableEndpoint: true,
            rule1MinTrailingSilence: 2.4,
            rule2MinTrailingSilence: 0.8,
-            rule3MinUtteranceLength: 30
+            rule3MinUtteranceLength: 30,
+            decodingMethod: "greedy_search",
+            maxActivePaths: 4
        )
        recognizer = SherpaOnnxRecognizer(config: &config)
    }
-    
+
    func initRecorder() {
        print("init recorder")
        audioEngine = AVAudioEngine()
@@ -112,9 +114,9 @@ class ViewController: UIViewController {
            commonFormat: .pcmFormatFloat32,
            sampleRate: 16000, channels: 1,
            interleaved: false)!
-        
+
        let converter = AVAudioConverter(from: inputFormat!, to: outputFormat)!
-        
+
        inputNode!.installTap(
            onBus: bus,
            bufferSize: 1024,
@@ -122,34 +124,34 @@ class ViewController: UIViewController {
        ) {
            (buffer: AVAudioPCMBuffer, when: AVAudioTime) in
            var newBufferAvailable = true
-            
+
            let inputCallback: AVAudioConverterInputBlock = {
                inNumPackets, outStatus in
                if newBufferAvailable {
                    outStatus.pointee = .haveData
                    newBufferAvailable = false
-                    
+
                    return buffer
                } else {
                    outStatus.pointee = .noDataNow
                    return nil
                }
            }
-            
+
            let convertedBuffer = AVAudioPCMBuffer(
                pcmFormat: outputFormat,
                frameCapacity:
                    AVAudioFrameCount(outputFormat.sampleRate)
                * buffer.frameLength
                / AVAudioFrameCount(buffer.format.sampleRate))!
-            
+
            var error: NSError?
            let _ = converter.convert(
                to: convertedBuffer,
                error: &error, withInputFrom: inputCallback)
-            
+
            // TODO(fangjun): Handle status != haveData
-            
+
            let array = convertedBuffer.array()
            if !array.isEmpty {
                self.recognizer.acceptWaveform(samples: array)
@@ -158,13 +160,13 @@ class ViewController: UIViewController {
                }
                let isEndpoint = self.recognizer.isEndpoint()
                let text = self.recognizer.getResult().text
-                
+
                if !text.isEmpty && self.lastSentence != text {
                    self.lastSentence = text
                    self.updateLabel()
                    print(text)
                }
-                
+
                if isEndpoint {
                    if !text.isEmpty {
                        let tmp = self.lastSentence
@@ -175,13 +177,13 @@ class ViewController: UIViewController {
                }
            }
        }
-        
+
    }
-    
+
    func startRecorder() {
        lastSentence = ""
        sentences = []
-        
+
        do {
            try self.audioEngine?.start()
        } catch let error as NSError {
@@ -189,7 +191,7 @@ class ViewController: UIViewController {
        }
        print("started")
    }
-    
+
    func stopRecorder() {
        audioEngine?.stop()
        print("stopped")