Support non-streaming zipformer CTC ASR models (#2340)

This PR adds support for non-streaming Zipformer CTC ASR models across multiple language bindings, WebAssembly, examples, and CI workflows.
- Introduces a new OfflineZipformerCtcModelConfig in the C/C++, Python, Swift, Java, Kotlin, Go, Dart, Pascal, and C# APIs
- Updates initialization, freeing, and recognition logic to include Zipformer CTC in WASM and Node.js
- Adds example scripts and CI steps for downloading, building, and running Zipformer CTC models

The model documentation is available at https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html
2025-07-04 15:57:07 +08:00
parent ef16455cb5
commit 3bf986d08d
71 changed files with 2121 additions and 68 deletions
--- a/swift-api-examples/.gitignore
+++ b/swift-api-examples/.gitignore
@@ -16,3 +16,6 @@ tts-kokoro-en
 tts-kokoro-zh-en
 speech-enhancement-gtcrn
 decode-file-sense-voice-with-hr
+test-version
+zipformer-ctc-asr
+dolphin-ctc-asr
--- a/swift-api-examples/SherpaOnnx.swift
+++ b/swift-api-examples/SherpaOnnx.swift
@@ -346,6 +346,14 @@ func sherpaOnnxOfflineParaformerModelConfig(
  )
 }

+/// Creates the C-level config for an offline (non-streaming) Zipformer CTC model.
+/// `model` is converted with `toCPointer` before being stored; it defaults to "".
+func sherpaOnnxOfflineZipformerCtcModelConfig(
+  model: String = ""
+) -> SherpaOnnxOfflineZipformerCtcModelConfig {
+  SherpaOnnxOfflineZipformerCtcModelConfig(model: toCPointer(model))
+}
+
 func sherpaOnnxOfflineNemoEncDecCtcModelConfig(
  model: String = ""
 ) -> SherpaOnnxOfflineNemoEncDecCtcModelConfig {
@@ -449,7 +457,9 @@ func sherpaOnnxOfflineModelConfig(
  senseVoice: SherpaOnnxOfflineSenseVoiceModelConfig = sherpaOnnxOfflineSenseVoiceModelConfig(),
  moonshine: SherpaOnnxOfflineMoonshineModelConfig = sherpaOnnxOfflineMoonshineModelConfig(),
  fireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig = sherpaOnnxOfflineFireRedAsrModelConfig(),
-  dolphin: SherpaOnnxOfflineDolphinModelConfig = sherpaOnnxOfflineDolphinModelConfig()
+  dolphin: SherpaOnnxOfflineDolphinModelConfig = sherpaOnnxOfflineDolphinModelConfig(),
+  zipformerCtc: SherpaOnnxOfflineZipformerCtcModelConfig =
+    sherpaOnnxOfflineZipformerCtcModelConfig()
 ) -> SherpaOnnxOfflineModelConfig {
  return SherpaOnnxOfflineModelConfig(
    transducer: transducer,
@@ -468,7 +478,8 @@ func sherpaOnnxOfflineModelConfig(
    sense_voice: senseVoice,
    moonshine: moonshine,
    fire_red_asr: fireRedAsr,
-    dolphin: dolphin
+    dolphin: dolphin,
+    zipformer_ctc: zipformerCtc
  )
 }

--- a/swift-api-examples/run-zipformer-ctc-asr.sh
+++ b/swift-api-examples/run-zipformer-ctc-asr.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+# Builds (when no binary exists yet) and runs the Swift Zipformer CTC ASR example.
+set -ex  # -e: stop at the first failing command; -x: print each command as it runs
+
+if [ ! -d ../build-swift-macos ]; then  # the build directory must already exist
+  echo "Please run ../build-swift-macos.sh first!"
+  exit 1
+fi
+
+if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx ]; then
+  echo "Please download the pre-trained model for testing."
+  echo "You can refer to"
+  echo ""
+  echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese"
+  echo ""
+  echo "for help"
+  # The model file is missing: download the release archive, unpack it, delete the archive.
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+
+  tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+  rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+  ls -lh sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03
+fi
+
+if [ ! -e ./zipformer-ctc-asr ]; then  # compile only when no binary exists yet
+  # Note: We use -lc++ to link against libc++ instead of libstdc++
+  swiftc \
+    -lc++ \
+    -I ../build-swift-macos/install/include \
+    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
+    ./zipformer-ctc-asr.swift  ./SherpaOnnx.swift \
+    -L ../build-swift-macos/install/lib/ \
+    -l sherpa-onnx \
+    -l onnxruntime \
+    -o zipformer-ctc-asr
+
+  strip zipformer-ctc-asr
+else
+  echo "./zipformer-ctc-asr exists - skip building"
+fi
+# Put ../build-swift-macos/install/lib first on DYLD_LIBRARY_PATH, then run the example.
+export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
+./zipformer-ctc-asr
--- a/swift-api-examples/zipformer-ctc-asr.swift
+++ b/swift-api-examples/zipformer-ctc-asr.swift
@@ -0,0 +1,66 @@
+import AVFoundation
+
+extension AudioBuffer {
+  /// Copies this buffer's contents into a new array, viewing them as `Float` values
+  /// through an `UnsafeBufferPointer` built from the buffer.
+  func array() -> [Float] { Array(UnsafeBufferPointer<Float>(self)) }
+}
+
+extension AVAudioPCMBuffer {
+  /// Returns the contents of `mBuffers` in this buffer's `audioBufferList`
+  /// as a `[Float]`, by calling `array()` on that `AudioBuffer`.
+  func array() -> [Float] { audioBufferList.pointee.mBuffers.array() }
+}
+
+/// Recognizes `test_wavs/0.wav` with the Zipformer CTC model and prints the text
+/// (plus timestamps, when any). Traps if the audio cannot be read or is not mono Float32.
+func run() {
+  let modelDir = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03"
+  let zipformerCtc = sherpaOnnxOfflineZipformerCtcModelConfig(
+    model: "\(modelDir)/model.int8.onnx"
+  )
+  let modelConfig = sherpaOnnxOfflineModelConfig(
+    tokens: "\(modelDir)/tokens.txt",
+    debug: 0,
+    zipformerCtc: zipformerCtc
+  )
+
+  let featConfig = sherpaOnnxFeatureConfig(sampleRate: 16000, featureDim: 80)
+  var config = sherpaOnnxOfflineRecognizerConfig(
+    featConfig: featConfig,
+    modelConfig: modelConfig
+  )
+
+  let recognizer = SherpaOnnxOfflineRecognizer(config: &config)
+
+  // `try!` is kept on purpose: a missing or unreadable test wave aborts with the error.
+  let fileURL = URL(fileURLWithPath: "\(modelDir)/test_wavs/0.wav")
+  let audioFile = try! AVAudioFile(forReading: fileURL)
+
+  // Checked in optimized builds too: only `mBuffers` of the buffer list is read below.
+  let audioFormat = audioFile.processingFormat
+  precondition(audioFormat.channelCount == 1, "expected mono audio")
+  precondition(audioFormat.commonFormat == .pcmFormatFloat32, "expected Float32 samples")
+
+  // Allocate a buffer for the whole file; the initializer is failable, so unwrap once.
+  let frameCount = AVAudioFrameCount(audioFile.length)
+  guard let buffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: frameCount)
+  else {
+    fatalError("Cannot allocate an audio buffer of \(frameCount) frames")
+  }
+  try! audioFile.read(into: buffer)
+
+  let samples = buffer.array()
+  let result = recognizer.decode(samples: samples, sampleRate: Int(audioFormat.sampleRate))
+  print("\nresult is:\n\(result.text)")
+  if !result.timestamps.isEmpty {
+    print("\ntimestamps is:\n\(result.timestamps)")
+  }
+}
+
+/// Entry point of the example program.
+/// `main()` does nothing except call `run()`.
+@main
+struct App {
+  static func main() { run() }
+}