Add Swift API for spoken language identification. (#696)
This commit is contained in:
3
.github/scripts/test-swift.sh
vendored
3
.github/scripts/test-swift.sh
vendored
@@ -7,6 +7,9 @@ echo "pwd: $PWD"
|
||||
cd swift-api-examples
|
||||
ls -lh
|
||||
|
||||
./run-spoken-language-identification.sh
|
||||
rm -rf sherpa-onnx-whisper*
|
||||
|
||||
mkdir -p /Users/fangjun/Desktop
|
||||
pushd /Users/fangjun/Desktop
|
||||
curl -SL -O https://huggingface.co/csukuangfj/test-data/resolve/main/Obama.wav
|
||||
|
||||
1
swift-api-examples/.gitignore
vendored
1
swift-api-examples/.gitignore
vendored
@@ -1,6 +1,7 @@
|
||||
decode-file
|
||||
decode-file-non-streaming
|
||||
generate-subtitles
|
||||
spoken-language-identification
|
||||
tts
|
||||
vits-vctk
|
||||
sherpa-onnx-paraformer-zh-2023-09-14
|
||||
|
||||
@@ -713,3 +713,86 @@ class SherpaOnnxOfflineTtsWrapper {
|
||||
return SherpaOnnxGeneratedAudioWrapper(audio: audio)
|
||||
}
|
||||
}
|
||||
|
||||
// spoken language identification
|
||||
|
||||
/// Convert a whisper model configuration for spoken language identification
/// from Swift types to the corresponding C struct.
///
/// - Parameters:
///   - encoder: Path to the whisper encoder model file.
///   - decoder: Path to the whisper decoder model file.
///   - tailPaddings: Tail padding value forwarded to the C API;
///     -1 presumably selects the library default — confirm in the C header.
/// - Returns: A populated `SherpaOnnxSpokenLanguageIdentificationWhisperConfig`.
func sherpaOnnxSpokenLanguageIdentificationWhisperConfig(
  encoder: String,
  decoder: String,
  tailPaddings: Int = -1
) -> SherpaOnnxSpokenLanguageIdentificationWhisperConfig {
  SherpaOnnxSpokenLanguageIdentificationWhisperConfig(
    encoder: toCPointer(encoder),
    decoder: toCPointer(decoder),
    tail_paddings: Int32(tailPaddings)
  )
}
|
||||
|
||||
/// Convert a spoken-language-identification configuration from Swift types
/// to the corresponding C struct.
///
/// - Parameters:
///   - whisper: The whisper model configuration.
///   - numThreads: Number of onnxruntime threads to use.
///   - debug: Non-zero enables debug output from the C library.
///   - provider: Execution provider name, e.g. "cpu".
/// - Returns: A populated `SherpaOnnxSpokenLanguageIdentificationConfig`.
func sherpaOnnxSpokenLanguageIdentificationConfig(
  whisper: SherpaOnnxSpokenLanguageIdentificationWhisperConfig,
  numThreads: Int = 1,
  debug: Int = 0,
  provider: String = "cpu"
) -> SherpaOnnxSpokenLanguageIdentificationConfig {
  SherpaOnnxSpokenLanguageIdentificationConfig(
    whisper: whisper,
    num_threads: Int32(numThreads),
    debug: Int32(debug),
    provider: toCPointer(provider)
  )
}
|
||||
|
||||
/// Swift wrapper around the C result struct produced by
/// `SherpaOnnxSpokenLanguageIdentificationCompute`.
///
/// Owns the result pointer and releases it via the C API in `deinit`.
class SherpaOnnxSpokenLanguageIdentificationResultWrapper {
  /// A pointer to the underlying counterpart in C
  let result: UnsafePointer<SherpaOnnxSpokenLanguageIdentificationResult>!

  /// Return the detected language.
  /// en for English
  /// zh for Chinese
  /// es for Spanish
  /// de for German
  /// etc.
  var lang: String {
    return String(cString: result.pointee.lang)
  }

  /// Takes ownership of `result`; the struct is destroyed when this
  /// wrapper is deallocated.
  init(result: UnsafePointer<SherpaOnnxSpokenLanguageIdentificationResult>!) {
    self.result = result
  }

  deinit {
    // Guard against a nil pointer (e.g. if compute failed upstream).
    if let result {
      SherpaOnnxDestroySpokenLanguageIdentificationResult(result)
    }
  }
}
|
||||
|
||||
/// Swift wrapper around the C spoken language identification engine.
///
/// Owns the underlying C object and destroys it in `deinit`.
class SherpaOnnxSpokenLanguageIdentificationWrapper {
  /// A pointer to the underlying counterpart in C
  let slid: OpaquePointer!

  /// Create the engine from a C config; build the config with
  /// `sherpaOnnxSpokenLanguageIdentificationConfig`.
  init(
    config: UnsafePointer<SherpaOnnxSpokenLanguageIdentificationConfig>!
  ) {
    slid = SherpaOnnxCreateSpokenLanguageIdentification(config)
  }

  deinit {
    if let slid {
      SherpaOnnxDestroySpokenLanguageIdentification(slid)
    }
  }

  /// Identify the spoken language of the given audio.
  ///
  /// - Parameters:
  ///   - samples: Mono audio samples (assumed float PCM normalized to
  ///     [-1, 1] per the C API convention — TODO confirm).
  ///   - sampleRate: Sample rate of `samples` in Hz.
  /// - Returns: A result wrapper; read `lang` for the detected language.
  func decode(samples: [Float], sampleRate: Int = 16000)
    -> SherpaOnnxSpokenLanguageIdentificationResultWrapper
  {
    // A fresh offline stream is created per call, fed the whole waveform,
    // and destroyed once the result has been computed.
    let stream: OpaquePointer! = SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream(slid)
    AcceptWaveformOffline(stream, Int32(sampleRate), samples, Int32(samples.count))

    let result: UnsafePointer<SherpaOnnxSpokenLanguageIdentificationResult>? =
      SherpaOnnxSpokenLanguageIdentificationCompute(
        slid,
        stream)

    DestroyOfflineStream(stream)
    // The wrapper takes ownership of `result` and frees it in its deinit.
    return SherpaOnnxSpokenLanguageIdentificationResultWrapper(result: result)
  }
}
|
||||
|
||||
36
swift-api-examples/run-spoken-language-identification.sh
Executable file
36
swift-api-examples/run-spoken-language-identification.sh
Executable file
@@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env bash
# Build and run the Swift spoken language identification example on macOS.
#
# Prerequisite: ../build-swift-macos.sh must have been run so that the
# sherpa-onnx headers and libraries exist under ../build-swift-macos/install.

set -ex

if [ ! -d ../build-swift-macos ]; then
  echo "Please run ../build-swift-macos.sh first!"
  exit 1
fi

# Download and unpack a whisper tiny model the first time this runs.
if [ ! -d ./sherpa-onnx-whisper-tiny ]; then
  echo "Download a pre-trained model for testing."

  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
  tar xvf sherpa-onnx-whisper-tiny.tar.bz2
  rm sherpa-onnx-whisper-tiny.tar.bz2
fi

# Compile the example only when the binary does not already exist.
if [ ! -e ./spoken-language-identification ]; then
  # Note: We use -lc++ to link against libc++ instead of libstdc++
  swiftc \
    -lc++ \
    -I ../build-swift-macos/install/include \
    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
    ./spoken-language-identification.swift ./SherpaOnnx.swift \
    -L ../build-swift-macos/install/lib/ \
    -l sherpa-onnx \
    -l onnxruntime \
    -o spoken-language-identification

  strip spoken-language-identification
else
  echo "./spoken-language-identification exists - skip building"
fi

# Make the sherpa-onnx dynamic libraries discoverable at run time.
export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH

./spoken-language-identification
|
||||
57
swift-api-examples/spoken-language-identification.swift
Normal file
57
swift-api-examples/spoken-language-identification.swift
Normal file
@@ -0,0 +1,57 @@
|
||||
import AVFoundation
|
||||
|
||||
extension AudioBuffer {
  /// Copy this buffer's contents into a Swift `[Float]`.
  /// Assumes the buffer actually holds 32-bit float samples — the caller
  /// asserts the audio format (pcmFormatFloat32) before converting.
  func array() -> [Float] {
    return Array(UnsafeBufferPointer(self))
  }
}
|
||||
|
||||
extension AVAudioPCMBuffer {
  /// Copy the PCM data into a Swift `[Float]`.
  /// Only `mBuffers` (the first buffer in the list) is read, so this is
  /// intended for mono audio — the caller asserts channelCount == 1.
  func array() -> [Float] {
    return self.audioBufferList.pointee.mBuffers.array()
  }
}
|
||||
|
||||
/// Load a test wave file, run spoken language identification on it with a
/// whisper model, and print the detected language.
func run() {
  let encoder = "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx"
  let decoder = "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx"

  let whisperConfig = sherpaOnnxSpokenLanguageIdentificationWhisperConfig(
    encoder: encoder,
    decoder: decoder
  )

  // `var` is required because the wrapper takes the config by pointer.
  var config = sherpaOnnxSpokenLanguageIdentificationConfig(
    whisper: whisperConfig,
    numThreads: 1,
    debug: 1,
    provider: "cpu"
  )
  let filePath = "./sherpa-onnx-whisper-tiny/test_wavs/0.wav"

  let slid = SherpaOnnxSpokenLanguageIdentificationWrapper(config: &config)

  let fileURL: NSURL = NSURL(fileURLWithPath: filePath)
  // try! is acceptable in this example: a missing test file is a setup error.
  let audioFile = try! AVAudioFile(forReading: fileURL as URL)

  let audioFormat = audioFile.processingFormat
  // The conversion helpers below assume 16 kHz mono float32 audio.
  assert(audioFormat.sampleRate == 16000)
  assert(audioFormat.channelCount == 1)
  assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)

  let audioFrameCount = UInt32(audioFile.length)
  let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount)

  try! audioFile.read(into: audioFileBuffer!)
  let array: [Float]! = audioFileBuffer?.array()
  let result = slid.decode(samples: array)

  // Fixed user-facing message: was "Detectedllanguage".
  print("\nDetected language is:\n\(result.lang)")
}
|
||||
|
||||
/// Program entry point: delegates to `run()`.
@main
struct App {
  static func main() {
    run()
  }
}
|
||||
Reference in New Issue
Block a user