Swift API for speaker diarization (#1404)
This commit is contained in:
5
.github/scripts/test-swift.sh
vendored
5
.github/scripts/test-swift.sh
vendored
@@ -7,6 +7,11 @@ echo "pwd: $PWD"
|
||||
cd swift-api-examples
|
||||
ls -lh
|
||||
|
||||
./run-speaker-diarization.sh
|
||||
rm -rf *.onnx
|
||||
rm -rf sherpa-onnx-pyannote-segmentation-3-0
|
||||
rm -fv *.wav
|
||||
|
||||
./run-add-punctuations.sh
|
||||
rm ./add-punctuations
|
||||
rm -rf sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12
|
||||
|
||||
@@ -1078,3 +1078,116 @@ class SherpaOnnxOfflinePunctuationWrapper {
|
||||
return ans
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds the C-level pyannote segmentation model config.
///
/// - Parameter model: Model location as a Swift string; it is converted with
///   `toCPointer` (defined elsewhere in this file) before being stored.
/// - Returns: A `SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig`
///   whose `model` field holds the converted pointer.
func sherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(model: String)
  -> SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig
{
  let modelPointer = toCPointer(model)
  return SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(model: modelPointer)
}
|
||||
|
||||
/// Builds the C-level speaker segmentation model config.
///
/// - Parameters:
///   - pyannote: Pyannote model config, stored as-is.
///   - numThreads: Stored in `num_threads` after conversion to `Int32`.
///   - debug: Stored in `debug` after conversion to `Int32`.
///   - provider: Provider name, converted with `toCPointer`.
/// - Returns: The populated `SherpaOnnxOfflineSpeakerSegmentationModelConfig`.
func sherpaOnnxOfflineSpeakerSegmentationModelConfig(
  pyannote: SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig,
  numThreads: Int = 1,
  debug: Int = 0,
  provider: String = "cpu"
) -> SherpaOnnxOfflineSpeakerSegmentationModelConfig {
  // Convert in the same order as the fields appear in the C struct.
  let threadCount = Int32(numThreads)
  let debugFlag = Int32(debug)
  let providerPointer = toCPointer(provider)

  return SherpaOnnxOfflineSpeakerSegmentationModelConfig(
    pyannote: pyannote,
    num_threads: threadCount,
    debug: debugFlag,
    provider: providerPointer
  )
}
|
||||
|
||||
/// Builds the C-level fast clustering config.
///
/// - Parameters:
///   - numClusters: Stored in `num_clusters` after conversion to `Int32`.
///     NOTE(review): the default of -1 presumably means "number of speakers
///     unknown, rely on `threshold`" — confirm against the C API docs.
///   - threshold: Stored unchanged in `threshold`.
/// - Returns: The populated `SherpaOnnxFastClusteringConfig`.
func sherpaOnnxFastClusteringConfig(numClusters: Int = -1, threshold: Float = 0.5)
  -> SherpaOnnxFastClusteringConfig
{
  let clusterCount = Int32(numClusters)
  return SherpaOnnxFastClusteringConfig(num_clusters: clusterCount, threshold: threshold)
}
|
||||
|
||||
/// Builds the C-level speaker embedding extractor config.
///
/// - Parameters:
///   - model: Model location, converted with `toCPointer`.
///   - numThreads: Stored in `num_threads` after conversion to `Int32`.
///   - debug: Stored in `debug` after conversion to `Int32`.
///   - provider: Provider name, converted with `toCPointer`.
/// - Returns: The populated `SherpaOnnxSpeakerEmbeddingExtractorConfig`.
func sherpaOnnxSpeakerEmbeddingExtractorConfig(
  model: String,
  numThreads: Int = 1,
  debug: Int = 0,
  provider: String = "cpu"
) -> SherpaOnnxSpeakerEmbeddingExtractorConfig {
  // Convert in the same order as the fields appear in the C struct.
  let modelPointer = toCPointer(model)
  let threadCount = Int32(numThreads)
  let debugFlag = Int32(debug)
  let providerPointer = toCPointer(provider)

  return SherpaOnnxSpeakerEmbeddingExtractorConfig(
    model: modelPointer,
    num_threads: threadCount,
    debug: debugFlag,
    provider: providerPointer
  )
}
|
||||
|
||||
/// Assembles the top-level C config for offline speaker diarization from its
/// three sub-configs and two duration settings.
///
/// - Parameters:
///   - segmentation: Segmentation model config, stored as-is.
///   - embedding: Embedding extractor config, stored as-is.
///   - clustering: Clustering config, stored as-is.
///   - minDurationOn: Stored in `min_duration_on`.
///     NOTE(review): presumably seconds — confirm against the C API docs.
///   - minDurationOff: Stored in `min_duration_off`.
///     NOTE(review): presumably seconds — confirm against the C API docs.
/// - Returns: The populated `SherpaOnnxOfflineSpeakerDiarizationConfig`.
func sherpaOnnxOfflineSpeakerDiarizationConfig(
  segmentation: SherpaOnnxOfflineSpeakerSegmentationModelConfig,
  embedding: SherpaOnnxSpeakerEmbeddingExtractorConfig,
  clustering: SherpaOnnxFastClusteringConfig,
  minDurationOn: Float = 0.3,
  minDurationOff: Float = 0.5
) -> SherpaOnnxOfflineSpeakerDiarizationConfig {
  let diarizationConfig = SherpaOnnxOfflineSpeakerDiarizationConfig(
    segmentation: segmentation,
    embedding: embedding,
    clustering: clustering,
    min_duration_on: minDurationOn,
    min_duration_off: minDurationOff
  )
  return diarizationConfig
}
|
||||
|
||||
/// Swift-side value copy of one diarization segment returned by the C API.
struct SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper {
  /// Segment start. NOTE(review): presumably in seconds — confirm.
  var start: Float
  /// Segment end. NOTE(review): presumably in seconds — confirm.
  var end: Float
  /// Integer speaker label copied from the C segment.
  var speaker: Int

  /// Same signature and defaults as the synthesized memberwise initializer.
  init(start: Float = 0, end: Float = 0, speaker: Int = 0) {
    self.start = start
    self.end = end
    self.speaker = speaker
  }
}
|
||||
|
||||
/// Swift wrapper around the C offline speaker diarization object.
///
/// The wrapper owns the handle returned by
/// `SherpaOnnxCreateOfflineSpeakerDiarization` and destroys it in `deinit`.
class SherpaOnnxOfflineSpeakerDiarizationWrapper {
  /// A pointer to the underlying counterpart in C
  let impl: OpaquePointer!

  /// Creates the underlying C object from `config`.
  ///
  /// - Parameter config: Pointer to the diarization config; it is only passed
  ///   to `SherpaOnnxCreateOfflineSpeakerDiarization` during this call.
  init(
    config: UnsafePointer<SherpaOnnxOfflineSpeakerDiarizationConfig>!
  ) {
    impl = SherpaOnnxCreateOfflineSpeakerDiarization(config)
  }

  deinit {
    // `impl` is implicitly unwrapped; only destroy it when it is non-nil.
    if let impl {
      SherpaOnnxDestroyOfflineSpeakerDiarization(impl)
    }
  }

  /// Sample rate reported by the C object, converted to `Int`.
  var sampleRate: Int {
    return Int(SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(impl))
  }

  /// Runs diarization on `samples` and returns the segments sorted by start
  /// time.
  ///
  /// - Parameter samples: Audio samples handed to the C API together with
  ///   their count.
  /// - Returns: One wrapper per segment, or an empty array when the C API
  ///   returns no result or no sorted segment list.
  func process(samples: [Float]) -> [SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper] {
    guard
      let result = SherpaOnnxOfflineSpeakerDiarizationProcess(
        impl, samples, Int32(samples.count))
    else {
      return []
    }
    // Release the C result on every exit path, including the early return
    // below (the previous implementation leaked it there).
    defer { SherpaOnnxOfflineSpeakerDiarizationDestroyResult(result) }

    let numSegments = Int(SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(result))

    guard let sorted = SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(result) else {
      return []
    }
    // Defers run in reverse order, so the segment array is freed before the
    // result, matching the original destruction order.
    defer { SherpaOnnxOfflineSpeakerDiarizationDestroySegment(sorted) }

    // Copy every C segment into a Swift value before the C memory is freed.
    return (0..<numSegments).map { index in
      let segment = sorted[index]
      return SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper(
        start: segment.start, end: segment.end, speaker: Int(segment.speaker))
    }
  }
}
|
||||
|
||||
35
swift-api-examples/run-speaker-diarization.sh
Executable file
35
swift-api-examples/run-speaker-diarization.sh
Executable file
@@ -0,0 +1,35 @@
|
||||
#!/usr/bin/env bash
#
# Downloads the models and the test wave file used by the Swift speaker
# diarization example (if they are not present yet), builds the example
# binary (if it does not exist yet), and runs it.

# Stop at the first failing command so that a failed download or build is not
# followed by a confusing failure later on.
set -e

if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then
  # -f: fail on HTTP errors instead of saving the error page to disk.
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
fi

if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
fi

if [ ! -f ./0-four-speakers-zh.wav ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
fi

if [ ! -e ./speaker-diarization ]; then
  # Note: We use -lc++ to link against libc++ instead of libstdc++
  swiftc \
    -lc++ \
    -I ../build-swift-macos/install/include \
    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
    ./speaker-diarization.swift ./SherpaOnnx.swift \
    -L ../build-swift-macos/install/lib/ \
    -l sherpa-onnx \
    -l onnxruntime \
    -o speaker-diarization

  strip speaker-diarization
else
  echo "./speaker-diarization exists - skip building"
fi

# Prepend the install directory; only add the ":" separator when
# DYLD_LIBRARY_PATH already has a value, so no empty path entry is created.
export DYLD_LIBRARY_PATH="$PWD/../build-swift-macos/install/lib${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"
./speaker-diarization
|
||||
56
swift-api-examples/speaker-diarization.swift
Normal file
56
swift-api-examples/speaker-diarization.swift
Normal file
@@ -0,0 +1,56 @@
|
||||
import AVFoundation
|
||||
|
||||
extension AudioBuffer {
  /// Copies the contents of this buffer into a Swift `[Float]`.
  func array() -> [Float] {
    let floatView = UnsafeBufferPointer<Float>(self)
    return [Float](floatView)
  }
}
|
||||
|
||||
extension AVAudioPCMBuffer {
  /// Returns the samples of the `mBuffers` entry of this buffer's
  /// `audioBufferList` as a `[Float]`, via `AudioBuffer.array()`.
  func array() -> [Float] {
    let firstBuffer = audioBufferList.pointee.mBuffers
    return firstBuffer.array()
  }
}
|
||||
|
||||
/// Runs offline speaker diarization on `./0-four-speakers-zh.wav` and prints
/// one line per detected segment.
///
/// The function terminates the process with a descriptive message when the
/// wave file cannot be opened or read, or when its format does not match what
/// the diarization object expects.
func run() {
  let segmentationModel = "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx"
  let embeddingExtractorModel = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx"
  let waveFilename = "./0-four-speakers-zh.wav"

  // There are 4 speakers in ./0-four-speakers-zh.wav, so we use 4 here
  let numSpeakers = 4
  var config = sherpaOnnxOfflineSpeakerDiarizationConfig(
    segmentation: sherpaOnnxOfflineSpeakerSegmentationModelConfig(
      pyannote: sherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(model: segmentationModel)),
    embedding: sherpaOnnxSpeakerEmbeddingExtractorConfig(model: embeddingExtractorModel),
    clustering: sherpaOnnxFastClusteringConfig(numClusters: numSpeakers)
  )

  let sd = SherpaOnnxOfflineSpeakerDiarizationWrapper(config: &config)

  let fileURL = URL(fileURLWithPath: waveFilename)
  let audioFile: AVAudioFile
  do {
    audioFile = try AVAudioFile(forReading: fileURL)
  } catch {
    fatalError("Failed to open \(waveFilename): \(error)")
  }

  // Use precondition rather than assert so the checks also run in optimized
  // builds, where assert is compiled out.
  let audioFormat = audioFile.processingFormat
  precondition(
    Int(audioFormat.sampleRate) == sd.sampleRate,
    "Expected sample rate \(sd.sampleRate), got \(audioFormat.sampleRate)")
  precondition(audioFormat.channelCount == 1, "Only mono audio is supported")
  precondition(
    audioFormat.commonFormat == .pcmFormatFloat32,
    "Expected 32-bit float PCM samples")

  guard let audioFrameCount = UInt32(exactly: audioFile.length) else {
    fatalError("\(waveFilename) has an unsupported length: \(audioFile.length) frames")
  }
  guard
    let audioFileBuffer = AVAudioPCMBuffer(
      pcmFormat: audioFormat, frameCapacity: audioFrameCount)
  else {
    fatalError("Failed to allocate an audio buffer of \(audioFrameCount) frames")
  }

  do {
    try audioFile.read(into: audioFileBuffer)
  } catch {
    fatalError("Failed to read \(waveFilename): \(error)")
  }

  let samples = audioFileBuffer.array()
  print("Started!")
  let segments = sd.process(samples: samples)
  for segment in segments {
    print("\(segment.start) -- \(segment.end) speaker_\(segment.speaker)")
  }
}
|
||||
|
||||
/// Program entry point; it only forwards to `run()`.
@main
struct App {
  /// Invoked at launch through the `@main` attribute.
  static func main() {
    run()
  }
}
|
||||
Reference in New Issue
Block a user