Swift API for speaker diarization (#1404)
This commit is contained in:
5
.github/scripts/test-swift.sh
vendored
5
.github/scripts/test-swift.sh
vendored
@@ -7,6 +7,11 @@ echo "pwd: $PWD"
|
|||||||
cd swift-api-examples
|
cd swift-api-examples
|
||||||
ls -lh
|
ls -lh
|
||||||
|
|
||||||
|
./run-speaker-diarization.sh
|
||||||
|
rm -rf *.onnx
|
||||||
|
rm -rf sherpa-onnx-pyannote-segmentation-3-0
|
||||||
|
rm -fv *.wav
|
||||||
|
|
||||||
./run-add-punctuations.sh
|
./run-add-punctuations.sh
|
||||||
rm ./add-punctuations
|
rm ./add-punctuations
|
||||||
rm -rf sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12
|
rm -rf sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12
|
||||||
|
|||||||
@@ -1078,3 +1078,116 @@ class SherpaOnnxOfflinePunctuationWrapper {
|
|||||||
return ans
|
return ans
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Creates the C configuration struct for a pyannote speaker segmentation model.
///
/// - Parameter model: String stored in the struct's `model` field after being
///   converted with `toCPointer` (presumably the path of the ONNX model file;
///   confirm against the C API).
/// - Returns: A `SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig`
///   whose `model` field is the converted string.
func sherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(model: String)
  -> SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig
{
  let modelPointer = toCPointer(model)
  return SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(model: modelPointer)
}
|
||||||
|
|
||||||
|
/// Creates the C configuration struct for the speaker segmentation model.
///
/// - Parameters:
///   - pyannote: Pyannote model configuration, stored unchanged.
///   - numThreads: Forwarded as `num_threads` after conversion to `Int32`.
///   - debug: Forwarded as `debug` after conversion to `Int32`.
///   - provider: Forwarded as `provider` after conversion with `toCPointer`.
/// - Returns: A populated `SherpaOnnxOfflineSpeakerSegmentationModelConfig`.
func sherpaOnnxOfflineSpeakerSegmentationModelConfig(
  pyannote: SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig,
  numThreads: Int = 1,
  debug: Int = 0,
  provider: String = "cpu"
) -> SherpaOnnxOfflineSpeakerSegmentationModelConfig {
  // Convert the Swift values to the representations the C struct stores.
  let threadCount = Int32(numThreads)
  let debugFlag = Int32(debug)
  let providerPointer = toCPointer(provider)

  return SherpaOnnxOfflineSpeakerSegmentationModelConfig(
    pyannote: pyannote,
    num_threads: threadCount,
    debug: debugFlag,
    provider: providerPointer)
}
|
||||||
|
|
||||||
|
/// Creates the C configuration struct for fast clustering.
///
/// - Parameters:
///   - numClusters: Forwarded as `num_clusters` after conversion to `Int32`.
///     NOTE(review): the meaning of the default `-1` is not visible here;
///     confirm against the C API.
///   - threshold: Forwarded unchanged as `threshold`.
/// - Returns: A populated `SherpaOnnxFastClusteringConfig`.
func sherpaOnnxFastClusteringConfig(numClusters: Int = -1, threshold: Float = 0.5)
  -> SherpaOnnxFastClusteringConfig
{
  let clusterCount = Int32(numClusters)
  return SherpaOnnxFastClusteringConfig(num_clusters: clusterCount, threshold: threshold)
}
|
||||||
|
|
||||||
|
/// Creates the C configuration struct for the speaker embedding extractor.
///
/// - Parameters:
///   - model: Forwarded as `model` after conversion with `toCPointer`
///     (presumably the path of the ONNX model file; confirm against the C API).
///   - numThreads: Forwarded as `num_threads` after conversion to `Int32`.
///   - debug: Forwarded as `debug` after conversion to `Int32`.
///   - provider: Forwarded as `provider` after conversion with `toCPointer`.
/// - Returns: A populated `SherpaOnnxSpeakerEmbeddingExtractorConfig`.
func sherpaOnnxSpeakerEmbeddingExtractorConfig(
  model: String,
  numThreads: Int = 1,
  debug: Int = 0,
  provider: String = "cpu"
) -> SherpaOnnxSpeakerEmbeddingExtractorConfig {
  // Convert the Swift values to the representations the C struct stores.
  let modelPointer = toCPointer(model)
  let threadCount = Int32(numThreads)
  let debugFlag = Int32(debug)
  let providerPointer = toCPointer(provider)

  return SherpaOnnxSpeakerEmbeddingExtractorConfig(
    model: modelPointer,
    num_threads: threadCount,
    debug: debugFlag,
    provider: providerPointer)
}
|
||||||
|
|
||||||
|
/// Assembles the top-level C configuration struct for offline speaker diarization.
///
/// Every argument is stored unchanged in the field of the matching name.
///
/// - Parameters:
///   - segmentation: Segmentation model configuration.
///   - embedding: Speaker embedding extractor configuration.
///   - clustering: Clustering configuration.
///   - minDurationOn: Forwarded as `min_duration_on`.
///     NOTE(review): unit and exact meaning are not visible here; confirm
///     against the C API.
///   - minDurationOff: Forwarded as `min_duration_off` (same caveat).
/// - Returns: A populated `SherpaOnnxOfflineSpeakerDiarizationConfig`.
func sherpaOnnxOfflineSpeakerDiarizationConfig(
  segmentation: SherpaOnnxOfflineSpeakerSegmentationModelConfig,
  embedding: SherpaOnnxSpeakerEmbeddingExtractorConfig,
  clustering: SherpaOnnxFastClusteringConfig,
  minDurationOn: Float = 0.3,
  minDurationOff: Float = 0.5
) -> SherpaOnnxOfflineSpeakerDiarizationConfig {
  let diarizationConfig = SherpaOnnxOfflineSpeakerDiarizationConfig(
    segmentation: segmentation,
    embedding: embedding,
    clustering: clustering,
    min_duration_on: minDurationOn,
    min_duration_off: minDurationOff)
  return diarizationConfig
}
|
||||||
|
|
||||||
|
/// One diarization segment: a time span and the speaker assigned to it.
struct SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper {
  /// Start of the segment (presumably in seconds; confirm against the C API).
  var start: Float = 0.0

  /// End of the segment, in the same unit as `start`.
  var end: Float = 0.0

  /// Integer identifier of the speaker assigned to this segment.
  var speaker: Int = 0
}
|
||||||
|
|
||||||
|
/// Swift wrapper that owns a C offline speaker diarization object.
///
/// The C object is created in `init` and destroyed in `deinit`.
class SherpaOnnxOfflineSpeakerDiarizationWrapper {
  /// A pointer to the underlying counterpart in C
  ///
  /// NOTE(review): presumably nil when creation fails; `deinit` guards
  /// against that case, the other members pass it to C unchecked.
  let impl: OpaquePointer!

  /// Creates the underlying C object from `config`.
  ///
  /// - Parameter config: Pointer to the diarization configuration; it is
  ///   handed directly to `SherpaOnnxCreateOfflineSpeakerDiarization`.
  init(
    config: UnsafePointer<SherpaOnnxOfflineSpeakerDiarizationConfig>!
  ) {
    impl = SherpaOnnxCreateOfflineSpeakerDiarization(config)
  }

  deinit {
    if let impl {
      SherpaOnnxDestroyOfflineSpeakerDiarization(impl)
    }
  }

  /// Sample rate reported by the C object via
  /// `SherpaOnnxOfflineSpeakerDiarizationGetSampleRate`.
  var sampleRate: Int {
    return Int(SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(impl))
  }

  /// Runs diarization on `samples` and returns the detected segments.
  ///
  /// - Parameter samples: Audio samples handed to the C API together with
  ///   their count (presumably mono audio at `sampleRate`; confirm against
  ///   the C API).
  /// - Returns: The segments in the order produced by
  ///   `SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime`, or an empty
  ///   array when the C API returns no result or no segment array.
  func process(samples: [Float]) -> [SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper] {
    guard
      let result = SherpaOnnxOfflineSpeakerDiarizationProcess(
        impl, samples, Int32(samples.count))
    else {
      return []
    }
    // Release the result on every exit path. Previously the early return
    // taken when the sorted segment array was nil skipped this call and
    // leaked the result.
    defer { SherpaOnnxOfflineSpeakerDiarizationDestroyResult(result) }

    let numSegments = Int(SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(result))

    guard let sortedSegments = SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(result)
    else {
      return []
    }
    // Registered after the result's `defer`, so it runs first: the segment
    // array is destroyed before the result, as in the original call order.
    defer { SherpaOnnxOfflineSpeakerDiarizationDestroySegment(sortedSegments) }

    // `map` copies every segment into a Swift value before the deferred
    // destroy calls above run.
    let segments = UnsafeBufferPointer(start: sortedSegments, count: numSegments)
    return segments.map { segment in
      SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper(
        start: segment.start, end: segment.end, speaker: Int(segment.speaker))
    }
  }
}
|
||||||
|
|||||||
35
swift-api-examples/run-speaker-diarization.sh
Executable file
35
swift-api-examples/run-speaker-diarization.sh
Executable file
@@ -0,0 +1,35 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Downloads the models and the test wave file used by the speaker diarization
# example (each only if it is not already present), builds
# ./speaker-diarization if it does not exist yet, and runs it.

# Stop at the first failing command so that a failed download or compile is
# not followed by later steps operating on missing files.
set -e

if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
fi

if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
fi

if [ ! -f ./0-four-speakers-zh.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
fi

if [ ! -e ./speaker-diarization ]; then
  # Note: We use -lc++ to link against libc++ instead of libstdc++
  swiftc \
    -lc++ \
    -I ../build-swift-macos/install/include \
    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
    ./speaker-diarization.swift ./SherpaOnnx.swift \
    -L ../build-swift-macos/install/lib/ \
    -l sherpa-onnx \
    -l onnxruntime \
    -o speaker-diarization

  strip speaker-diarization
else
  echo "./speaker-diarization exists - skip building"
fi

# Prepend the freshly built libraries to the dynamic library search path.
# The ${VAR:+...} form appends the previous value only when it is non-empty,
# so no dangling ':' (an empty search-path entry) is left behind.
export DYLD_LIBRARY_PATH="$PWD/../build-swift-macos/install/lib${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./speaker-diarization
|
||||||
56
swift-api-examples/speaker-diarization.swift
Normal file
56
swift-api-examples/speaker-diarization.swift
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
import AVFoundation
|
||||||
|
|
||||||
|
extension AudioBuffer {
  /// Copies the contents of this audio buffer into a Swift array, viewing the
  /// data as `Float` values.
  func array() -> [Float] {
    let floatView = UnsafeBufferPointer<Float>(self)
    return Array(floatView)
  }
}
|
||||||
|
|
||||||
|
extension AVAudioPCMBuffer {
  /// Returns the samples held in `mBuffers` of this buffer's audio buffer
  /// list as a Swift array of `Float`.
  ///
  /// Only `mBuffers` is read (presumably the first or only channel; confirm
  /// for multi-channel input).
  func array() -> [Float] {
    let audioBuffer = audioBufferList.pointee.mBuffers
    return audioBuffer.array()
  }
}
|
||||||
|
|
||||||
|
/// Runs offline speaker diarization on a fixed test wave file and prints one
/// line per detected segment.
///
/// The model and wave file paths are relative to the current directory. The
/// wave file must be mono Float32 audio whose sample rate equals the one
/// reported by the diarization object; these are checked with `assert`.
func run() {
  let segmentationModel = "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx"
  let embeddingExtractorModel = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx"
  let waveFilename = "./0-four-speakers-zh.wav"

  // There are 4 speakers in ./0-four-speakers-zh.wav, so we use 4 here
  let numSpeakers = 4

  // Build the nested configuration piece by piece.
  let pyannote = sherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(model: segmentationModel)
  let segmentation = sherpaOnnxOfflineSpeakerSegmentationModelConfig(pyannote: pyannote)
  let embedding = sherpaOnnxSpeakerEmbeddingExtractorConfig(model: embeddingExtractorModel)
  let clustering = sherpaOnnxFastClusteringConfig(numClusters: numSpeakers)

  // `var` because the wrapper takes the configuration by pointer (`&config`).
  var config = sherpaOnnxOfflineSpeakerDiarizationConfig(
    segmentation: segmentation,
    embedding: embedding,
    clustering: clustering)

  let sd = SherpaOnnxOfflineSpeakerDiarizationWrapper(config: &config)

  let fileURL = NSURL(fileURLWithPath: waveFilename) as URL
  let audioFile = try! AVAudioFile(forReading: fileURL)

  // The samples are handed to the diarizer as-is, so the file must already
  // be in the expected format.
  let format = audioFile.processingFormat
  assert(Int(format.sampleRate) == sd.sampleRate)
  assert(format.channelCount == 1)
  assert(format.commonFormat == .pcmFormatFloat32)

  // Read the whole file into a single PCM buffer.
  let frameCount = UInt32(audioFile.length)
  let pcmBuffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount)!
  try! audioFile.read(into: pcmBuffer)
  let samples = pcmBuffer.array()

  print("Started!")
  let segments = sd.process(samples: samples)
  for segment in segments {
    print("\(segment.start) -- \(segment.end) speaker_\(segment.speaker)")
  }
}
|
||||||
|
|
||||||
|
/// Program entry point for the speaker diarization example.
@main
struct App {
  /// Invokes `run()` once and returns.
  static func main() {
    run()
  }
}
|
||||||
Reference in New Issue
Block a user