diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index 520c4754..7b3c55b1 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -9,9 +9,11 @@ #include #include +#include "sherpa-onnx/csrc/circular-buffer.h" #include "sherpa-onnx/csrc/display.h" #include "sherpa-onnx/csrc/offline-recognizer.h" #include "sherpa-onnx/csrc/online-recognizer.h" +#include "sherpa-onnx/csrc/voice-activity-detector.h" struct SherpaOnnxOnlineRecognizer { std::unique_ptr impl; @@ -127,7 +129,7 @@ void DecodeMultipleOnlineStreams(SherpaOnnxOnlineRecognizer *recognizer, recognizer->impl->DecodeStreams(ss.data(), n); } -SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult( +const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult( SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream) { sherpa_onnx::OnlineRecognizerResult result = recognizer->impl->GetResult(stream->impl.get()); @@ -340,7 +342,7 @@ void DecodeMultipleOfflineStreams(SherpaOnnxOfflineRecognizer *recognizer, recognizer->impl->DecodeStreams(ss.data(), n); } -SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult( +const SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult( SherpaOnnxOfflineStream *stream) { const sherpa_onnx::OfflineRecognitionResult &result = stream->impl->GetResult(); @@ -372,3 +374,128 @@ void DestroyOfflineRecognizerResult( delete[] r->timestamps; delete r; } + +// ============================================================ +// For VAD +// ============================================================ +// +struct SherpaOnnxCircularBuffer { + std::unique_ptr impl; +}; + +SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(int32_t capacity) { + SherpaOnnxCircularBuffer *buffer = new SherpaOnnxCircularBuffer; + buffer->impl = std::make_unique(capacity); + return buffer; +} + +void SherpaOnnxDestroyCircularBuffer(SherpaOnnxCircularBuffer *buffer) { + delete buffer; +} + +void SherpaOnnxCircularBufferPush(SherpaOnnxCircularBuffer *buffer, 
+ const float *p, int32_t n) { + buffer->impl->Push(p, n); +} + +const float *SherpaOnnxCircularBufferGet(SherpaOnnxCircularBuffer *buffer, + int32_t start_index, int32_t n) { + std::vector v = buffer->impl->Get(start_index, n); + + float *p = new float[n]; + std::copy(v.begin(), v.end(), p); + return p; +} + +void SherpaOnnxCircularBufferFree(const float *p) { delete[] p; } + +void SherpaOnnxCircularBufferPop(SherpaOnnxCircularBuffer *buffer, int32_t n) { + buffer->impl->Pop(n); +} + +int32_t SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer) { + return buffer->impl->Size(); +} + +void SherpaOnnxCircularBufferReset(SherpaOnnxCircularBuffer *buffer) { + buffer->impl->Reset(); +} + +struct SherpaOnnxVoiceActivityDetector { + std::unique_ptr impl; +}; + +SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector( + const SherpaOnnxVadModelConfig *config, float buffer_size_in_seconds) { + sherpa_onnx::VadModelConfig vad_config; + + vad_config.silero_vad.model = SHERPA_ONNX_OR(config->silero_vad.model, ""); + vad_config.silero_vad.threshold = + SHERPA_ONNX_OR(config->silero_vad.threshold, 0.5); + + vad_config.silero_vad.min_silence_duration = + SHERPA_ONNX_OR(config->silero_vad.min_silence_duration, 0.5); + + vad_config.silero_vad.min_speech_duration = + SHERPA_ONNX_OR(config->silero_vad.min_speech_duration, 0.25); + + vad_config.silero_vad.window_size = + SHERPA_ONNX_OR(config->silero_vad.window_size, 512); + + vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000); + vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1); + vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu"); + vad_config.debug = SHERPA_ONNX_OR(config->debug, false); + + if (vad_config.debug) { + fprintf(stderr, "%s\n", vad_config.ToString().c_str()); + } + + SherpaOnnxVoiceActivityDetector *p = new SherpaOnnxVoiceActivityDetector; + p->impl = std::make_unique( + vad_config, buffer_size_in_seconds); + + return p; +} + +void 
SherpaOnnxDestroyVoiceActivityDetector( + SherpaOnnxVoiceActivityDetector *p) { + delete p; +} + +void SherpaOnnxVoiceActivityDetectorAcceptWaveform( + SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n) { + p->impl->AcceptWaveform(samples, n); +} + +int32_t SherpaOnnxVoiceActivityDetectorEmpty( + SherpaOnnxVoiceActivityDetector *p) { + return p->impl->Empty(); +} + +SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop( + SherpaOnnxVoiceActivityDetector *p) { + p->impl->Pop(); +} + +SHERPA_ONNX_API const SherpaOnnxSpeechSegment * +SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p) { + const sherpa_onnx::SpeechSegment &segment = p->impl->Front(); + + SherpaOnnxSpeechSegment *ans = new SherpaOnnxSpeechSegment; + ans->start = segment.start; + ans->samples = new float[segment.samples.size()]; + std::copy(segment.samples.begin(), segment.samples.end(), ans->samples); + ans->n = segment.samples.size(); + + return ans; +} + +void SherpaOnnxDestroySpeechSegment(const SherpaOnnxSpeechSegment *p) { + delete[] p->samples; + delete p; +} + +void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) { + p->impl->Reset(); +} diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 71aa5642..1a5c4dbf 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -234,7 +234,7 @@ SHERPA_ONNX_API void DecodeMultipleOnlineStreams( /// @return A pointer containing the result. The user has to invoke /// DestroyOnlineRecognizerResult() to free the returned pointer to /// avoid memory leak. -SHERPA_ONNX_API SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult( +SHERPA_ONNX_API const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult( SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream); /// Destroy the pointer returned by GetOnlineStreamResult(). 
@@ -429,7 +429,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerResult { /// @return Return a pointer to the result. The user has to invoke /// DestroyOnlineRecognizerResult() to free the returned pointer to /// avoid memory leak. -SHERPA_ONNX_API SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult( +SHERPA_ONNX_API const SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult( SherpaOnnxOfflineStream *stream); /// Destroy the pointer returned by GetOfflineStreamResult(). @@ -438,6 +438,127 @@ SHERPA_ONNX_API SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult( SHERPA_ONNX_API void DestroyOfflineRecognizerResult( const SherpaOnnxOfflineRecognizerResult *r); +// ============================================================ +// For VAD +// ============================================================ + +SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig { + // Path to the silero VAD model + const char *model; + + // threshold to classify a segment as speech + // + // If the predicted probability of a segment is larger than this + // value, then it is classified as speech. + float threshold; + + // in seconds + float min_silence_duration; + + // in seconds + float min_speech_duration; + + int window_size; +} SherpaOnnxSileroVadModelConfig; + +SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig { + SherpaOnnxSileroVadModelConfig silero_vad; + + int32_t sample_rate; + int32_t num_threads; + const char *provider; + int32_t debug; +} SherpaOnnxVadModelConfig; + +SHERPA_ONNX_API typedef struct SherpaOnnxCircularBuffer + SherpaOnnxCircularBuffer; + +// Return an instance of circular buffer. The user has to use +// SherpaOnnxDestroyCircularBuffer() to free the returned pointer to avoid +// memory leak. 
+SHERPA_ONNX_API SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer( + int32_t capacity); + +// Free the pointer returned by SherpaOnnxCreateCircularBuffer() +SHERPA_ONNX_API void SherpaOnnxDestroyCircularBuffer( + SherpaOnnxCircularBuffer *buffer); + +SHERPA_ONNX_API void SherpaOnnxCircularBufferPush( + SherpaOnnxCircularBuffer *buffer, const float *p, int32_t n); + +// Return n samples starting at the given index. +// +// Return a pointer to an array containing n samples starting at start_index. +// The user has to use SherpaOnnxCircularBufferFree() to free the returned +// pointer to avoid memory leak. +SHERPA_ONNX_API const float *SherpaOnnxCircularBufferGet( + SherpaOnnxCircularBuffer *buffer, int32_t start_index, int32_t n); + +// Free the pointer returned by SherpaOnnxCircularBufferGet(). +SHERPA_ONNX_API void SherpaOnnxCircularBufferFree(const float *p); + +// Remove n elements from the buffer +SHERPA_ONNX_API void SherpaOnnxCircularBufferPop( + SherpaOnnxCircularBuffer *buffer, int32_t n); + +// Return number of elements in the buffer. +SHERPA_ONNX_API int32_t +SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer); + +// Clear all elements in the buffer +SHERPA_ONNX_API void SherpaOnnxCircularBufferReset( + SherpaOnnxCircularBuffer *buffer); + +SHERPA_ONNX_API typedef struct SherpaOnnxSpeechSegment { + // The start index in samples of this segment + int32_t start; + + // pointer to the array containing the samples + float *samples; + + // number of samples in this segment + int32_t n; +} SherpaOnnxSpeechSegment; + +typedef struct SherpaOnnxVoiceActivityDetector SherpaOnnxVoiceActivityDetector; + +// Return an instance of VoiceActivityDetector. +// The user has to use SherpaOnnxDestroyVoiceActivityDetector() to free +// the returned pointer to avoid memory leak. 
+SHERPA_ONNX_API SherpaOnnxVoiceActivityDetector * +SherpaOnnxCreateVoiceActivityDetector(const SherpaOnnxVadModelConfig *config, + float buffer_size_in_seconds); + +SHERPA_ONNX_API void SherpaOnnxDestroyVoiceActivityDetector( + SherpaOnnxVoiceActivityDetector *p); + +SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorAcceptWaveform( + SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n); + +// Return 1 if there are no speech segments available. +// Return 0 if there are speech segments. +SHERPA_ONNX_API int32_t +SherpaOnnxVoiceActivityDetectorEmpty(SherpaOnnxVoiceActivityDetector *p); + +// Remove the first speech segment. +// It throws if SherpaOnnxVoiceActivityDetectorEmpty() returns 1. +SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop( + SherpaOnnxVoiceActivityDetector *p); + +// Return the first speech segment. +// The user has to use SherpaOnnxDestroySpeechSegment() to free the returned +// pointer to avoid memory leak. +SHERPA_ONNX_API const SherpaOnnxSpeechSegment * +SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p); + +// Free the pointer returned by SherpaOnnxVoiceActivityDetectorFront(). +SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment( + const SherpaOnnxSpeechSegment *p); + +// Re-initialize the voice activity detector. 
+SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset( + SherpaOnnxVoiceActivityDetector *p); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/sherpa-onnx/csrc/hypothesis.cc b/sherpa-onnx/csrc/hypothesis.cc index 9f5c680f..55d2492b 100644 --- a/sherpa-onnx/csrc/hypothesis.cc +++ b/sherpa-onnx/csrc/hypothesis.cc @@ -18,9 +18,9 @@ void Hypotheses::Add(Hypothesis hyp) { } else { it->second.log_prob = LogAdd()(it->second.log_prob, hyp.log_prob); - if (it->second.lm_log_prob != 0 && hyp.lm_log_prob != 0){ + if (it->second.lm_log_prob != 0 && hyp.lm_log_prob != 0) { it->second.lm_log_prob = - LogAdd()(it->second.lm_log_prob, hyp.lm_log_prob); + LogAdd()(it->second.lm_log_prob, hyp.lm_log_prob); } } } diff --git a/sherpa-onnx/csrc/silero-vad-model-config.h b/sherpa-onnx/csrc/silero-vad-model-config.h index fc930963..b9679dd2 100644 --- a/sherpa-onnx/csrc/silero-vad-model-config.h +++ b/sherpa-onnx/csrc/silero-vad-model-config.h @@ -15,7 +15,7 @@ struct SileroVadModelConfig { // threshold to classify a segment as speech // - // The predicted probability of a segment is larger than this + // If the predicted probability of a segment is larger than this // value, then it is classified as speech. 
float threshold = 0.5; @@ -25,7 +25,7 @@ struct SileroVadModelConfig { // 512, 1024, 1536 samples for 16000 Hz // 256, 512, 768 samples for 800 Hz - int window_size = 512; // in samples + int32_t window_size = 512; // in samples SileroVadModelConfig() = default; diff --git a/swift-api-examples/.gitignore b/swift-api-examples/.gitignore index 7eb31b52..cbcf0750 100644 --- a/swift-api-examples/.gitignore +++ b/swift-api-examples/.gitignore @@ -1,2 +1,3 @@ decode-file decode-file-non-streaming +generate-subtitles diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift index 72c497cf..d2cf3e89 100644 --- a/swift-api-examples/SherpaOnnx.swift +++ b/swift-api-examples/SherpaOnnx.swift @@ -215,7 +215,7 @@ class SherpaOnnxRecognizer { /// Get the decoding results so far func getResult() -> SherpaOnnxOnlineRecongitionResult { - let result: UnsafeMutablePointer? = GetOnlineStreamResult( + let result: UnsafePointer? = GetOnlineStreamResult( recognizer, stream) return SherpaOnnxOnlineRecongitionResult(result: result) } @@ -406,7 +406,7 @@ class SherpaOnnxOfflineRecognizer { DecodeOfflineStream(recognizer, stream) - let result: UnsafeMutablePointer? = GetOfflineStreamResult( + let result: UnsafePointer? 
= GetOfflineStreamResult( stream) DestroyOfflineStream(stream) @@ -414,3 +414,145 @@ class SherpaOnnxOfflineRecognizer { return SherpaOnnxOfflineRecongitionResult(result: result) } } + +func sherpaOnnxSileroVadModelConfig( + model: String, + threshold: Float = 0.5, + minSilenceDuration: Float = 0.25, + minSpeechDuration: Float = 0.5, + windowSize: Int = 512 +) -> SherpaOnnxSileroVadModelConfig { + return SherpaOnnxSileroVadModelConfig( + model: toCPointer(model), + threshold: threshold, + min_silence_duration: minSilenceDuration, + min_speech_duration: minSpeechDuration, + window_size: Int32(windowSize) + ) +} + +func sherpaOnnxVadModelConfig( + sileroVad: SherpaOnnxSileroVadModelConfig, + sampleRate: Int32 = 16000, + numThreads: Int = 1, + provider: String = "cpu", + debug: Int = 0 +) -> SherpaOnnxVadModelConfig { + return SherpaOnnxVadModelConfig( + silero_vad: sileroVad, + sample_rate: sampleRate, + num_threads: Int32(numThreads), + provider: toCPointer(provider), + debug: Int32(debug) + ) +} + +class SherpaOnnxCircularBufferWrapper { + let buffer: OpaquePointer! + + init(capacity: Int) { + buffer = SherpaOnnxCreateCircularBuffer(Int32(capacity)) + } + + deinit { + if let buffer { + SherpaOnnxDestroyCircularBuffer(buffer) + } + } + + func push(samples: [Float]) { + SherpaOnnxCircularBufferPush(buffer, samples, Int32(samples.count)) + } + + func get(startIndex: Int, n: Int) -> [Float] { + let p: UnsafePointer! = SherpaOnnxCircularBufferGet(buffer, Int32(startIndex), Int32(n)) + + var samples: [Float] = [] + + for index in 0.. Int { + return Int(SherpaOnnxCircularBufferSize(buffer)) + } + + func reset() { + SherpaOnnxCircularBufferReset(buffer) + } +} + +class SherpaOnnxSpeechSegmentWrapper { + let p: UnsafePointer! + + init(p: UnsafePointer!) 
{ + self.p = p + } + + deinit { + if let p { + SherpaOnnxDestroySpeechSegment(p) + } + } + + var start: Int { + return Int(p.pointee.start) + } + + var n: Int { + return Int(p.pointee.n) + } + + var samples: [Float] { + var samples: [Float] = [] + for index in 0..!, buffer_size_in_seconds: Float) { + vad = SherpaOnnxCreateVoiceActivityDetector(config, buffer_size_in_seconds) + } + + deinit { + if let vad { + SherpaOnnxDestroyVoiceActivityDetector(vad) + } + } + + func acceptWaveform(samples: [Float]) { + SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, samples, Int32(samples.count)) + } + + func isEmpty() -> Bool { + return SherpaOnnxVoiceActivityDetectorEmpty(vad) == 1 ? true : false + } + + func pop() { + SherpaOnnxVoiceActivityDetectorPop(vad) + } + + func front() -> SherpaOnnxSpeechSegmentWrapper { + let p: UnsafePointer? = SherpaOnnxVoiceActivityDetectorFront(vad) + return SherpaOnnxSpeechSegmentWrapper(p: p) + } + + func reset() { + SherpaOnnxVoiceActivityDetectorReset(vad) + } +} diff --git a/swift-api-examples/decode-file-non-streaming.swift b/swift-api-examples/decode-file-non-streaming.swift index 6d0b4e8b..ca9d9475 100644 --- a/swift-api-examples/decode-file-non-streaming.swift +++ b/swift-api-examples/decode-file-non-streaming.swift @@ -13,7 +13,6 @@ extension AVAudioPCMBuffer { } func run() { - var recognizer: SherpaOnnxOfflineRecognizer var modelConfig: SherpaOnnxOfflineModelConfig var modelType = "whisper" diff --git a/swift-api-examples/generate-subtitles.swift b/swift-api-examples/generate-subtitles.swift new file mode 100644 index 00000000..d0682945 --- /dev/null +++ b/swift-api-examples/generate-subtitles.swift @@ -0,0 +1,217 @@ +/* +This file shows how to use Swift API to generate subtitles. + +You can use the files from +https://huggingface.co/csukuangfj/vad/tree/main +for testing. 
+ +For instance, to generate subtitles for Obama.mov, please first +use + +ffmpeg -i ./Obama.mov -acodec pcm_s16le -ac 1 -ar 16000 Obama.wav + +to extract the audio part from the video. + +This file supports only processing WAV sound files, so you have to first +extract audios from videos. + +Please see +./run-generate-subtitles.sh +for usages. +*/ + +import AVFoundation + +extension AudioBuffer { + func array() -> [Float] { + return Array(UnsafeBufferPointer(self)) + } +} + +extension AVAudioPCMBuffer { + func array() -> [Float] { + return self.audioBufferList.pointee.mBuffers.array() + } +} + +extension TimeInterval { + var hourMinuteSecondMS: String { + String(format: "%d:%02d:%02d,%03d", hour, minute, second, millisecond) + } + + var hour: Int { + Int((self / 3600).truncatingRemainder(dividingBy: 3600)) + } + var minute: Int { + Int((self / 60).truncatingRemainder(dividingBy: 60)) + } + var second: Int { + Int(truncatingRemainder(dividingBy: 60)) + } + var millisecond: Int { + Int((self * 1000).truncatingRemainder(dividingBy: 1000)) + } +} + +extension String { + var fileURL: URL { + return URL(fileURLWithPath: self) + } + var pathExtension: String { + return fileURL.pathExtension + } + var lastPathComponent: String { + return fileURL.lastPathComponent + } + var stringByDeletingPathExtension: String { + return fileURL.deletingPathExtension().path + } +} + +class SpeechSegment: CustomStringConvertible { + + let start: Float + let end: Float + let text: String + + init(start: Float, duration: Float, text: String) { + self.start = start + self.end = start + duration + self.text = text + } + public var description: String { + var s: String + s = TimeInterval(self.start).hourMinuteSecondMS + s += " --> " + s += TimeInterval(self.end).hourMinuteSecondMS + s += "\n" + s += self.text + + return s + } +} + +func run() { + var recognizer: SherpaOnnxOfflineRecognizer + var modelConfig: SherpaOnnxOfflineModelConfig + var modelType = "whisper" + // modelType = "paraformer" 
+ var filePath = "/Users/fangjun/Desktop/Obama.wav" // English + // filePath = "/Users/fangjun/Desktop/lei-jun.wav" // Chinese + // please go to https://huggingface.co/csukuangfj/vad + // to download the above two files + + if modelType == "whisper" { + // for English + let encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx" + let decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx" + let tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt" + + let whisperConfig = sherpaOnnxOfflineWhisperModelConfig( + encoder: encoder, + decoder: decoder + ) + + modelConfig = sherpaOnnxOfflineModelConfig( + tokens: tokens, + whisper: whisperConfig, + debug: 0, + modelType: "whisper" + ) + } else if modelType == "paraformer" { + // for Chinese + let model = "./sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx" + let tokens = "./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt" + let paraformerConfig = sherpaOnnxOfflineParaformerModelConfig( + model: model + ) + + modelConfig = sherpaOnnxOfflineModelConfig( + tokens: tokens, + paraformer: paraformerConfig, + debug: 0, + modelType: "paraformer" + ) + } else { + print("Please specify a supported modelType \(modelType)") + return + } + + let sampleRate = 16000 + let featConfig = sherpaOnnxFeatureConfig( + sampleRate: sampleRate, + featureDim: 80 + ) + var config = sherpaOnnxOfflineRecognizerConfig( + featConfig: featConfig, + modelConfig: modelConfig + ) + + recognizer = SherpaOnnxOfflineRecognizer(config: &config) + + let audioFile = try! 
AVAudioFile(forReading: filePath.fileURL) + + let audioFormat = audioFile.processingFormat + assert(audioFormat.sampleRate == Double(sampleRate)) + assert(audioFormat.channelCount == 1) + assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32) + + let sileroVadConfig = sherpaOnnxSileroVadModelConfig( + model: "./silero_vad.onnx" + ) + + var vadModelConfig = sherpaOnnxVadModelConfig(sileroVad: sileroVadConfig) + let vad = SherpaOnnxVoiceActivityDetectorWrapper( + config: &vadModelConfig, buffer_size_in_seconds: 120) + + let audioFrameCount = UInt32(audioFile.length) + let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount) + + try! audioFile.read(into: audioFileBuffer!) + var array: [Float]! = audioFileBuffer?.array() + + let windowSize = Int(vadModelConfig.silero_vad.window_size) + + var segments: [SpeechSegment] = [] + + while array.count > windowSize { + // todo(fangjun): avoid extra copies here + vad.acceptWaveform(samples: [Float](array[0..