Add Swift example for generating subtitles (#318)
This commit is contained in:
@@ -9,9 +9,11 @@
|
|||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include "sherpa-onnx/csrc/circular-buffer.h"
|
||||||
#include "sherpa-onnx/csrc/display.h"
|
#include "sherpa-onnx/csrc/display.h"
|
||||||
#include "sherpa-onnx/csrc/offline-recognizer.h"
|
#include "sherpa-onnx/csrc/offline-recognizer.h"
|
||||||
#include "sherpa-onnx/csrc/online-recognizer.h"
|
#include "sherpa-onnx/csrc/online-recognizer.h"
|
||||||
|
#include "sherpa-onnx/csrc/voice-activity-detector.h"
|
||||||
|
|
||||||
struct SherpaOnnxOnlineRecognizer {
|
struct SherpaOnnxOnlineRecognizer {
|
||||||
std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl;
|
std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl;
|
||||||
@@ -127,7 +129,7 @@ void DecodeMultipleOnlineStreams(SherpaOnnxOnlineRecognizer *recognizer,
|
|||||||
recognizer->impl->DecodeStreams(ss.data(), n);
|
recognizer->impl->DecodeStreams(ss.data(), n);
|
||||||
}
|
}
|
||||||
|
|
||||||
SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
|
const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
|
||||||
SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream) {
|
SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream) {
|
||||||
sherpa_onnx::OnlineRecognizerResult result =
|
sherpa_onnx::OnlineRecognizerResult result =
|
||||||
recognizer->impl->GetResult(stream->impl.get());
|
recognizer->impl->GetResult(stream->impl.get());
|
||||||
@@ -340,7 +342,7 @@ void DecodeMultipleOfflineStreams(SherpaOnnxOfflineRecognizer *recognizer,
|
|||||||
recognizer->impl->DecodeStreams(ss.data(), n);
|
recognizer->impl->DecodeStreams(ss.data(), n);
|
||||||
}
|
}
|
||||||
|
|
||||||
SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
|
const SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
|
||||||
SherpaOnnxOfflineStream *stream) {
|
SherpaOnnxOfflineStream *stream) {
|
||||||
const sherpa_onnx::OfflineRecognitionResult &result =
|
const sherpa_onnx::OfflineRecognitionResult &result =
|
||||||
stream->impl->GetResult();
|
stream->impl->GetResult();
|
||||||
@@ -372,3 +374,128 @@ void DestroyOfflineRecognizerResult(
|
|||||||
delete[] r->timestamps;
|
delete[] r->timestamps;
|
||||||
delete r;
|
delete r;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// For VAD
|
||||||
|
// ============================================================
|
||||||
|
//
|
||||||
|
struct SherpaOnnxCircularBuffer {
|
||||||
|
std::unique_ptr<sherpa_onnx::CircularBuffer> impl;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Creates a circular buffer handle whose implementation is constructed with
// the given capacity.
//
// @param capacity Forwarded to the sherpa_onnx::CircularBuffer constructor.
// @return A heap-allocated handle; release it with
//         SherpaOnnxDestroyCircularBuffer().
SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(int32_t capacity) {
  // Keep the handle owned by a unique_ptr until it is fully built so that it
  // is not leaked if constructing the implementation throws.
  auto buffer = std::make_unique<SherpaOnnxCircularBuffer>();
  buffer->impl = std::make_unique<sherpa_onnx::CircularBuffer>(capacity);
  return buffer.release();
}
|
||||||
|
|
||||||
|
// Releases a handle obtained from SherpaOnnxCreateCircularBuffer(); the
// owned implementation is destroyed together with the handle.
void SherpaOnnxDestroyCircularBuffer(SherpaOnnxCircularBuffer *buffer) {
  delete buffer;
}
|
||||||
|
|
||||||
|
// Forwards `n` samples starting at `p` to the underlying buffer's Push().
void SherpaOnnxCircularBufferPush(SherpaOnnxCircularBuffer *buffer,
                                  const float *p, int32_t n) {
  sherpa_onnx::CircularBuffer &impl = *buffer->impl;
  impl.Push(p, n);
}
|
||||||
|
|
||||||
|
// Copies samples obtained from the underlying buffer's Get(start_index, n)
// into a newly allocated array.
//
// @param buffer      Handle created by SherpaOnnxCreateCircularBuffer().
// @param start_index Forwarded to Get().
// @param n           Number of samples requested; a negative value is
//                    treated as 0 for the allocation.
// @return An array with max(n, 0) elements. Elements that Get() did not
//         provide are zero. Free it with SherpaOnnxCircularBufferFree().
const float *SherpaOnnxCircularBufferGet(SherpaOnnxCircularBuffer *buffer,
                                         int32_t start_index, int32_t n) {
  std::vector<float> v = buffer->impl->Get(start_index, n);

  // Never pass a negative size to new[].
  const size_t count = n > 0 ? static_cast<size_t>(n) : 0;

  // Value-initialize so the caller never reads indeterminate values, and
  // bound the copy so a result of unexpected size cannot overrun the array.
  float *p = new float[count]();
  std::copy_n(v.begin(), std::min(v.size(), count), p);
  return p;
}
|
||||||
|
|
||||||
|
// Frees an array returned by SherpaOnnxCircularBufferGet().
void SherpaOnnxCircularBufferFree(const float *p) {
  delete[] p;
}
|
||||||
|
|
||||||
|
// Forwards to the underlying buffer's Pop(n).
void SherpaOnnxCircularBufferPop(SherpaOnnxCircularBuffer *buffer, int32_t n) {
  sherpa_onnx::CircularBuffer &impl = *buffer->impl;
  impl.Pop(n);
}
|
||||||
|
|
||||||
|
// Returns whatever the underlying buffer's Size() reports.
int32_t SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer) {
  sherpa_onnx::CircularBuffer &impl = *buffer->impl;
  return impl.Size();
}
|
||||||
|
|
||||||
|
// Forwards to the underlying buffer's Reset().
void SherpaOnnxCircularBufferReset(SherpaOnnxCircularBuffer *buffer) {
  sherpa_onnx::CircularBuffer &impl = *buffer->impl;
  impl.Reset();
}
|
||||||
|
|
||||||
|
struct SherpaOnnxVoiceActivityDetector {
|
||||||
|
std::unique_ptr<sherpa_onnx::VoiceActivityDetector> impl;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Creates a voice activity detector from a C configuration.
//
// Each field of `config` is passed through SHERPA_ONNX_OR together with a
// fallback value ("" for the model path, 0.5 threshold, 0.5 min silence,
// 0.25 min speech, window size 512, 16000 sample rate, 1 thread, "cpu"
// provider, debug off).
//
// @param config                 Must not be null; it is dereferenced.
// @param buffer_size_in_seconds Forwarded to the detector's constructor.
// @return A heap-allocated handle; release it with
//         SherpaOnnxDestroyVoiceActivityDetector().
SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector(
    const SherpaOnnxVadModelConfig *config, float buffer_size_in_seconds) {
  sherpa_onnx::VadModelConfig vad_config;

  vad_config.silero_vad.model = SHERPA_ONNX_OR(config->silero_vad.model, "");
  vad_config.silero_vad.threshold =
      SHERPA_ONNX_OR(config->silero_vad.threshold, 0.5);

  vad_config.silero_vad.min_silence_duration =
      SHERPA_ONNX_OR(config->silero_vad.min_silence_duration, 0.5);

  vad_config.silero_vad.min_speech_duration =
      SHERPA_ONNX_OR(config->silero_vad.min_speech_duration, 0.25);

  vad_config.silero_vad.window_size =
      SHERPA_ONNX_OR(config->silero_vad.window_size, 512);

  vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000);
  vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1);
  vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu");
  vad_config.debug = SHERPA_ONNX_OR(config->debug, false);

  if (vad_config.debug) {
    fprintf(stderr, "%s\n", vad_config.ToString().c_str());
  }

  // Keep the handle owned by a unique_ptr until the detector has been
  // constructed, so the handle is not leaked if that construction throws.
  auto p = std::make_unique<SherpaOnnxVoiceActivityDetector>();
  p->impl = std::make_unique<sherpa_onnx::VoiceActivityDetector>(
      vad_config, buffer_size_in_seconds);

  return p.release();
}
|
||||||
|
|
||||||
|
// Releases a handle obtained from SherpaOnnxCreateVoiceActivityDetector();
// the owned detector is destroyed together with the handle.
void SherpaOnnxDestroyVoiceActivityDetector(
    SherpaOnnxVoiceActivityDetector *p) {
  delete p;
}
|
||||||
|
|
||||||
|
// Forwards `n` samples starting at `samples` to the detector's
// AcceptWaveform().
void SherpaOnnxVoiceActivityDetectorAcceptWaveform(
    SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n) {
  sherpa_onnx::VoiceActivityDetector &vad = *p->impl;
  vad.AcceptWaveform(samples, n);
}
|
||||||
|
|
||||||
|
// Returns the result of the detector's Empty(), converted to int32_t.
int32_t SherpaOnnxVoiceActivityDetectorEmpty(
    SherpaOnnxVoiceActivityDetector *p) {
  sherpa_onnx::VoiceActivityDetector &vad = *p->impl;
  return vad.Empty();
}
|
||||||
|
|
||||||
|
// Forwards to the detector's Pop().
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop(
    SherpaOnnxVoiceActivityDetector *p) {
  sherpa_onnx::VoiceActivityDetector &vad = *p->impl;
  vad.Pop();
}
|
||||||
|
|
||||||
|
// Returns a heap-allocated copy of the segment reported by the detector's
// Front(): its start value and a private copy of its samples.
// The caller frees the result with SherpaOnnxDestroySpeechSegment().
SHERPA_ONNX_API const SherpaOnnxSpeechSegment *
SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p) {
  const sherpa_onnx::SpeechSegment &front = p->impl->Front();
  const auto &src = front.samples;

  auto *copy = new SherpaOnnxSpeechSegment;
  copy->start = front.start;
  copy->n = src.size();
  copy->samples = new float[src.size()];
  std::copy(src.begin(), src.end(), copy->samples);

  return copy;
}
|
||||||
|
|
||||||
|
// Frees a segment returned by SherpaOnnxVoiceActivityDetectorFront(),
// including its sample array. Passing a null pointer is a no-op, matching
// the other destroy functions in this section, which simply `delete` their
// argument.
void SherpaOnnxDestroySpeechSegment(const SherpaOnnxSpeechSegment *p) {
  if (p == nullptr) {
    return;  // nothing to free; avoids dereferencing null below
  }

  delete[] p->samples;
  delete p;
}
|
||||||
|
|
||||||
|
// Forwards to the detector's Reset().
void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) {
  sherpa_onnx::VoiceActivityDetector &vad = *p->impl;
  vad.Reset();
}
|
||||||
|
|||||||
@@ -234,7 +234,7 @@ SHERPA_ONNX_API void DecodeMultipleOnlineStreams(
|
|||||||
/// @return A pointer containing the result. The user has to invoke
|
/// @return A pointer containing the result. The user has to invoke
|
||||||
/// DestroyOnlineRecognizerResult() to free the returned pointer to
|
/// DestroyOnlineRecognizerResult() to free the returned pointer to
|
||||||
/// avoid memory leak.
|
/// avoid memory leak.
|
||||||
SHERPA_ONNX_API SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
|
SHERPA_ONNX_API const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
|
||||||
SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream);
|
SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream);
|
||||||
|
|
||||||
/// Destroy the pointer returned by GetOnlineStreamResult().
|
/// Destroy the pointer returned by GetOnlineStreamResult().
|
||||||
@@ -429,7 +429,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerResult {
|
|||||||
/// @return Return a pointer to the result. The user has to invoke
|
/// @return Return a pointer to the result. The user has to invoke
|
||||||
///         DestroyOfflineRecognizerResult() to free the returned pointer to
|
///         DestroyOfflineRecognizerResult() to free the returned pointer to
|
||||||
/// avoid memory leak.
|
/// avoid memory leak.
|
||||||
SHERPA_ONNX_API SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
|
SHERPA_ONNX_API const SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
|
||||||
SherpaOnnxOfflineStream *stream);
|
SherpaOnnxOfflineStream *stream);
|
||||||
|
|
||||||
/// Destroy the pointer returned by GetOfflineStreamResult().
|
/// Destroy the pointer returned by GetOfflineStreamResult().
|
||||||
@@ -438,6 +438,127 @@ SHERPA_ONNX_API SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
|
|||||||
SHERPA_ONNX_API void DestroyOfflineRecognizerResult(
|
SHERPA_ONNX_API void DestroyOfflineRecognizerResult(
|
||||||
const SherpaOnnxOfflineRecognizerResult *r);
|
const SherpaOnnxOfflineRecognizerResult *r);
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// For VAD
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
// Configuration of the silero VAD model as seen from C.
SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
  // Path to the silero VAD model
  const char *model;

  // threshold to classify a segment as speech
  //
  // If the predicted probability of a segment is larger than this
  // value, then it is classified as speech.
  float threshold;

  // in seconds
  float min_silence_duration;

  // in seconds
  float min_speech_duration;

  // Fixed-width type, consistent with the other integer fields of the
  // C API and with the int32_t window_size of the C++ config.
  int32_t window_size;
} SherpaOnnxSileroVadModelConfig;
|
||||||
|
|
||||||
|
SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {
|
||||||
|
SherpaOnnxSileroVadModelConfig silero_vad;
|
||||||
|
|
||||||
|
int32_t sample_rate;
|
||||||
|
int32_t num_threads;
|
||||||
|
const char *provider;
|
||||||
|
int32_t debug;
|
||||||
|
} SherpaOnnxVadModelConfig;
|
||||||
|
|
||||||
|
SHERPA_ONNX_API typedef struct SherpaOnnxCircularBuffer
|
||||||
|
SherpaOnnxCircularBuffer;
|
||||||
|
|
||||||
|
// Return an instance of circular buffer. The user has to use
|
||||||
|
// SherpaOnnxDestroyCircularBuffer() to free the returned pointer to avoid
|
||||||
|
// memory leak.
|
||||||
|
SHERPA_ONNX_API SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(
|
||||||
|
int32_t capacity);
|
||||||
|
|
||||||
|
// Free the pointer returned by SherpaOnnxCreateCircularBuffer()
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxDestroyCircularBuffer(
|
||||||
|
SherpaOnnxCircularBuffer *buffer);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxCircularBufferPush(
|
||||||
|
SherpaOnnxCircularBuffer *buffer, const float *p, int32_t n);
|
||||||
|
|
||||||
|
// Return n samples starting at the given index.
|
||||||
|
//
|
||||||
|
// Return a pointer to an array containing n samples starting at start_index.
|
||||||
|
// The user has to use SherpaOnnxCircularBufferFree() to free the returned
|
||||||
|
// pointer to avoid memory leak.
|
||||||
|
SHERPA_ONNX_API const float *SherpaOnnxCircularBufferGet(
|
||||||
|
SherpaOnnxCircularBuffer *buffer, int32_t start_index, int32_t n);
|
||||||
|
|
||||||
|
// Free the pointer returned by SherpaOnnxCircularBufferGet().
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxCircularBufferFree(const float *p);
|
||||||
|
|
||||||
|
// Remove n elements from the buffer
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxCircularBufferPop(
|
||||||
|
SherpaOnnxCircularBuffer *buffer, int32_t n);
|
||||||
|
|
||||||
|
// Return number of elements in the buffer.
|
||||||
|
SHERPA_ONNX_API int32_t
|
||||||
|
SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer);
|
||||||
|
|
||||||
|
// Clear all elements in the buffer
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxCircularBufferReset(
|
||||||
|
SherpaOnnxCircularBuffer *buffer);
|
||||||
|
|
||||||
|
// A speech segment returned by SherpaOnnxVoiceActivityDetectorFront().
SHERPA_ONNX_API typedef struct SherpaOnnxSpeechSegment {
  // The start index in samples of this segment
  int32_t start;

  // pointer to the array containing the samples
  float *samples;

  // number of samples in this segment
  int32_t n;
} SherpaOnnxSpeechSegment;
|
||||||
|
|
||||||
|
typedef struct SherpaOnnxVoiceActivityDetector SherpaOnnxVoiceActivityDetector;
|
||||||
|
|
||||||
|
// Return an instance of VoiceActivityDetector.
|
||||||
|
// The user has to use SherpaOnnxDestroyVoiceActivityDetector() to free
|
||||||
|
// the returned pointer to avoid memory leak.
|
||||||
|
SHERPA_ONNX_API SherpaOnnxVoiceActivityDetector *
|
||||||
|
SherpaOnnxCreateVoiceActivityDetector(const SherpaOnnxVadModelConfig *config,
|
||||||
|
float buffer_size_in_seconds);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxDestroyVoiceActivityDetector(
|
||||||
|
SherpaOnnxVoiceActivityDetector *p);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorAcceptWaveform(
|
||||||
|
SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n);
|
||||||
|
|
||||||
|
// Return 1 if there are no speech segments available.
|
||||||
|
// Return 0 if there are speech segments.
|
||||||
|
SHERPA_ONNX_API int32_t
|
||||||
|
SherpaOnnxVoiceActivityDetectorEmpty(SherpaOnnxVoiceActivityDetector *p);
|
||||||
|
|
||||||
|
// Remove the first speech segment.
|
||||||
|
// It throws if SherpaOnnxVoiceActivityDetectorEmpty() returns 1.
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop(
|
||||||
|
SherpaOnnxVoiceActivityDetector *p);
|
||||||
|
|
||||||
|
// Return the first speech segment.
|
||||||
|
// The user has to use SherpaOnnxDestroySpeechSegment() to free the returned
|
||||||
|
// pointer to avoid memory leak.
|
||||||
|
SHERPA_ONNX_API const SherpaOnnxSpeechSegment *
|
||||||
|
SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p);
|
||||||
|
|
||||||
|
// Free the pointer returned by SherpaOnnxVoiceActivityDetectorFront().
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment(
|
||||||
|
const SherpaOnnxSpeechSegment *p);
|
||||||
|
|
||||||
|
// Re-initialize the voice activity detector.
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset(
|
||||||
|
SherpaOnnxVoiceActivityDetector *p);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} /* extern "C" */
|
} /* extern "C" */
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ void Hypotheses::Add(Hypothesis hyp) {
|
|||||||
} else {
|
} else {
|
||||||
it->second.log_prob = LogAdd<double>()(it->second.log_prob, hyp.log_prob);
|
it->second.log_prob = LogAdd<double>()(it->second.log_prob, hyp.log_prob);
|
||||||
|
|
||||||
if (it->second.lm_log_prob != 0 && hyp.lm_log_prob != 0){
|
if (it->second.lm_log_prob != 0 && hyp.lm_log_prob != 0) {
|
||||||
it->second.lm_log_prob =
|
it->second.lm_log_prob =
|
||||||
LogAdd<double>()(it->second.lm_log_prob, hyp.lm_log_prob);
|
LogAdd<double>()(it->second.lm_log_prob, hyp.lm_log_prob);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ struct SileroVadModelConfig {
|
|||||||
|
|
||||||
// threshold to classify a segment as speech
|
// threshold to classify a segment as speech
|
||||||
//
|
//
|
||||||
// The predicted probability of a segment is larger than this
|
// If the predicted probability of a segment is larger than this
|
||||||
// value, then it is classified as speech.
|
// value, then it is classified as speech.
|
||||||
float threshold = 0.5;
|
float threshold = 0.5;
|
||||||
|
|
||||||
@@ -25,7 +25,7 @@ struct SileroVadModelConfig {
|
|||||||
|
|
||||||
// 512, 1024, 1536 samples for 16000 Hz
|
// 512, 1024, 1536 samples for 16000 Hz
|
||||||
  // 256, 512, 768 samples for 8000 Hz
|
  // 256, 512, 768 samples for 8000 Hz
|
||||||
int window_size = 512; // in samples
|
int32_t window_size = 512; // in samples
|
||||||
|
|
||||||
SileroVadModelConfig() = default;
|
SileroVadModelConfig() = default;
|
||||||
|
|
||||||
|
|||||||
1
swift-api-examples/.gitignore
vendored
1
swift-api-examples/.gitignore
vendored
@@ -1,2 +1,3 @@
|
|||||||
decode-file
|
decode-file
|
||||||
decode-file-non-streaming
|
decode-file-non-streaming
|
||||||
|
generate-subtitles
|
||||||
|
|||||||
@@ -215,7 +215,7 @@ class SherpaOnnxRecognizer {
|
|||||||
|
|
||||||
/// Get the decoding results so far
|
/// Get the decoding results so far
|
||||||
func getResult() -> SherpaOnnxOnlineRecongitionResult {
|
func getResult() -> SherpaOnnxOnlineRecongitionResult {
|
||||||
let result: UnsafeMutablePointer<SherpaOnnxOnlineRecognizerResult>? = GetOnlineStreamResult(
|
let result: UnsafePointer<SherpaOnnxOnlineRecognizerResult>? = GetOnlineStreamResult(
|
||||||
recognizer, stream)
|
recognizer, stream)
|
||||||
return SherpaOnnxOnlineRecongitionResult(result: result)
|
return SherpaOnnxOnlineRecongitionResult(result: result)
|
||||||
}
|
}
|
||||||
@@ -406,7 +406,7 @@ class SherpaOnnxOfflineRecognizer {
|
|||||||
|
|
||||||
DecodeOfflineStream(recognizer, stream)
|
DecodeOfflineStream(recognizer, stream)
|
||||||
|
|
||||||
let result: UnsafeMutablePointer<SherpaOnnxOfflineRecognizerResult>? = GetOfflineStreamResult(
|
let result: UnsafePointer<SherpaOnnxOfflineRecognizerResult>? = GetOfflineStreamResult(
|
||||||
stream)
|
stream)
|
||||||
|
|
||||||
DestroyOfflineStream(stream)
|
DestroyOfflineStream(stream)
|
||||||
@@ -414,3 +414,145 @@ class SherpaOnnxOfflineRecognizer {
|
|||||||
return SherpaOnnxOfflineRecongitionResult(result: result)
|
return SherpaOnnxOfflineRecongitionResult(result: result)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Builds the C struct `SherpaOnnxSileroVadModelConfig` from Swift values.
///
/// The model path is converted with `toCPointer` and `windowSize` is
/// narrowed to `Int32`; the remaining values are passed through unchanged.
///
/// NOTE(review): the default durations here (silence 0.25, speech 0.5) are
/// the reverse of the fallbacks used by
/// SherpaOnnxCreateVoiceActivityDetector() in c-api.cc (silence 0.5,
/// speech 0.25) — confirm which pair is intended.
func sherpaOnnxSileroVadModelConfig(
  model: String,
  threshold: Float = 0.5,
  minSilenceDuration: Float = 0.25,
  minSpeechDuration: Float = 0.5,
  windowSize: Int = 512
) -> SherpaOnnxSileroVadModelConfig {
  let modelPath = toCPointer(model)
  let window = Int32(windowSize)

  return SherpaOnnxSileroVadModelConfig(
    model: modelPath,
    threshold: threshold,
    min_silence_duration: minSilenceDuration,
    min_speech_duration: minSpeechDuration,
    window_size: window
  )
}
|
||||||
|
|
||||||
|
/// Builds the C struct `SherpaOnnxVadModelConfig` from Swift values.
///
/// `numThreads` and `debug` are narrowed to `Int32`, and `provider` is
/// converted with `toCPointer`.
func sherpaOnnxVadModelConfig(
  sileroVad: SherpaOnnxSileroVadModelConfig,
  sampleRate: Int32 = 16000,
  numThreads: Int = 1,
  provider: String = "cpu",
  debug: Int = 0
) -> SherpaOnnxVadModelConfig {
  let threads = Int32(numThreads)
  let providerName = toCPointer(provider)
  let debugFlag = Int32(debug)

  return SherpaOnnxVadModelConfig(
    silero_vad: sileroVad,
    sample_rate: sampleRate,
    num_threads: threads,
    provider: providerName,
    debug: debugFlag
  )
}
|
||||||
|
|
||||||
|
/// Swift wrapper around the C circular buffer API.
///
/// The underlying C object is created in `init` and destroyed in `deinit`.
class SherpaOnnxCircularBufferWrapper {
  /// Handle returned by `SherpaOnnxCreateCircularBuffer`.
  let buffer: OpaquePointer!

  /// - Parameter capacity: Forwarded (as `Int32`) to the C constructor.
  init(capacity: Int) {
    buffer = SherpaOnnxCreateCircularBuffer(Int32(capacity))
  }

  deinit {
    if let buffer {
      SherpaOnnxDestroyCircularBuffer(buffer)
    }
  }

  /// Pushes all of `samples` into the buffer.
  func push(samples: [Float]) {
    SherpaOnnxCircularBufferPush(buffer, samples, Int32(samples.count))
  }

  /// Returns `n` samples starting at `startIndex`.
  ///
  /// Returns an empty array when `n` is not positive or when the C call
  /// yields no pointer, instead of trapping.
  func get(startIndex: Int, n: Int) -> [Float] {
    guard n > 0 else { return [] }

    guard let p = SherpaOnnxCircularBufferGet(buffer, Int32(startIndex), Int32(n)) else {
      return []
    }
    // The C side allocated the array; always hand it back.
    defer { SherpaOnnxCircularBufferFree(p) }

    // One bulk copy instead of appending element by element.
    return Array(UnsafeBufferPointer(start: p, count: n))
  }

  /// Removes `n` elements from the buffer.
  func pop(n: Int) {
    SherpaOnnxCircularBufferPop(buffer, Int32(n))
  }

  /// Number of elements reported by the C buffer.
  func size() -> Int {
    return Int(SherpaOnnxCircularBufferSize(buffer))
  }

  /// Resets the underlying C buffer.
  func reset() {
    SherpaOnnxCircularBufferReset(buffer)
  }
}
|
||||||
|
|
||||||
|
/// Owns a `SherpaOnnxSpeechSegment` returned by the C API and frees it in
/// `deinit`.
class SherpaOnnxSpeechSegmentWrapper {
  /// Pointer to the C segment; destroyed when this wrapper is deallocated.
  let p: UnsafePointer<SherpaOnnxSpeechSegment>!

  init(p: UnsafePointer<SherpaOnnxSpeechSegment>!) {
    self.p = p
  }

  deinit {
    if let p {
      SherpaOnnxDestroySpeechSegment(p)
    }
  }

  /// The `start` field of the C segment.
  var start: Int {
    return Int(p.pointee.start)
  }

  /// The `n` field of the C segment (number of samples).
  var n: Int {
    return Int(p.pointee.n)
  }

  /// A copy of the segment's samples.
  ///
  /// Every access makes a new copy, so callers that need the samples more
  /// than once should keep the returned array.
  var samples: [Float] {
    let count = n
    guard count > 0, let base = p.pointee.samples else { return [] }

    // One bulk copy instead of appending element by element.
    return Array(UnsafeBufferPointer(start: base, count: count))
  }
}
|
||||||
|
|
||||||
|
/// Swift wrapper around the C voice activity detector API.
///
/// The underlying C object is created in `init` and destroyed in `deinit`.
class SherpaOnnxVoiceActivityDetectorWrapper {
  /// Handle of the detector on the C side
  let vad: OpaquePointer!

  init(config: UnsafePointer<SherpaOnnxVadModelConfig>!, buffer_size_in_seconds: Float) {
    vad = SherpaOnnxCreateVoiceActivityDetector(config, buffer_size_in_seconds)
  }

  deinit {
    guard let vad else { return }
    SherpaOnnxDestroyVoiceActivityDetector(vad)
  }

  /// Feeds all of `samples` to the detector.
  func acceptWaveform(samples: [Float]) {
    let count = Int32(samples.count)
    SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, samples, count)
  }

  /// True when the C API reports 1 from `SherpaOnnxVoiceActivityDetectorEmpty`.
  func isEmpty() -> Bool {
    let flag = SherpaOnnxVoiceActivityDetectorEmpty(vad)
    return flag == 1
  }

  /// Calls `SherpaOnnxVoiceActivityDetectorPop`.
  func pop() {
    SherpaOnnxVoiceActivityDetectorPop(vad)
  }

  /// Wraps the segment returned by `SherpaOnnxVoiceActivityDetectorFront`;
  /// the wrapper frees it when deallocated.
  func front() -> SherpaOnnxSpeechSegmentWrapper {
    let segment: UnsafePointer<SherpaOnnxSpeechSegment>? =
      SherpaOnnxVoiceActivityDetectorFront(vad)
    return SherpaOnnxSpeechSegmentWrapper(p: segment)
  }

  /// Calls `SherpaOnnxVoiceActivityDetectorReset`.
  func reset() {
    SherpaOnnxVoiceActivityDetectorReset(vad)
  }
}
|
||||||
|
|||||||
@@ -13,7 +13,6 @@ extension AVAudioPCMBuffer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func run() {
|
func run() {
|
||||||
|
|
||||||
var recognizer: SherpaOnnxOfflineRecognizer
|
var recognizer: SherpaOnnxOfflineRecognizer
|
||||||
var modelConfig: SherpaOnnxOfflineModelConfig
|
var modelConfig: SherpaOnnxOfflineModelConfig
|
||||||
var modelType = "whisper"
|
var modelType = "whisper"
|
||||||
|
|||||||
217
swift-api-examples/generate-subtitles.swift
Normal file
217
swift-api-examples/generate-subtitles.swift
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
/*
|
||||||
|
This file shows how to use Swift API to generate subtitles.
|
||||||
|
|
||||||
|
You can use the files from
|
||||||
|
https://huggingface.co/csukuangfj/vad/tree/main
|
||||||
|
for testing.
|
||||||
|
|
||||||
|
For instance, to generate subtitles for Obama.mov, please first
|
||||||
|
use
|
||||||
|
|
||||||
|
ffmpeg -i ./Obama.mov -acodec pcm_s16le -ac 1 -ar 16000 Obama.wav
|
||||||
|
|
||||||
|
to extract the audio part from the video.
|
||||||
|
|
||||||
|
This file supports only processing WAV sound files, so you have to first
|
||||||
|
extract audios from videos.
|
||||||
|
|
||||||
|
Please see
|
||||||
|
./run-generate-subtitles.sh
|
||||||
|
for usages.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import AVFoundation
|
||||||
|
|
||||||
|
extension AudioBuffer {
  /// Copies the contents of this buffer into a `[Float]`.
  func array() -> [Float] {
    let view = UnsafeBufferPointer<Float>(self)
    return Array(view)
  }
}
|
||||||
|
|
||||||
|
extension AVAudioPCMBuffer {
  /// Copies the samples of `mBuffers` in this buffer's audio buffer list
  /// into a `[Float]`.
  func array() -> [Float] {
    let first = self.audioBufferList.pointee.mBuffers
    return first.array()
  }
}
|
||||||
|
|
||||||
|
/// Helpers for rendering a time interval (in seconds) as an SRT timestamp.
extension TimeInterval {
  /// The interval formatted as `hh:mm:ss,mmm`, the timestamp layout used in
  /// SRT files (hours are zero-padded to two digits).
  var hourMinuteSecondMS: String {
    String(format: "%02d:%02d:%02d,%03d", hour, minute, second, millisecond)
  }

  /// The interval rounded to whole milliseconds.
  ///
  /// All components are derived from this single integer so that rounding
  /// carries over consistently (e.g. 59.9996 s becomes 00:01:00,000) and
  /// floating-point noise such as 1.001 * 1000 = 1000.999… does not lose a
  /// millisecond.
  private var roundedTotalMilliseconds: Int {
    Int((self * 1000).rounded())
  }

  /// Whole hours.
  var hour: Int {
    roundedTotalMilliseconds / 3_600_000
  }

  /// Minutes within the hour, 0...59.
  var minute: Int {
    (roundedTotalMilliseconds / 60_000) % 60
  }

  /// Seconds within the minute, 0...59.
  var second: Int {
    (roundedTotalMilliseconds / 1000) % 60
  }

  /// Milliseconds within the second, 0...999.
  var millisecond: Int {
    roundedTotalMilliseconds % 1000
  }
}
|
||||||
|
|
||||||
|
/// Path helpers that treat the string as a file system path.
extension String {
  /// The string as a file URL.
  var fileURL: URL {
    URL(fileURLWithPath: self)
  }

  /// The extension of the path.
  var pathExtension: String {
    let url = fileURL
    return url.pathExtension
  }

  /// The last component of the path.
  var lastPathComponent: String {
    let url = fileURL
    return url.lastPathComponent
  }

  /// The path with its extension removed.
  var stringByDeletingPathExtension: String {
    let trimmed = fileURL.deletingPathExtension()
    return trimmed.path
  }
}
|
||||||
|
|
||||||
|
/// One subtitle entry: a time range in seconds and the recognized text.
class SpeechSegment: CustomStringConvertible {
  let start: Float
  let end: Float
  let text: String

  /// - Parameters:
  ///   - start: Start time in seconds.
  ///   - duration: Length in seconds; `end` is stored as `start + duration`.
  ///   - text: Text shown for this range.
  init(start: Float, duration: Float, text: String) {
    self.start = start
    self.end = start + duration
    self.text = text
  }

  /// The time range line ("<start> --> <end>") followed by the text on the
  /// next line.
  public var description: String {
    let from = TimeInterval(self.start).hourMinuteSecondMS
    let to = TimeInterval(self.end).hourMinuteSecondMS
    return "\(from) --> \(to)\n\(self.text)"
  }
}
|
||||||
|
|
||||||
|
/// Generates an .srt subtitle file for a WAV file.
///
/// Steps: build an offline recognizer (whisper or paraformer), read the
/// whole WAV file, feed it to the VAD one window at a time, decode every
/// speech segment the VAD produces, and write the segments next to the
/// input file with the extension replaced by ".srt".
func run() {
  var recognizer: SherpaOnnxOfflineRecognizer
  var modelConfig: SherpaOnnxOfflineModelConfig
  var modelType = "whisper"
  // modelType = "paraformer"
  var filePath = "/Users/fangjun/Desktop/Obama.wav"  // English
  // filePath = "/Users/fangjun/Desktop/lei-jun.wav" // Chinese
  // please go to https://huggingface.co/csukuangfj/vad
  // to download the above two files

  if modelType == "whisper" {
    // for English
    let encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx"
    let decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx"
    let tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt"

    let whisperConfig = sherpaOnnxOfflineWhisperModelConfig(
      encoder: encoder,
      decoder: decoder
    )

    modelConfig = sherpaOnnxOfflineModelConfig(
      tokens: tokens,
      whisper: whisperConfig,
      debug: 0,
      modelType: "whisper"
    )
  } else if modelType == "paraformer" {
    // for Chinese
    let model = "./sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx"
    let tokens = "./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt"
    let paraformerConfig = sherpaOnnxOfflineParaformerModelConfig(
      model: model
    )

    modelConfig = sherpaOnnxOfflineModelConfig(
      tokens: tokens,
      paraformer: paraformerConfig,
      debug: 0,
      modelType: "paraformer"
    )
  } else {
    print("Please specify a supported modelType \(modelType)")
    return
  }

  let sampleRate = 16000
  let featConfig = sherpaOnnxFeatureConfig(
    sampleRate: sampleRate,
    featureDim: 80
  )
  var config = sherpaOnnxOfflineRecognizerConfig(
    featConfig: featConfig,
    modelConfig: modelConfig
  )

  recognizer = SherpaOnnxOfflineRecognizer(config: &config)

  // Report a missing or unreadable input file instead of crashing.
  let audioFile: AVAudioFile
  do {
    audioFile = try AVAudioFile(forReading: filePath.fileURL)
  } catch {
    print("Error opening \(filePath): \(error.localizedDescription)")
    return
  }

  let audioFormat = audioFile.processingFormat
  assert(audioFormat.sampleRate == Double(sampleRate))
  assert(audioFormat.channelCount == 1)
  assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)

  let sileroVadConfig = sherpaOnnxSileroVadModelConfig(
    model: "./silero_vad.onnx"
  )

  var vadModelConfig = sherpaOnnxVadModelConfig(sileroVad: sileroVadConfig)
  let vad = SherpaOnnxVoiceActivityDetectorWrapper(
    config: &vadModelConfig, buffer_size_in_seconds: 120)

  let audioFrameCount = UInt32(audioFile.length)
  guard
    let audioFileBuffer = AVAudioPCMBuffer(
      pcmFormat: audioFormat, frameCapacity: audioFrameCount)
  else {
    print("Error allocating an audio buffer for \(filePath)")
    return
  }

  do {
    try audioFile.read(into: audioFileBuffer)
  } catch {
    print("Error reading \(filePath): \(error.localizedDescription)")
    return
  }

  let samples: [Float] = audioFileBuffer.array()
  let windowSize = Int(vadModelConfig.silero_vad.window_size)

  var segments: [SpeechSegment] = []

  // Walk over the samples with an offset rather than re-slicing the whole
  // remaining array on every iteration. `<=` makes sure the final complete
  // window is processed as well; a trailing partial window is not fed to
  // the VAD.
  var offset = 0
  while offset + windowSize <= samples.count {
    vad.acceptWaveform(samples: [Float](samples[offset..<offset + windowSize]))
    offset += windowSize

    while !vad.isEmpty() {
      let s = vad.front()
      vad.pop()

      // `s.samples` copies on every access, so fetch it only once.
      let segmentSamples = s.samples
      let result = recognizer.decode(samples: segmentSamples)

      let segment = SpeechSegment(
        start: Float(s.start) / Float(sampleRate),
        duration: Float(segmentSamples.count) / Float(sampleRate),
        text: result.text)
      segments.append(segment)

      print(segment)
    }
  }

  // SRT entries are numbered from 1 and separated by a blank line.
  let srt = segments.enumerated().map { (index, element) in
    return "\(index+1)\n\(element)"
  }.joined(separator: "\n\n")

  let srtFilename = filePath.stringByDeletingPathExtension + ".srt"
  do {
    try srt.write(to: srtFilename.fileURL, atomically: true, encoding: .utf8)
    // Only claim success when the file was actually written.
    print("Saved to \(srtFilename)")
  } catch {
    print("Error writing: \(error.localizedDescription)")
  }
}
|
||||||
|
|
||||||
|
/// Program entry point; it only calls `run()`.
@main
struct App {
  static func main() {
    run()
  }
}
|
||||||
36
swift-api-examples/run-generate-subtitles.sh
Executable file
36
swift-api-examples/run-generate-subtitles.sh
Executable file
@@ -0,0 +1,36 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Builds (if necessary) and runs the Swift subtitle generation example.
# All paths are relative, so run this script from the directory that
# contains it.

set -ex

if [ ! -d ../build-swift-macos ]; then
  echo "Please run ../build-swift-macos.sh first!"
  exit 1
fi

if [ ! -d ./sherpa-onnx-whisper-tiny.en ]; then
  echo "Please download the pre-trained model for testing."
  echo "You can refer to"
  echo ""
  echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html"
  echo ""
  echo "for help"
  exit 1
fi

# generate-subtitles.swift loads ./silero_vad.onnx, so fail early with a
# clear message when it is missing.
if [ ! -f ./silero_vad.onnx ]; then
  echo "Please download silero_vad.onnx and put it in this directory."
  exit 1
fi

if [ ! -e ./generate-subtitles ]; then
  # Note: We use -lc++ to link against libc++ instead of libstdc++
  swiftc \
    -lc++ \
    -I ../build-swift-macos/install/include \
    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
    ./generate-subtitles.swift ./SherpaOnnx.swift \
    -L ../build-swift-macos/install/lib/ \
    -l sherpa-onnx \
    -l onnxruntime \
    -o generate-subtitles
else
  echo "./generate-subtitles exists - skip building"
fi

# Quoted so a working directory containing spaces stays a single path entry.
export DYLD_LIBRARY_PATH="$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH"
./generate-subtitles
|
||||||
Reference in New Issue
Block a user